diff options
Diffstat (limited to 'net/core/dev.c')
-rw-r--r-- | net/core/dev.c | 1010 |
1 files changed, 721 insertions, 289 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 4906b44af850..6c5967e80132 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -91,6 +91,7 @@ #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/skbuff.h> +#include <linux/kthread.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> @@ -98,8 +99,10 @@ #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> +#include <net/dsa.h> #include <net/dst.h> #include <net/dst_metadata.h> +#include <net/gro.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> @@ -144,6 +147,7 @@ #include <linux/indirect_call_wrapper.h> #include <net/devlink.h> #include <linux/pm_runtime.h> +#include <linux/prandom.h> #include "net-sysfs.h" @@ -1067,19 +1071,6 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); -struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) -{ - struct net_device *dev; - - ASSERT_RTNL(); - for_each_netdev(net, dev) - if (dev->type == type) - return dev; - - return NULL; -} -EXPORT_SYMBOL(__dev_getfirstbyhwtype); - struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -1130,7 +1121,7 @@ EXPORT_SYMBOL(__dev_get_by_flags); * @name: name string * * Network device names need to be valid file names to - * to allow sysfs to work. We also disallow any kind of + * allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) @@ -1468,6 +1459,25 @@ void netdev_state_change(struct net_device *dev) EXPORT_SYMBOL(netdev_state_change); /** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration. + */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + +/** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * @@ -1480,12 +1490,32 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); +static int napi_threaded_poll(void *data); + +static int napi_kthread_create(struct napi_struct *n) +{ + int err = 0; + + /* Create and wake up the kthread once to put it in + * TASK_INTERRUPTIBLE mode to avoid the blocked task + * warning and work with loadavg. + */ + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", + n->dev->name, n->napi_id); + if (IS_ERR(n->thread)) { + err = PTR_ERR(n->thread); + pr_err("kthread_run failed with err %d\n", err); + n->thread = NULL; + } + + return err; +} + static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; @@ -2187,28 +2217,14 @@ static inline void net_timestamp_set(struct sk_buff *skb) bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { - unsigned int len; - - if (!(dev->flags & IFF_UP)) - return false; - - len = dev->mtu + dev->hard_header_len + VLAN_HLEN; - if (skb->len <= len) - return true; - - /* if TSO is enabled, we don't care about the length as the packet - * could be forwarded without being segmented before - */ - if (skb_is_gso(skb)) - return true; - - return false; + return __is_skb_forwardable(dev, skb, true); } EXPORT_SYMBOL_GPL(is_skb_forwardable); -int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb, + bool check_mtu) { - int ret = ____dev_forward_skb(dev, skb); + int ret = ____dev_forward_skb(dev, skb, check_mtu); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); @@ -2217,6 +2233,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) return ret; } + +int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, true); +} EXPORT_SYMBOL_GPL(__dev_forward_skb); /** @@ -2243,6 +2264,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_forward_skb); +int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) +{ + return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); +} + static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) @@ -3204,7 +3230,7 @@ int skb_checksum_help(struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; - if (unlikely(skb_shinfo(skb)->gso_size)) { + if (unlikely(skb_is_gso(skb))) { skb_warn_bad_offload(skb); return -EINVAL; } @@ -3493,6 +3519,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (gso_segs > dev->gso_max_segs) return features & ~NETIF_F_GSO_MASK; + if (!skb_shinfo(skb)->gso_type) { + skb_warn_bad_offload(skb); + return features & ~NETIF_F_GSO_MASK; + } + /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now @@ -3557,6 +3588,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, dev_queue_xmit_nit(skb, dev); len = skb->len; + PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies); trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); @@ -3604,11 +3636,22 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { - if (unlikely(skb->csum_not_inet)) + if (unlikely(skb_csum_is_sctp(skb))) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); - return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); + if (features & NETIF_F_HW_CSUM) + return 0; + + if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { + switch (skb->csum_offset) { + case offsetof(struct tcphdr, check): + case offsetof(struct udphdr, check): + return 0; + } + } + + return skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); @@ -3864,6 +3907,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ + qdisc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { @@ -4069,7 +4114,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) - __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. @@ -4129,6 +4174,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (!skb) goto out; + PRANDOM_ADD_NOISE(skb, dev, txq, jiffies); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { @@ -4176,7 +4222,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) } EXPORT_SYMBOL(dev_queue_xmit_accel); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; @@ -4194,6 +4240,7 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); + PRANDOM_ADD_NOISE(skb, dev, txq, jiffies); local_bh_disable(); @@ -4205,17 +4252,13 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) dev_xmit_recursion_dec(); local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - return ret; drop: atomic_long_inc(&dev->tx_dropped); kfree_skb_list(skb); return NET_XMIT_DROP; } -EXPORT_SYMBOL(dev_direct_xmit); +EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines @@ -4240,6 +4283,22 @@ int gro_normal_batch __read_mostly = 8; static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { + struct task_struct *thread; + + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { + /* Paired with smp_mb__before_atomic() in + * napi_enable()/dev_set_threaded(). + * Use READ_ONCE() to guarantee a complete + * read on napi->thread. Only call + * wake_up_process() when it's not NULL. + */ + thread = READ_ONCE(napi->thread); + if (thread) { + wake_up_process(thread); + return; + } + } + list_add_tail(&napi->poll_list, &sd->poll_list); __raise_softirq_irqoff(NET_RX_SOFTIRQ); } @@ -4591,14 +4650,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; bool orig_bcast; - int hlen, off; - u32 mac_len; + int off; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4630,15 +4689,16 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, * header. */ mac_len = skb->data - skb_mac_header(skb); - hlen = skb_headlen(skb) + mac_len; - xdp->data = skb->data - mac_len; - xdp->data_meta = xdp->data; - xdp->data_end = xdp->data + hlen; - xdp->data_hard_start = skb->data - skb_headroom(skb); + hard_start = skb->data - skb_headroom(skb); /* SKB "head" area always have tailroom for skb_shared_info */ - xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; - xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + frame_sz = (void *)skb_end_pointer(skb) - hard_start; + frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + rxqueue = netif_get_rxqueue(skb); + xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); + xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, + skb_headlen(skb) + mac_len, true); orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4646,9 +4706,6 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; - rxqueue = netif_get_rxqueue(skb); - xdp->rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ @@ -4840,6 +4897,21 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); +int netif_rx_any_context(struct sk_buff *skb) +{ + /* + * If invoked from contexts which do not invoke bottom half + * processing either at return from interrupt or when softrqs are + * reenabled, use netif_rx_ni() which invokes bottomhalf processing + * directly. + */ + if (in_interrupt()) + return netif_rx(skb); + else + return netif_rx_ni(skb); +} +EXPORT_SYMBOL(netif_rx_any_context); + static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -4868,8 +4940,6 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) else __kfree_skb_defer(skb); } - - __kfree_skb_flush(); } if (sd->output_queue) { @@ -4914,7 +4984,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) + struct net_device *orig_dev, bool *another) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); @@ -4934,6 +5004,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_skb_cb(skb)->mru = 0; + qdisc_skb_cb(skb)->post_ct = false; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); @@ -4958,7 +5030,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * redirecting to another netdev */ __skb_push(skb, skb->mac_len); - skb_do_redirect(skb); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } return NULL; case TC_ACT_CONSUMED: return NULL; @@ -5119,8 +5195,7 @@ another_round: skb_reset_mac_len(skb); } - if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || - skb->protocol == cpu_to_be16(ETH_P_8021AD)) { + if (eth_type_vlan(skb->protocol)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; @@ -5147,7 +5222,12 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); + bool another = false; + + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; if (!skb) goto out; @@ -5192,15 +5272,14 @@ skip_classify: } } - if (unlikely(skb_vlan_tag_present(skb))) { + if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) { check_vlan_id: if (skb_vlan_tag_get_id(skb)) { /* Vlan id is non 0 and vlan_do_receive() above couldn't * find vlan device. */ skb->pkt_type = PACKET_OTHERHOST; - } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || - skb->protocol == cpu_to_be16(ETH_P_8021AD)) { + } else if (eth_type_vlan(skb->protocol)) { /* Outer header is 802.1P with vlan 0, inner header is * 802.1Q or 802.1AD and vlan_do_receive() above could * not find vlan dev for vlan id 0. @@ -5441,15 +5520,20 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) if (new) { u32 i; + mutex_lock(&new->aux->used_maps_mutex); + /* generic XDP does not work with DEVMAPs that can * have a bpf_prog installed on an entry */ for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i])) - return -EINVAL; - if (cpu_map_prog_allowed(new->aux->used_maps[i])) + if (dev_map_can_have_prog(new->aux->used_maps[i]) || + cpu_map_prog_allowed(new->aux->used_maps[i])) { + mutex_unlock(&new->aux->used_maps_mutex); return -EINVAL; + } } + + mutex_unlock(&new->aux->used_maps_mutex); } switch (xdp->command) { @@ -5621,17 +5705,60 @@ static void flush_backlog(struct work_struct *work) local_bh_enable(); } +static bool flush_required(int cpu) +{ +#if IS_ENABLED(CONFIG_RPS) + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + bool do_flush; + + local_irq_disable(); + rps_lock(sd); + + /* as insertion into process_queue happens with the rps lock held, + * process_queue access may race only with dequeue + */ + do_flush = !skb_queue_empty(&sd->input_pkt_queue) || + !skb_queue_empty_lockless(&sd->process_queue); + rps_unlock(sd); + local_irq_enable(); + + return do_flush; +#endif + /* without RPS we can't safely check input_pkt_queue: during a + * concurrent remote skb_queue_splice() we can detect as empty both + * input_pkt_queue and process_queue even if the latter could end-up + * containing a lot of packets. + */ + return true; +} + static void flush_all_backlogs(void) { + static cpumask_t flush_cpus; unsigned int cpu; + /* since we are under rtnl lock protection we can use static data + * for the cpumask and avoid allocating on stack the possibly + * large mask + */ + ASSERT_RTNL(); + get_online_cpus(); - for_each_online_cpu(cpu) - queue_work_on(cpu, system_highpri_wq, - per_cpu_ptr(&flush_works, cpu)); + cpumask_clear(&flush_cpus); + for_each_online_cpu(cpu) { + if (flush_required(cpu)) { + queue_work_on(cpu, system_highpri_wq, + per_cpu_ptr(&flush_works, cpu)); + cpumask_set_cpu(cpu, &flush_cpus); + } + } - for_each_online_cpu(cpu) + /* we can have in flight packet[s] on the cpus we are not flushing, + * synchronize_net() in unregister_netdevice_many() will take care of + * them + */ + for_each_cpu(cpu, &flush_cpus) flush_work(per_cpu_ptr(&flush_works, cpu)); put_online_cpus(); @@ -5650,15 +5777,14 @@ static void gro_normal_list(struct napi_struct *napi) /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) { list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) + napi->rx_count += segs; + if (napi->rx_count >= gro_normal_batch) gro_normal_list(napi); } -INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; @@ -5692,7 +5818,7 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb); + gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); return NET_RX_SUCCESS; } @@ -5827,10 +5953,6 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) napi_gro_complete(napi, oldest); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, - struct sk_buff *)); static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); @@ -5969,31 +6091,20 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static void napi_skb_free_stolen_head(struct sk_buff *skb) -{ - skb_dst_drop(skb); - skb_ext_put(skb); - kmem_cache_free(skbuff_head_cache, skb); -} - static gro_result_t napi_skb_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb); - break; - - case GRO_DROP: - kfree_skb(skb); + gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else - __kfree_skb(skb); + __kfree_skb_defer(skb); break; case GRO_HELD: @@ -6070,11 +6181,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb); - break; - - case GRO_DROP: - napi_reuse_skb(napi, skb); + gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: @@ -6138,9 +6245,6 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); - if (!skb) - return GRO_DROP; - trace_napi_gro_frags_entry(skb); ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); @@ -6293,7 +6397,7 @@ EXPORT_SYMBOL(__napi_schedule); * @n: napi context * * Test if NAPI routine is already running, and if not mark - * it as running. This is used as a condition variable + * it as running. This is used as a condition variable to * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ @@ -6381,7 +6485,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); - new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | + NAPIF_STATE_PREFER_BUSY_POLL); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. @@ -6418,10 +6523,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) #if defined(CONFIG_NET_RX_BUSY_POLL) -#define BUSY_POLL_BUDGET 8 +static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +{ + if (!skip_schedule) { + gro_normal_list(napi); + __napi_schedule(napi); + return; + } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) + if (napi->gro_bitmask) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + napi_gro_flush(napi, HZ >= 1000); + } + + gro_normal_list(napi); + clear_bit(NAPI_STATE_SCHED, &napi->state); +} + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, + u16 budget) { + bool skip_schedule = false; + unsigned long timeout; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6438,29 +6563,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) local_bh_disable(); + if (prefer_busy_poll) { + napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + timeout = READ_ONCE(napi->dev->gro_flush_timeout); + if (napi->defer_hard_irqs_count && timeout) { + hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); + skip_schedule = true; + } + } + /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ - rc = napi->poll(napi, BUSY_POLL_BUDGET); + rc = napi->poll(napi, budget); /* We can't gro_normal_list() here, because napi->poll() might have * rearmed the napi (napi_complete_done()) in which case it could * already be running on another CPU. */ - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); - if (rc == BUSY_POLL_BUDGET) { - /* As the whole budget was spent, we still own the napi so can - * safely handle the rx_list. - */ - gro_normal_list(napi); - __napi_schedule(napi); - } + if (rc == budget) + __busy_poll_stop(napi, skip_schedule); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), - void *loop_end_arg) + void *loop_end_arg, bool prefer_busy_poll, u16 budget) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); @@ -6488,17 +6617,23 @@ restart: * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | - NAPIF_STATE_IN_BUSY_POLL)) + NAPIF_STATE_IN_BUSY_POLL)) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | - NAPIF_STATE_SCHED) != val) + NAPIF_STATE_SCHED) != val) { + if (prefer_busy_poll) + set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; + } have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - work = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, work, BUSY_POLL_BUDGET); + work = napi_poll(napi, budget); + trace_napi_poll(napi, work, budget); gro_normal_list(napi); count: if (work > 0) @@ -6511,7 +6646,7 @@ count: if (unlikely(need_resched())) { if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); rcu_read_unlock(); cond_resched(); @@ -6522,7 +6657,7 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock); + busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); preempt_enable(); out: rcu_read_unlock(); @@ -6533,8 +6668,7 @@ EXPORT_SYMBOL(napi_busy_loop); static void napi_hash_add(struct napi_struct *napi) { - if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || - test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) return; spin_lock(&napi_hash_lock); @@ -6555,20 +6689,14 @@ static void napi_hash_add(struct napi_struct *napi) /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -bool napi_hash_del(struct napi_struct *napi) +static void napi_hash_del(struct napi_struct *napi) { - bool rcu_sync_needed = false; - spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { - rcu_sync_needed = true; - hlist_del_rcu(&napi->napi_hash_node); - } + hlist_del_init_rcu(&napi->napi_hash_node); + spin_unlock(&napi_hash_lock); - return rcu_sync_needed; } -EXPORT_SYMBOL_GPL(napi_hash_del); static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { @@ -6580,8 +6708,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (!napi_disable_pending(napi) && - !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) { + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); __napi_schedule_irqoff(napi); + } return HRTIMER_NORESTART; } @@ -6597,10 +6727,57 @@ static void init_gro_hash(struct napi_struct *napi) napi->gro_bitmask = 0; } +int dev_set_threaded(struct net_device *dev, bool threaded) +{ + struct napi_struct *napi; + int err = 0; + + if (dev->threaded == threaded) + return 0; + + if (threaded) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!napi->thread) { + err = napi_kthread_create(napi); + if (err) { + threaded = false; + break; + } + } + } + } + + dev->threaded = threaded; + + /* Make sure kthread is created before THREADED bit + * is set. + */ + smp_mb__before_atomic(); + + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (threaded) + set_bit(NAPI_STATE_THREADED, &napi->state); + else + clear_bit(NAPI_STATE_THREADED, &napi->state); + } + + return err; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { + if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) + return; + INIT_LIST_HEAD(&napi->poll_list); + INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; init_gro_hash(napi); @@ -6620,6 +6797,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); + /* Create kthread for this napi if dev->threaded is set. + * Clear dev->threaded if kthread creation failed so that + * threaded mode will not be enabled in napi_enable(). + */ + if (dev->threaded && napi_kthread_create(napi)) + dev->threaded = 0; } EXPORT_SYMBOL(netif_napi_add); @@ -6635,10 +6818,30 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); + clear_bit(NAPI_STATE_THREADED, &n->state); } EXPORT_SYMBOL(napi_disable); +/** + * napi_enable - enable NAPI scheduling + * @n: NAPI context + * + * Resume NAPI from being scheduled on this context. + * Must be paired with napi_disable. + */ +void napi_enable(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + smp_mb__before_atomic(); + clear_bit(NAPI_STATE_SCHED, &n->state); + clear_bit(NAPI_STATE_NPSVC, &n->state); + if (n->dev->threaded && n->thread) + set_bit(NAPI_STATE_THREADED, &n->state); +} +EXPORT_SYMBOL(napi_enable); + static void flush_gro_hash(struct napi_struct *napi) { int i; @@ -6653,28 +6856,29 @@ static void flush_gro_hash(struct napi_struct *napi) } /* Must be called in process context */ -void netif_napi_del(struct napi_struct *napi) +void __netif_napi_del(struct napi_struct *napi) { - might_sleep(); - if (napi_hash_del(napi)) - synchronize_net(); - list_del_init(&napi->dev_list); + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) + return; + + napi_hash_del(napi); + list_del_rcu(&napi->dev_list); napi_free_frags(napi); flush_gro_hash(napi); napi->gro_bitmask = 0; + + if (napi->thread) { + kthread_stop(napi->thread); + napi->thread = NULL; + } } -EXPORT_SYMBOL(netif_napi_del); +EXPORT_SYMBOL(__netif_napi_del); -static int napi_poll(struct napi_struct *n, struct list_head *repoll) +static int __napi_poll(struct napi_struct *n, bool *repoll) { - void *have; int work, weight; - list_del_init(&n->poll_list); - - have = netpoll_poll_lock(n); - weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race @@ -6694,7 +6898,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) n->poll, work, weight); if (likely(work < weight)) - goto out_unlock; + return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code @@ -6703,7 +6907,20 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); - goto out_unlock; + return work; + } + + /* The NAPI context has more processing work, but busy-polling + * is preferred. Exit early. + */ + if (napi_prefer_busy_poll(n)) { + if (napi_complete_done(n, work)) { + /* If timeout is not set, we need to make sure + * that the NAPI is re-scheduled. + */ + napi_schedule(n); + } + return work; } if (n->gro_bitmask) { @@ -6721,17 +6938,78 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? n->dev->name : "backlog"); - goto out_unlock; + return work; } - list_add_tail(&n->poll_list, repoll); + *repoll = true; + + return work; +} + +static int napi_poll(struct napi_struct *n, struct list_head *repoll) +{ + bool do_repoll = false; + void *have; + int work; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + + work = __napi_poll(n, &do_repoll); + + if (do_repoll) + list_add_tail(&n->poll_list, repoll); -out_unlock: netpoll_poll_unlock(have); return work; } +static int napi_thread_wait(struct napi_struct *napi) +{ + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop() && !napi_disable_pending(napi)) { + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; + } + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return -1; +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + void *have; + + while (!napi_thread_wait(napi)) { + for (;;) { + bool repoll = false; + + local_bh_disable(); + + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + local_bh_enable(); + + if (!repoll) + break; + + cond_resched(); + } + } + return 0; +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6750,7 +7028,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) - goto out; + return; break; } @@ -6777,8 +7055,6 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); -out: - __kfree_skb_flush(); } struct netdev_adjacent { @@ -6844,7 +7120,7 @@ bool netdev_has_upper_dev(struct net_device *dev, EXPORT_SYMBOL(netdev_has_upper_dev); /** - * netdev_has_upper_dev_all - Check if device is linked to an upper device + * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * @@ -7990,6 +8266,39 @@ struct net_device *netdev_get_xmit_slave(struct net_device *dev, } EXPORT_SYMBOL(netdev_get_xmit_slave); +static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, + struct sock *sk) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_sk_get_lower_dev) + return NULL; + return ops->ndo_sk_get_lower_dev(dev, sk); +} + +/** + * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket + * @dev: device + * @sk: the socket + * + * %NULL is returned if no lower device is found. + */ + +struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, + struct sock *sk) +{ + struct net_device *lower; + + lower = netdev_sk_get_lower_dev(dev, sk); + while (lower) { + dev = lower; + lower = netdev_sk_get_lower_dev(dev, sk); + } + + return dev; +} +EXPORT_SYMBOL(netdev_sk_get_lowest_dev); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; @@ -8082,7 +8391,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private); /** - * netdev_lower_change - Dispatch event about lower device state change + * netdev_lower_state_changed - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * @@ -8612,6 +8921,48 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); +static DECLARE_RWSEM(dev_addr_sem); + +int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + ret = dev_set_mac_address(dev, sa, extack); + up_write(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +{ + size_t size = sizeof(sa->sa_data); + struct net_device *dev; + int ret = 0; + + down_read(&dev_addr_sem); + rcu_read_lock(); + + dev = dev_get_by_name_rcu(net, dev_name); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + if (!dev->addr_len) + memset(sa->sa_data, 0, size); + else + memcpy(sa->sa_data, dev->dev_addr, + min_t(size_t, size, dev->addr_len)); + sa->sa_family = dev->type; + +unlock: + rcu_read_unlock(); + up_read(&dev_addr_sem); + return ret; +} +EXPORT_SYMBOL(dev_get_mac_address); + /** * dev_change_carrier - Change device carrier * @dev: device @@ -8827,7 +9178,7 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) return dev->netdev_ops->ndo_bpf; default: return NULL; - }; + } } static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, @@ -8846,6 +9197,17 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev, return dev->xdp_state[mode].prog; } +static u8 dev_xdp_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog || dev->xdp_state[i].link) + count++; + return count; +} + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@ -8936,6 +9298,7 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack struct bpf_xdp_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog, u32 flags) { + unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -8951,11 +9314,17 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); return -EINVAL; } - /* just one XDP mode bit should be set, zero defaults to SKB mode */ - if (hweight32(flags & XDP_FLAGS_MODES) > 1) { + /* just one XDP mode bit should be set, zero defaults to drv/skb mode */ + if (num_modes > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); return -EINVAL; } + /* avoid ambiguity if offload + drv/skb mode progs are both loaded */ + if (!num_modes && dev_xdp_prog_count(dev) > 1) { + NL_SET_ERR_MSG(extack, + "More than one program loaded, unset mode is ambiguous"); + return -EINVAL; + } /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); @@ -9293,106 +9662,6 @@ static void net_set_todo(struct net_device *dev) dev_net(dev)->dev_unreg_count++; } -static void rollback_registered_many(struct list_head *head) -{ - struct net_device *dev, *tmp; - LIST_HEAD(close_head); - - BUG_ON(dev_boot_phase); - ASSERT_RTNL(); - - list_for_each_entry_safe(dev, tmp, head, unreg_list) { - /* Some devices call without registering - * for initialization unwind. Remove those - * devices and proceed with the remaining. - */ - if (dev->reg_state == NETREG_UNINITIALIZED) { - pr_debug("unregister_netdevice: device %s/%p never was registered\n", - dev->name, dev); - - WARN_ON(1); - list_del(&dev->unreg_list); - continue; - } - dev->dismantle = true; - BUG_ON(dev->reg_state != NETREG_REGISTERED); - } - - /* If device is running, close it first. */ - list_for_each_entry(dev, head, unreg_list) - list_add_tail(&dev->close_list, &close_head); - dev_close_many(&close_head, true); - - list_for_each_entry(dev, head, unreg_list) { - /* And unlink it from device chain. */ - unlist_netdevice(dev); - - dev->reg_state = NETREG_UNREGISTERING; - } - flush_all_backlogs(); - - synchronize_net(); - - list_for_each_entry(dev, head, unreg_list) { - struct sk_buff *skb = NULL; - - /* Shutdown queueing discipline. */ - dev_shutdown(dev); - - dev_xdp_uninstall(dev); - - /* Notify protocols, that we are about to destroy - * this device. They should clean all the things. - */ - call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL, 0); - - /* - * Flush the unicast and multicast chains - */ - dev_uc_flush(dev); - dev_mc_flush(dev); - - netdev_name_node_alt_flush(dev); - netdev_name_node_free(dev->name_node); - - if (dev->netdev_ops->ndo_uninit) - dev->netdev_ops->ndo_uninit(dev); - - if (skb) - rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); - - /* Notifier chain MUST detach us all upper devices. */ - WARN_ON(netdev_has_any_upper_dev(dev)); - WARN_ON(netdev_has_any_lower_dev(dev)); - - /* Remove entries from kobject tree */ - netdev_unregister_kobject(dev); -#ifdef CONFIG_XPS - /* Remove XPS queueing entries */ - netif_reset_xps_queues_gt(dev, 0); -#endif - } - - synchronize_net(); - - list_for_each_entry(dev, head, unreg_list) - dev_put(dev); -} - -static void rollback_registered(struct net_device *dev) -{ - LIST_HEAD(single); - - list_add(&dev->unreg_list, &single); - rollback_registered_many(&single); - list_del(&single); -} - static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { @@ -9513,6 +9782,22 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + if (features & NETIF_F_HW_TLS_TX) { + bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == + (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + bool hw_csum = features & NETIF_F_HW_CSUM; + + if (!ip_csum && !hw_csum) { + netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_TX; + } + } + + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + return features; } @@ -9533,7 +9818,7 @@ int __netdev_update_features(struct net_device *dev) /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); - /* some features can't be enabled if they're off an an upper device */ + /* some features can't be enabled if they're off on an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); @@ -9688,7 +9973,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) rx[i].dev = dev; /* XDP RX-queue setup */ - err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0); if (err < 0) goto err_rxq_info; } @@ -9849,7 +10134,7 @@ int register_netdevice(struct net_device *dev) dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF); dev->features |= NETIF_F_SOFT_FEATURES; - if (dev->netdev_ops->ndo_udp_tunnel_add) { + if (dev->udp_tunnel_nic_info) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } @@ -9924,17 +10209,10 @@ int register_netdevice(struct net_device *dev) ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { - rollback_registered(dev); - rcu_barrier(); - - dev->reg_state = NETREG_UNREGISTERED; - /* We should put the kobject that hold in - * netdev_unregister_kobject(), otherwise - * the net device cannot be freed when - * driver calls free_netdev(), because the - * kobject is being hold. - */ - kobject_put(&dev->dev.kobj); + /* Expect explicit free_netdev() on failure */ + dev->needs_free_netdev = false; + unregister_netdevice_queue(dev, NULL); + goto out; } /* * Prevent userspace races by waiting until the network @@ -10037,6 +10315,8 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); +#define WAIT_REFS_MIN_MSECS 1 +#define WAIT_REFS_MAX_MSECS 250 /** * netdev_wait_allrefs - wait until all references are gone. * @dev: target net_device @@ -10052,7 +10332,7 @@ EXPORT_SYMBOL(netdev_refcnt_read); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; - int refcnt; + int wait = 0, refcnt; linkwatch_forget_dev(dev); @@ -10086,7 +10366,13 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = jiffies; } - msleep(250); + if (!wait) { + rcu_barrier(); + wait = WAIT_REFS_MIN_MSECS; + } else { + msleep(wait); + wait = min(wait << 1, WAIT_REFS_MAX_MSECS); + } refcnt = netdev_refcnt_read(dev); @@ -10134,7 +10420,7 @@ void netdev_run_todo(void) struct net_device *dev = list_first_entry(&unlink_list, struct net_device, unlink_list); - list_del(&dev->unlink_list); + list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif @@ -10249,6 +10535,55 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } EXPORT_SYMBOL(dev_get_stats); +/** + * dev_fetch_sw_netstats - get per-cpu network device statistics + * @s: place to store stats + * @netstats: per-cpu network stats to read from + * + * Read per-cpu network statistics and populate the related fields in @s. + */ +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats) +{ + int cpu; + + for_each_possible_cpu(cpu) { + const struct pcpu_sw_netstats *stats; + struct pcpu_sw_netstats tmp; + unsigned int start; + + stats = per_cpu_ptr(netstats, cpu); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + tmp.rx_packets = stats->rx_packets; + tmp.rx_bytes = stats->rx_bytes; + tmp.tx_packets = stats->tx_packets; + tmp.tx_bytes = stats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + s->rx_packets += tmp.rx_packets; + s->rx_bytes += tmp.rx_bytes; + s->tx_packets += tmp.tx_packets; + s->tx_bytes += tmp.tx_bytes; + } +} +EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats); + +/** + * dev_get_tstats64 - ndo_get_stats64 implementation + * @dev: device to get statistics from + * @s: place to store stats + * + * Populate @s from dev->stats and dev->tstats. Can be used as + * ndo_get_stats64() callback. + */ +void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s) +{ + netdev_stats_to_stats64(s, &dev->stats); + dev_fetch_sw_netstats(s, dev->tstats); +} +EXPORT_SYMBOL_GPL(dev_get_tstats64); + struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -10421,6 +10756,17 @@ void free_netdev(struct net_device *dev) struct napi_struct *p, *n; might_sleep(); + + /* When called immediately after register_netdevice() failed the unwind + * handling may still be dismantling the device. Handle that case by + * deferring the free. + */ + if (dev->reg_state == NETREG_UNREGISTERING) { + ASSERT_RTNL(); + dev->needs_free_netdev = true; + return; + } + netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -10487,9 +10833,10 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) if (head) { list_move_tail(&dev->unreg_list, head); } else { - rollback_registered(dev); - /* Finish processing unregister after unlock */ - net_set_todo(dev); + LIST_HEAD(single); + + list_add(&dev->unreg_list, &single); + unregister_netdevice_many(&single); } } EXPORT_SYMBOL(unregister_netdevice_queue); @@ -10503,14 +10850,100 @@ EXPORT_SYMBOL(unregister_netdevice_queue); */ void unregister_netdevice_many(struct list_head *head) { - struct net_device *dev; + struct net_device *dev, *tmp; + LIST_HEAD(close_head); + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + if (list_empty(head)) + return; + + list_for_each_entry_safe(dev, tmp, head, unreg_list) { + /* Some devices call without registering + * for initialization unwind. Remove those + * devices and proceed with the remaining. + */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + pr_debug("unregister_netdevice: device %s/%p never was registered\n", + dev->name, dev); + + WARN_ON(1); + list_del(&dev->unreg_list); + continue; + } + dev->dismantle = true; + BUG_ON(dev->reg_state != NETREG_REGISTERED); + } + + /* If device is running, close it first. */ + list_for_each_entry(dev, head, unreg_list) + list_add_tail(&dev->close_list, &close_head); + dev_close_many(&close_head, true); - if (!list_empty(head)) { - rollback_registered_many(head); - list_for_each_entry(dev, head, unreg_list) - net_set_todo(dev); - list_del(head); + list_for_each_entry(dev, head, unreg_list) { + /* And unlink it from device chain. */ + unlist_netdevice(dev); + + dev->reg_state = NETREG_UNREGISTERING; } + flush_all_backlogs(); + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + struct sk_buff *skb = NULL; + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + dev_xdp_uninstall(dev); + + /* Notify protocols, that we are about to destroy + * this device. They should clean all the things. + */ + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); + + if (!dev->rtnl_link_ops || + dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, + GFP_KERNEL, NULL, 0); + + /* + * Flush the unicast and multicast chains + */ + dev_uc_flush(dev); + dev_mc_flush(dev); + + netdev_name_node_alt_flush(dev); + netdev_name_node_free(dev->name_node); + + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); + + if (skb) + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); + + /* Notifier chain MUST detach us all upper devices. */ + WARN_ON(netdev_has_any_upper_dev(dev)); + WARN_ON(netdev_has_any_lower_dev(dev)); + + /* Remove entries from kobject tree */ + netdev_unregister_kobject(dev); +#ifdef CONFIG_XPS + /* Remove XPS queueing entries */ + netif_reset_xps_queues_gt(dev, 0); +#endif + } + + synchronize_net(); + + list_for_each_entry(dev, head, unreg_list) { + dev_put(dev); + net_set_todo(dev); + } + + list_del(head); } EXPORT_SYMBOL(unregister_netdevice_many); @@ -11048,8 +11481,7 @@ static int __init net_dev_init(void) INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - sd->csd.func = rps_trigger_softirq; - sd->csd.info = sd; + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif |