From 5f8dc33e8ee7e59bee3bc6dc2088807a384b285a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Nov 2015 14:54:01 +0100 Subject: net: fix feature changes on devices without ndo_set_features When __netdev_update_features() was updated to ensure some features are disabled on new lower devices, an error was introduced for devices which don't have the ndo_set_features() method set. Before we'll just set the new features, but now we return an error and don't set them. Fix this by returning the old behaviour and setting err to 0 when ndo_set_features is not present. Fixes: e7868a85e1b2 ("net/core: ensure features get disabled on new lower devs") CC: Jarod Wilson CC: Jiri Pirko CC: Ido Schimmel CC: Sander Eikelenboom CC: Andy Gospodarek CC: Florian Fainelli Signed-off-by: Nikolay Aleksandrov Reviewed-by: Jiri Pirko Reviewed-by: Andy Gospodarek Reviewed-by: Jarod Wilson Tested-by: Florian Fainelli Tested-by: Dave Young Tested-by: Christian Borntraeger Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ab9b8d0d115e4..4a1d198dbbfff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6426,6 +6426,8 @@ int __netdev_update_features(struct net_device *dev) if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); + else + err = 0; if (unlikely(err < 0)) { netdev_err(dev, -- cgit v1.2.3 From 00ee5927177792a6e139d50b6b7564d35705556a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Nov 2015 15:20:24 +0100 Subject: net: fix __netdev_update_features return on ndo_set_features failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If ndo_set_features fails __netdev_update_features() will return -1 but this is wrong because it is expected to return 0 if no features were changed (see netdev_update_features()), which will cause a netdev notifier to be called without any actual changes. Fix this by returning 0 if ndo_set_features fails. Fixes: 6cb6a27c45ce ("net: Call netdev_features_change() from netdev_update_features()") CC: Michał Mirosław Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 4a1d198dbbfff..1974aee005a65 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6433,7 +6433,7 @@ int __netdev_update_features(struct net_device *dev) netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); - return -1; + return 0; } sync_lower: -- cgit v1.2.3 From 88ad4175b201ae24be5e9b7752cf33c1306b64e4 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Mon, 16 Nov 2015 19:16:40 +0100 Subject: net/core: use netdev name in warning if no parent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A recent flaw in the netdev feature setting resulted in warnings like this one from VLAN interfaces: WARNING: CPU: 1 PID: 4975 at net/core/dev.c:2419 skb_warn_bad_offload+0xbc/0xcb() : caps=(0x00000000001b5820, 0x00000000001b5829) len=2782 data_len=0 gso_size=1348 gso_type=16 ip_summed=3 The ":" is supposed to be preceded by a driver name, but in this case it is an empty string since the device has no parent. There are many types of network devices without a parent. The anonymous warnings for these devices can be hard to debug. Log the network device name instead in these cases to assist further debugging. This is mostly similar to how __netdev_printk() handles orphan devices. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- net/core/dev.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1974aee005a65..5dbc86ea6b580 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2403,17 +2403,20 @@ static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; struct net_device *dev = skb->dev; - const char *driver = ""; + const char *name = ""; if (!net_ratelimit()) return; - if (dev && dev->dev.parent) - driver = dev_driver_string(dev->dev.parent); - + if (dev) { + if (dev->dev.parent) + name = dev_driver_string(dev->dev.parent); + else + name = netdev_name(dev); + } WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " "gso_type=%d ip_summed=%d\n", - driver, dev ? &dev->features : &null_features, + name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features, skb->len, skb->data_len, skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, skb->ip_summed); -- cgit v1.2.3 From 17b85d29e82cc3c874a497a8bc5764d6a2b043e2 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 17 Nov 2015 15:49:06 +0100 Subject: net/core: revert "net: fix __netdev_update_features return.." and add comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 00ee59271777 ("net: fix __netdev_update_features return on ndo_set_features failure") and adds a comment explaining why it's okay to return a value other than 0 upon error. Some drivers might actually change flags and return an error so it's better to fire a spurious notification rather than miss these. CC: Michał Mirosław Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 5dbc86ea6b580..ae00b894e6755 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6436,7 +6436,10 @@ int __netdev_update_features(struct net_device *dev) netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); - return 0; + /* return non-0 since some features might have changed and + * it's better to fire a spurious notification than miss it + */ + return -1; } sync_lower: -- cgit v1.2.3 From 52bd2d62ce6758d811edcbd2256eb9ea7f6a56cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:50 -0800 Subject: net: better skb->sender_cpu and skb->napi_id cohabitation skb->sender_cpu and skb->napi_id share a common storage, and we had various bugs about this. We had to call skb_sender_cpu_clear() in some places to not leave a prior skb->napi_id and fool netdev_pick_tx() As suggested by Alexei, we could split the space so that these errors can not happen. 0 value being reserved as the common (not initialized) value, let's reserve [1 .. NR_CPUS] range for valid sender_cpu, and [NR_CPUS+1 .. ~0U] for valid napi_id. This will allow proper busy polling support over tunnels. Signed-off-by: Eric Dumazet Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 --- net/core/dev.c | 33 ++++++++++++++++----------------- 2 files changed, 16 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4355129fff91b..c9c394bf07710 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1082,9 +1082,6 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) static inline void skb_sender_cpu_clear(struct sk_buff *skb) { -#ifdef CONFIG_XPS - skb->sender_cpu = 0; -#endif } #ifdef NET_SKBUFF_DATA_USES_OFFSET diff --git a/net/core/dev.c b/net/core/dev.c index ae00b894e6755..2582c24a75c69 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL(dev_base_lock); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); -static unsigned int napi_gen_id; +static unsigned int napi_gen_id = NR_CPUS; static DEFINE_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; @@ -3021,7 +3021,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, int queue_index = 0; #ifdef CONFIG_XPS - if (skb->sender_cpu == 0) + u32 sender_cpu = skb->sender_cpu - 1; + + if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif @@ -4676,25 +4678,22 @@ EXPORT_SYMBOL_GPL(napi_by_id); void napi_hash_add(struct napi_struct *napi) { - if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { + if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + return; - spin_lock(&napi_hash_lock); + spin_lock(&napi_hash_lock); - /* 0 is not a valid id, we also skip an id that is taken - * we expect both events to be extremely rare - */ - napi->napi_id = 0; - while (!napi->napi_id) { - napi->napi_id = ++napi_gen_id; - if (napi_by_id(napi->napi_id)) - napi->napi_id = 0; - } + /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + do { + if (unlikely(++napi_gen_id < NR_CPUS + 1)) + napi_gen_id = NR_CPUS + 1; + } while (napi_by_id(napi_gen_id)); + napi->napi_id = napi_gen_id; - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); - spin_unlock(&napi_hash_lock); - } + spin_unlock(&napi_hash_lock); } EXPORT_SYMBOL_GPL(napi_hash_add); -- cgit v1.2.3 From 02d62e86fe892c59a1259d089d4d16ac76977a37 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:52 -0800 Subject: net: un-inline sk_busy_loop() There is really little gain from inlining this big function. We'll soon make it even bigger in following patches. This means we no longer need to export napi_by_id() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 --------- include/net/busy_poll.h | 45 +----------------------------------------- net/core/dev.c | 50 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 55 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bfac1abfc1a..2020a89df12bb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -460,15 +460,6 @@ static inline void napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } -/** - * napi_by_id - lookup a NAPI by napi_id - * @napi_id: hashed napi_id - * - * lookup @napi_id in napi_hash table - * must be called under rcu_read_lock() - */ -struct napi_struct *napi_by_id(unsigned int napi_id); - /** * napi_hash_add - add a NAPI to global hashtable * @napi: napi context diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 1d67fb6b23a04..2fbeb1313c0f4 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -72,50 +72,7 @@ static inline bool busy_loop_timeout(unsigned long end_time) return time_after(now, end_time); } -/* when used in sock_poll() nonblock is known at compile time to be true - * so the loop and end_time will be optimized out - */ -static inline bool sk_busy_loop(struct sock *sk, int nonblock) -{ - unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; - const struct net_device_ops *ops; - struct napi_struct *napi; - int rc = false; - - /* - * rcu read lock for napi hash - * bh so we don't race with net_rx_action - */ - rcu_read_lock_bh(); - - napi = napi_by_id(sk->sk_napi_id); - if (!napi) - goto out; - - ops = napi->dev->netdev_ops; - if (!ops->ndo_busy_poll) - goto out; - - do { - rc = ops->ndo_busy_poll(napi); - - if (rc == LL_FLUSH_FAILED) - break; /* permanent failure */ - - if (rc > 0) - /* local bh are disabled so it is ok to use _BH */ - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); - cpu_relax(); - - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && - !need_resched() && !busy_loop_timeout(end_time)); - - rc = !skb_queue_empty(&sk->sk_receive_queue); -out: - rcu_read_unlock_bh(); - return rc; -} +bool sk_busy_loop(struct sock *sk, int nonblock); /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 2582c24a75c69..74a816b299dfe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include #include @@ -4663,7 +4664,7 @@ void napi_complete_done(struct napi_struct *n, int work_done) EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) +static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; @@ -4674,7 +4675,52 @@ struct napi_struct *napi_by_id(unsigned int napi_id) return NULL; } -EXPORT_SYMBOL_GPL(napi_by_id); + +#if defined(CONFIG_NET_RX_BUSY_POLL) +bool sk_busy_loop(struct sock *sk, int nonblock) +{ + unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + const struct net_device_ops *ops; + struct napi_struct *napi; + int rc = false; + + /* + * rcu read lock for napi hash + * bh so we don't race with net_rx_action + */ + rcu_read_lock_bh(); + + napi = napi_by_id(sk->sk_napi_id); + if (!napi) + goto out; + + ops = napi->dev->netdev_ops; + if (!ops->ndo_busy_poll) + goto out; + + do { + rc = ops->ndo_busy_poll(napi); + + if (rc == LL_FLUSH_FAILED) + break; /* permanent failure */ + + if (rc > 0) + /* local bh are disabled so it is ok to use _BH */ + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + cpu_relax(); + + } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && + !need_resched() && !busy_loop_timeout(end_time)); + + rc = !skb_queue_empty(&sk->sk_receive_queue); +out: + rcu_read_unlock_bh(); + return rc; +} +EXPORT_SYMBOL(sk_busy_loop); + +#endif /* CONFIG_NET_RX_BUSY_POLL */ void napi_hash_add(struct napi_struct *napi) { -- cgit v1.2.3 From 2a028ecb76497d05e5cd4e3e8b09d965cac2e3f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:53 -0800 Subject: net: allow BH servicing in sk_busy_loop() Instead of blocking BH in whole sk_busy_loop(), block them only around ->ndo_busy_poll() calls. This has many benefits. 1) allow tunneled traffic to use busy poll as well as native traffic. Tunnels handlers usually call netif_rx() and depend on net_rx_action() being run (from sofirq handler) 2) allow RFS/RPS being used (sending IPI to other cpus if needed) 3) use the 'lets burn cpu cycles' budget to do useful work (like TX completions, timers, RCU callbacks...) 4) reduce BH latencies, making busy poll a better citizen. Tested: Tested with SIT tunnel lpaa5:~# echo 0 >/proc/sys/net/core/busy_read lpaa5:~# ./netperf -H 2002:af6:786::1 -t TCP_RR MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:786::1 () port 0 AF_INET6 : first burst 0 Local /Remote Socket Size Request Resp. Elapsed Trans. Send Recv Size Size Time Rate bytes Bytes bytes bytes secs. per sec 16384 87380 1 1 10.00 37373.93 16384 87380 Now enable busy poll on both hosts lpaa5:~# echo 70 >/proc/sys/net/core/busy_read lpaa6:~# echo 70 >/proc/sys/net/core/busy_read lpaa5:~# ./netperf -H 2002:af6:786::1 -t TCP_RR MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:786::1 () port 0 AF_INET6 : first burst 0 Local /Remote Socket Size Request Resp. Elapsed Trans. Send Recv Size Size Time Rate bytes Bytes bytes bytes secs. per sec 16384 87380 1 1 10.00 58314.77 16384 87380 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 74a816b299dfe..2002eec2617d6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4684,11 +4684,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock) struct napi_struct *napi; int rc = false; - /* - * rcu read lock for napi hash - * bh so we don't race with net_rx_action - */ - rcu_read_lock_bh(); + rcu_read_lock(); napi = napi_by_id(sk->sk_napi_id); if (!napi) @@ -4699,23 +4695,23 @@ bool sk_busy_loop(struct sock *sk, int nonblock) goto out; do { + local_bh_disable(); rc = ops->ndo_busy_poll(napi); + if (rc > 0) + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); + local_bh_enable(); if (rc == LL_FLUSH_FAILED) break; /* permanent failure */ - if (rc > 0) - /* local bh are disabled so it is ok to use _BH */ - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); cpu_relax(); - } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && !need_resched() && !busy_loop_timeout(end_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: - rcu_read_unlock_bh(); + rcu_read_unlock(); return rc; } EXPORT_SYMBOL(sk_busy_loop); -- cgit v1.2.3 From ce6aea93f7510437dde625b77a7a2f4d20b72660 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:54 -0800 Subject: net: network drivers no longer need to implement ndo_busy_poll() Instead of having to implement complex ndo_busy_poll() method, drivers can simply rely on NAPI poll logic. Busy polling gains are mainly coming from polling itself, not on exact details on how we poll the device. ndo_busy_poll() if implemented can avoid touching napi state, but it adds extra synchronization between normal napi->poll() and busy poll handler, slowing down the common path (non busy polling) with extra atomic operations. In practice few drivers ever got busy poll because of the complexity. We could go one step further, and make busy polling available for all NAPI drivers, but this would require that all netif_napi_del() calls are done in process context so that we can call synchronize_rcu(). Full audit would be required. Before this is done, a driver still needs to call : - skb_mark_napi_id() for each skb provided to the stack. - napi_hash_add() and napi_hash_del() to allocate a napi_id per napi struct. - Make sure RCU grace period is respected after napi_hash_del() before memory containing napi structure is freed. Followup patch implements busy poll for mlx5 driver as an example. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2002eec2617d6..93009610aee84 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4677,10 +4677,11 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) } #if defined(CONFIG_NET_RX_BUSY_POLL) +#define BUSY_POLL_BUDGET 8 bool sk_busy_loop(struct sock *sk, int nonblock) { unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; - const struct net_device_ops *ops; + int (*busy_poll)(struct napi_struct *dev); struct napi_struct *napi; int rc = false; @@ -4690,13 +4691,27 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (!napi) goto out; - ops = napi->dev->netdev_ops; - if (!ops->ndo_busy_poll) - goto out; + /* Note: ndo_busy_poll method is optional in linux-4.5 */ + busy_poll = napi->dev->netdev_ops->ndo_busy_poll; do { + rc = 0; local_bh_disable(); - rc = ops->ndo_busy_poll(napi); + if (busy_poll) { + rc = busy_poll(napi); + } else if (napi_schedule_prep(napi)) { + void *have = netpoll_poll_lock(napi); + + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi); + if (rc == BUSY_POLL_BUDGET) { + napi_complete_done(napi, rc); + napi_schedule(napi); + } + } + netpoll_poll_unlock(have); + } if (rc > 0) NET_ADD_STATS_BH(sock_net(sk), LINUX_MIB_BUSYPOLLRXPACKETS, rc); -- cgit v1.2.3 From 93f93a4404159ecf7e9148f5ad0718ec702ac4cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:30:59 -0800 Subject: net: move skb_mark_napi_id() into core networking stack We would like to automatically provide busy polling support to all NAPI drivers, without them having to implement anything. skb_mark_napi_id() can be called from napi_gro_receive() and napi_get_frags(). Few drivers are still calling skb_mark_napi_id() because they use netif_receive_skb(). They should eventually call napi_gro_receive() instead. I will leave this to drivers maintainers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 1 - drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 -- drivers/net/ethernet/chelsio/cxgb4/sge.c | 1 - drivers/net/ethernet/emulex/benet/be_main.c | 1 - drivers/net/ethernet/intel/i40e/i40e_txrx.c | 1 - drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 1 - drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 2 +- drivers/net/ethernet/sfc/rx.c | 1 - drivers/net/virtio_net.c | 2 -- net/core/dev.c | 2 ++ 13 files changed, 4 insertions(+), 16 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 53ce1222b11d1..8a9b493566c90 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2024,7 +2024,6 @@ read_again: skb->dev = netdev; skb->protocol = eth_type_trans(skb, netdev); skb_record_rx_queue(skb, channel->queue_index); - skb_mark_napi_id(skb, napi); napi_gro_receive(napi, skb); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index ca208a7eecd5a..ab9222924bd97 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1094,8 +1094,6 @@ reuse_rx: __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), le16_to_cpu(cqe_fp->vlan_tag)); - skb_mark_napi_id(skb, &fp->napi); - napi_gro_receive(&fp->napi, skb); next_rx: rx_buf->data = NULL; diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index b7b93e7a643d8..f650f295f2646 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -1864,7 +1864,6 @@ static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl, skb->truesize += skb->data_len; skb->ip_summed = CHECKSUM_UNNECESSARY; skb_record_rx_queue(skb, rxq->rspq.idx); - skb_mark_napi_id(skb, &rxq->rspq.napi); pi = netdev_priv(skb->dev); if (pi->rxtstamp) cxgb4_sgetim_to_hwtstamp(adapter, skb_hwtstamps(skb), diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index b6ad02909d6b3..c29d62496ad99 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2184,7 +2184,6 @@ static void be_rx_compl_process_gro(struct be_rx_obj *rxo, skb_set_hash(skb, rxcp->rss_hash, PKT_HASH_TYPE_L3); skb->csum_level = rxcp->tunneled; - skb_mark_napi_id(skb, napi); if (rxcp->vlanf) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), rxcp->vlan_tag); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 635b3ac17877b..6649ce4ba2de2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1632,7 +1632,6 @@ static int i40e_clean_rx_irq_ps(struct i40e_ring *rx_ring, int budget) continue; } #endif - skb_mark_napi_id(skb, &rx_ring->q_vector->napi); i40e_receive_skb(rx_ring, skb, vlan_tag); rx_desc->wb.qword1.status_error_len = 0; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 47e9a90d6b100..77968b184b1fb 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -1090,7 +1090,6 @@ static int i40e_clean_rx_irq_ps(struct i40e_ring *rx_ring, int budget) continue; } #endif - skb_mark_napi_id(skb, &rx_ring->q_vector->napi); i40e_receive_skb(rx_ring, skb, vlan_tag); rx_desc->wb.qword1.status_error_len = 0; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 47395ff5d908c..4089d776d01a4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1659,6 +1659,7 @@ static void ixgbe_process_skb_fields(struct ixgbe_ring *rx_ring, static void ixgbe_rx_skb(struct ixgbe_q_vector *q_vector, struct sk_buff *skb) { + skb_mark_napi_id(skb, &q_vector->napi); if (ixgbe_qv_busy_polling(q_vector)) netif_receive_skb(skb); else @@ -2123,7 +2124,6 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, } #endif /* IXGBE_FCOE */ - skb_mark_napi_id(skb, &q_vector->napi); ixgbe_rx_skb(q_vector, skb); /* update budget accounting */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 1feead34093b7..41440b2b20a36 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -925,7 +925,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud PKT_HASH_TYPE_L3); skb_record_rx_queue(gro_skb, cq->ring); - skb_mark_napi_id(gro_skb, &cq->napi); if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) { timestamp = mlx4_en_get_cqe_ts(cqe); @@ -988,8 +987,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud timestamp); } - skb_mark_napi_id(skb, &cq->napi); - napi_gro_receive(&cq->napi, skb); next: for (nr = 0; nr < priv->num_frags; nr++) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index fe752f8e24b9b..7c8c4088d1be0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -243,7 +243,6 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter); skb = rq->skb[wqe_counter]; prefetch(skb->data); - skb_mark_napi_id(skb, cq->napi); rq->skb[wqe_counter] = NULL; dma_unmap_single(rq->pdev, diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index 83651ac8ddb9d..acf866147d65c 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -1488,7 +1488,6 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) } myri10ge_vlan_rx(mgp->dev, va, skb); skb_record_rx_queue(skb, ss - &mgp->ss[0]); - skb_mark_napi_id(skb, &ss->napi); if (polling) { int hlen; @@ -1506,6 +1505,7 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum) skb->data_len -= hlen; skb->tail += hlen; skb->protocol = eth_type_trans(skb, dev); + skb_mark_napi_id(skb, &ss->napi); netif_receive_skb(skb); } else diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 809ea4610a77e..8956995b2fe71 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -463,7 +463,6 @@ efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf, skb_record_rx_queue(skb, channel->rx_queue.core_index); - skb_mark_napi_id(skb, &channel->napi_str); gro_result = napi_gro_frags(napi); if (gro_result != GRO_DROP) channel->irq_mod_score += 2; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d8838dedb7a4c..d1d14cecf4507 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -516,8 +516,6 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, skb_shinfo(skb)->gso_segs = 0; } - skb_mark_napi_id(skb, &rq->napi); - napi_gro_receive(&rq->napi, skb); return; diff --git a/net/core/dev.c b/net/core/dev.c index 93009610aee84..83b48747928cb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4356,6 +4356,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { + skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); @@ -4390,6 +4391,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); napi->skb = skb; + skb_mark_napi_id(skb, napi); } return skb; } -- cgit v1.2.3 From d64b5e85bfe2fe4c790abcbd16d9ae32391ddd7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:00 -0800 Subject: net: add netif_tx_napi_add() netif_tx_napi_add() is a variant of netif_napi_add() It should be used by drivers that use a napi structure to exclusively poll TX. We do not want to add this kind of napi in napi_hash[] in following patches, adding generic busy polling to all NAPI drivers. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 ++-- .../net/ethernet/freescale/fs_enet/fs_enet-main.c | 2 +- drivers/net/ethernet/freescale/gianfar.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 4 ++-- drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c | 4 ++-- drivers/net/ethernet/rocker/rocker.c | 2 +- drivers/net/ethernet/ti/cpsw.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- drivers/net/wireless/ath/wil6210/netdev.c | 2 +- include/linux/netdevice.h | 23 +++++++++++++++++++++- net/core/dev.c | 3 ++- 12 files changed, 38 insertions(+), 16 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 858106352ce9c..993c780bdfab9 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1216,7 +1216,7 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv, /* Initialize SW view of the ring */ spin_lock_init(&ring->lock); ring->priv = priv; - netif_napi_add(priv->netdev, &ring->napi, bcm_sysport_tx_poll, 64); + netif_tx_napi_add(priv->netdev, &ring->napi, bcm_sysport_tx_poll, 64); ring->index = index; ring->size = size; ring->alloc_size = ring->size; diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 17f017ab4dac6..b15a60d787c70 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -2041,11 +2041,11 @@ static void bcmgenet_init_tx_napi(struct bcmgenet_priv *priv) for (i = 0; i < priv->hw_params->tx_queues; ++i) { ring = &priv->tx_rings[i]; - netif_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, 64); + netif_tx_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, 64); } ring = &priv->tx_rings[DESC_INDEX]; - netif_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, 64); + netif_tx_napi_add(priv->dev, &ring->napi, bcmgenet_tx_poll, 64); } static void bcmgenet_enable_tx_napi(struct bcmgenet_priv *priv) diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index cf8e54652df95..48a9c176e0d15 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -1050,7 +1050,7 @@ static int fs_enet_probe(struct platform_device *ofdev) ndev->netdev_ops = &fs_enet_netdev_ops; ndev->watchdog_timeo = 2 * HZ; netif_napi_add(ndev, &fep->napi, fs_enet_rx_napi, fpi->napi_weight); - netif_napi_add(ndev, &fep->napi_tx, fs_enet_tx_napi, 2); + netif_tx_napi_add(ndev, &fep->napi_tx, fs_enet_tx_napi, 2); ndev->ethtool_ops = &fs_ethtool_ops; diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 3e6b9b437497d..c8bc43e99a355 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1347,12 +1347,12 @@ static int gfar_probe(struct platform_device *ofdev) if (priv->poll_mode == GFAR_SQ_POLLING) { netif_napi_add(dev, &priv->gfargrp[i].napi_rx, gfar_poll_rx_sq, GFAR_DEV_WEIGHT); - netif_napi_add(dev, &priv->gfargrp[i].napi_tx, + netif_tx_napi_add(dev, &priv->gfargrp[i].napi_tx, gfar_poll_tx_sq, 2); } else { netif_napi_add(dev, &priv->gfargrp[i].napi_rx, gfar_poll_rx, GFAR_DEV_WEIGHT); - netif_napi_add(dev, &priv->gfargrp[i].napi_tx, + netif_tx_napi_add(dev, &priv->gfargrp[i].napi_tx, gfar_poll_tx, 2); } } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index eb8a4988de632..3a6176fea78d0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -156,8 +156,8 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, cq->mcq.event = mlx4_en_cq_event; if (cq->is_tx) { - netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, - NAPI_POLL_WEIGHT); + netif_tx_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, + NAPI_POLL_WEIGHT); } else { netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64); napi_hash_add(&cq->napi); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c index d4b5085a21fa8..7bd6f25b4625f 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c @@ -1604,7 +1604,7 @@ int qlcnic_82xx_napi_add(struct qlcnic_adapter *adapter, if (qlcnic_check_multi_tx(adapter) && !adapter->ahw->diag_test) { for (ring = 0; ring < adapter->drv_tx_rings; ring++) { tx_ring = &adapter->tx_ring[ring]; - netif_napi_add(netdev, &tx_ring->napi, qlcnic_tx_poll, + netif_tx_napi_add(netdev, &tx_ring->napi, qlcnic_tx_poll, NAPI_POLL_WEIGHT); } } @@ -2135,7 +2135,7 @@ int qlcnic_83xx_napi_add(struct qlcnic_adapter *adapter, !(adapter->flags & QLCNIC_TX_INTR_SHARED)) { for (ring = 0; ring < adapter->drv_tx_rings; ring++) { tx_ring = &adapter->tx_ring[ring]; - netif_napi_add(netdev, &tx_ring->napi, + netif_tx_napi_add(netdev, &tx_ring->napi, qlcnic_83xx_msix_tx_poll, NAPI_POLL_WEIGHT); } diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index e9f2349e98bc0..a4ab71d43e4e9 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4998,7 +4998,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) dev->netdev_ops = &rocker_port_netdev_ops; dev->ethtool_ops = &rocker_port_ethtool_ops; dev->switchdev_ops = &rocker_port_switchdev_ops; - netif_napi_add(dev, &rocker_port->napi_tx, rocker_port_poll_tx, + netif_tx_napi_add(dev, &rocker_port->napi_tx, rocker_port_poll_tx, NAPI_POLL_WEIGHT); netif_napi_add(dev, &rocker_port->napi_rx, rocker_port_poll_rx, NAPI_POLL_WEIGHT); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 48b92c9de12a5..15322c08de80e 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2469,7 +2469,7 @@ static int cpsw_probe(struct platform_device *pdev) ndev->netdev_ops = &cpsw_netdev_ops; ndev->ethtool_ops = &cpsw_ethtool_ops; netif_napi_add(ndev, &priv->napi_rx, cpsw_rx_poll, CPSW_POLL_WEIGHT); - netif_napi_add(ndev, &priv->napi_tx, cpsw_tx_poll, CPSW_POLL_WEIGHT); + netif_tx_napi_add(ndev, &priv->napi_tx, cpsw_tx_poll, CPSW_POLL_WEIGHT); /* register the network device */ SET_NETDEV_DEV(ndev, &pdev->dev); diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 37b9b39192ec4..e5e20e734f21a 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1990,7 +1990,7 @@ static int netcp_create_interface(struct netcp_device *netcp_device, /* NAPI register */ netif_napi_add(ndev, &netcp->rx_napi, netcp_rx_poll, NETCP_NAPI_WEIGHT); - netif_napi_add(ndev, &netcp->tx_napi, netcp_tx_poll, NETCP_NAPI_WEIGHT); + netif_tx_napi_add(ndev, &netcp->tx_napi, netcp_tx_poll, NETCP_NAPI_WEIGHT); /* Register the network device */ ndev->dev_id = 0; diff --git a/drivers/net/wireless/ath/wil6210/netdev.c b/drivers/net/wireless/ath/wil6210/netdev.c index e3b3c8fb46055..56aaa2d4fb0ec 100644 --- a/drivers/net/wireless/ath/wil6210/netdev.c +++ b/drivers/net/wireless/ath/wil6210/netdev.c @@ -183,7 +183,7 @@ void *wil_if_alloc(struct device *dev) netif_napi_add(ndev, &wil->napi_rx, wil6210_netdev_poll_rx, WIL6210_NAPI_BUDGET); - netif_napi_add(ndev, &wil->napi_tx, wil6210_netdev_poll_tx, + netif_tx_napi_add(ndev, &wil->napi_tx, wil6210_netdev_poll_tx, WIL6210_NAPI_BUDGET); netif_tx_stop_all_queues(ndev); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2020a89df12bb..838935d1cdbb0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -326,7 +326,8 @@ enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_HASHED, /* In NAPI hash */ + NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ }; enum gro_result { @@ -1938,6 +1939,26 @@ static inline void *netdev_priv(const struct net_device *dev) void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight); +/** + * netif_tx_napi_add - initialize a napi context + * @dev: network device + * @napi: napi context + * @poll: polling function + * @weight: default weight + * + * This variant of netif_napi_add() should be used from drivers using NAPI + * to exclusively poll a TX queue. + * This will avoid we add it into napi_hash[], thus polluting this hash table. + */ +static inline void netif_tx_napi_add(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) +{ + set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); + netif_napi_add(dev, napi, poll, weight); +} + /** * netif_napi_del - remove a napi context * @napi: napi context diff --git a/net/core/dev.c b/net/core/dev.c index 83b48747928cb..ff58a8bc5e3c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4737,7 +4737,8 @@ EXPORT_SYMBOL(sk_busy_loop); void napi_hash_add(struct napi_struct *napi) { - if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || + test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) return; spin_lock(&napi_hash_lock); -- cgit v1.2.3 From 6180d9de61a5c461f9e3efef5417a844701dbbb2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:01 -0800 Subject: net: move napi_hash[] into read mostly section We do not often add/delete a napi context. Moving napi_hash[] into read mostly section avoids potential false sharing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/hashtable.h | 4 ++++ net/core/dev.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/include/linux/hashtable.h b/include/linux/hashtable.h index 519b6e2d769ed..661e5c2a8e2a4 100644 --- a/include/linux/hashtable.h +++ b/include/linux/hashtable.h @@ -16,6 +16,10 @@ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } +#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ + struct hlist_head name[1 << (bits)] __read_mostly = \ + { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } + #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] diff --git a/net/core/dev.c b/net/core/dev.c index ff58a8bc5e3c5..02dfbd91a8e46 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -184,7 +184,7 @@ EXPORT_SYMBOL(dev_base_lock); static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; -static DEFINE_HASHTABLE(napi_hash, 8); +static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; -- cgit v1.2.3 From 34cbe27e811c591c854a39c0dee1b461bb796953 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:02 -0800 Subject: net: napi_hash_del() returns a boolean status napi_hash_del() will soon be used from both drivers (if they want) or core networking stack. Callers are responsibles to ensure an RCU grace period is respected before freeing napi structure : napi_hash_del() can signal if this RCU grace period is needed or not. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++-- net/core/dev.c | 10 +++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 838935d1cdbb0..e5c33b29471b7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -474,9 +474,10 @@ void napi_hash_add(struct napi_struct *napi); * @napi: napi context * * Warning: caller must observe rcu grace period - * before freeing memory containing @napi + * before freeing memory containing @napi, if + * this function returns true. */ -void napi_hash_del(struct napi_struct *napi); +bool napi_hash_del(struct napi_struct *napi); /** * napi_disable - prevent NAPI from scheduling diff --git a/net/core/dev.c b/net/core/dev.c index 02dfbd91a8e46..59dddac1c2e70 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4760,14 +4760,18 @@ EXPORT_SYMBOL_GPL(napi_hash_add); /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ -void napi_hash_del(struct napi_struct *napi) +bool napi_hash_del(struct napi_struct *napi) { + bool rcu_sync_needed = false; + spin_lock(&napi_hash_lock); - if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) + if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { + rcu_sync_needed = true; hlist_del_rcu(&napi->napi_hash_node); - + } spin_unlock(&napi_hash_lock); + return rcu_sync_needed; } EXPORT_SYMBOL_GPL(napi_hash_del); -- cgit v1.2.3 From 93d05d4a320cb16712bb3d57a9658f395d8cecb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Nov 2015 06:31:03 -0800 Subject: net: provide generic busy polling to all NAPI drivers NAPI drivers no longer need to observe a particular protocol to benefit from busy polling (CONFIG_NET_RX_BUSY_POLL=y) napi_hash_add() and napi_hash_del() are automatically called from core networking stack, respectively from netif_napi_add() and netif_napi_del() This patch depends on free_netdev() and netif_napi_del() being called from process context, which seems to be the norm. Drivers might still prefer to call napi_hash_del() on their own, since they might combine all the rcu grace periods into a single one, knowing their NAPI structures lifetime, while core networking stack has no idea of a possible combining. Once this patch proves to not bring serious regressions, we will cleanup drivers to either remove napi_hash_del() or provide appropriate rcu grace periods combining. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 2 -- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 -- drivers/net/ethernet/chelsio/cxgb4/sge.c | 1 - drivers/net/ethernet/cisco/enic/enic_main.c | 2 -- drivers/net/ethernet/emulex/benet/be_main.c | 1 - drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 1 - drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 --- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 6 ++---- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 - drivers/net/ethernet/myricom/myri10ge/myri10ge.c | 1 - drivers/net/ethernet/sfc/efx.c | 1 - drivers/net/virtio_net.c | 1 - include/linux/netdevice.h | 7 +++++++ net/core/dev.c | 7 +++++++ 14 files changed, 16 insertions(+), 20 deletions(-) (limited to 'net/core/dev.c') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index ab9222924bd97..d9add7c02e42c 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -46,7 +46,6 @@ static void bnx2x_add_all_napi_cnic(struct bnx2x *bp) for_each_rx_queue_cnic(bp, i) { netif_napi_add(bp->dev, &bnx2x_fp(bp, i, napi), bnx2x_poll, NAPI_POLL_WEIGHT); - napi_hash_add(&bnx2x_fp(bp, i, napi)); } } @@ -58,7 +57,6 @@ static void bnx2x_add_all_napi(struct bnx2x *bp) for_each_eth_queue(bp, i) { netif_napi_add(bp->dev, &bnx2x_fp(bp, i, napi), bnx2x_poll, NAPI_POLL_WEIGHT); - napi_hash_add(&bnx2x_fp(bp, i, napi)); } } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index db15c5ee09c53..f2d0dc9b1c414 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4227,12 +4227,10 @@ static void bnxt_init_napi(struct bnxt *bp) bnapi = bp->bnapi[i]; netif_napi_add(bp->dev, &bnapi->napi, bnxt_poll, 64); - napi_hash_add(&bnapi->napi); } } else { bnapi = bp->bnapi[0]; netif_napi_add(bp->dev, &bnapi->napi, bnxt_poll, 64); - napi_hash_add(&bnapi->napi); } } diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index f650f295f2646..48d8fbb1c2200 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2527,7 +2527,6 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, goto err; netif_napi_add(dev, &iq->napi, napi_rx_handler, 64); - napi_hash_add(&iq->napi); iq->cur_desc = iq->desc; iq->cidx = 0; iq->gen = 1; diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index b36643ef0593a..b2182d3ba3cc4 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -2458,13 +2458,11 @@ static int enic_dev_init(struct enic *enic) switch (vnic_dev_get_intr_mode(enic->vdev)) { default: netif_napi_add(netdev, &enic->napi[0], enic_poll, 64); - napi_hash_add(&enic->napi[0]); break; case VNIC_DEV_INTR_MODE_MSIX: for (i = 0; i < enic->rq_count; i++) { netif_napi_add(netdev, &enic->napi[i], enic_poll_msix_rq, NAPI_POLL_WEIGHT); - napi_hash_add(&enic->napi[i]); } for (i = 0; i < enic->wq_count; i++) netif_napi_add(netdev, &enic->napi[enic_cq_wq(enic, i)], diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index c29d62496ad99..4cab8879f5ae4 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -2630,7 +2630,6 @@ static int be_evt_queues_create(struct be_adapter *adapter) eqo->affinity_mask); netif_napi_add(adapter->netdev, &eqo->napi, be_poll, BE_NAPI_WEIGHT); - napi_hash_add(&eqo->napi); } return 0; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c index f3168bcc7d879..e771e764daa3a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c @@ -844,7 +844,6 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter, /* initialize NAPI */ netif_napi_add(adapter->netdev, &q_vector->napi, ixgbe_poll, 64); - napi_hash_add(&q_vector->napi); #ifdef CONFIG_NET_RX_BUSY_POLL /* initialize busy poll */ diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 592ff237d6920..2955186cd4f68 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -2483,9 +2483,6 @@ static int ixgbevf_alloc_q_vectors(struct ixgbevf_adapter *adapter) q_vector->v_idx = q_idx; netif_napi_add(adapter->netdev, &q_vector->napi, ixgbevf_poll, 64); -#ifdef CONFIG_NET_RX_BUSY_POLL - napi_hash_add(&q_vector->napi); -#endif adapter->q_vector[q_idx] = q_vector; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 3a6176fea78d0..af975a2b74c6d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -155,13 +155,11 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, cq->mcq.comp = cq->is_tx ? mlx4_en_tx_irq : mlx4_en_rx_irq; cq->mcq.event = mlx4_en_cq_event; - if (cq->is_tx) { + if (cq->is_tx) netif_tx_napi_add(cq->dev, &cq->napi, mlx4_en_poll_tx_cq, NAPI_POLL_WEIGHT); - } else { + else netif_napi_add(cq->dev, &cq->napi, mlx4_en_poll_rx_cq, 64); - napi_hash_add(&cq->napi); - } napi_enable(&cq->napi); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index ffb1f9c1b9730..f6a8cc7876035 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -982,7 +982,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, mlx5e_build_channeltc_to_txq_map(priv, ix); netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64); - napi_hash_add(&c->napi); err = mlx5e_open_tx_cqs(c, cparam); if (err) diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c index acf866147d65c..270c9eeb7ab62 100644 --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c @@ -3814,7 +3814,6 @@ static int myri10ge_alloc_slices(struct myri10ge_priv *mgp) ss->dev = mgp->dev; netif_napi_add(ss->dev, &ss->napi, myri10ge_poll, myri10ge_napi_weight); - napi_hash_add(&ss->napi); } return 0; abort: diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index a3c42a376741f..4e82bcfbe3e09 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -2059,7 +2059,6 @@ static void efx_init_napi_channel(struct efx_channel *channel) channel->napi_dev = efx->net_dev; netif_napi_add(channel->napi_dev, &channel->napi_str, efx_poll, napi_weight); - napi_hash_add(&channel->napi_str); efx_channel_busy_poll_init(channel); } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d1d14cecf4507..b1ae4cbf24534 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1610,7 +1610,6 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) vi->rq[i].pages = NULL; netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll, napi_weight); - napi_hash_add(&vi->rq[i].napi); sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e5c33b29471b7..7d2d1d7aaec7c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -466,6 +466,9 @@ static inline void napi_complete(struct napi_struct *n) * @napi: napi context * * generate a new napi_id and store a @napi under it in napi_hash + * Used for busy polling (CONFIG_NET_RX_BUSY_POLL) + * Note: This is normally automatically done from netif_napi_add(), + * so might disappear in a future linux version. */ void napi_hash_add(struct napi_struct *napi); @@ -476,6 +479,10 @@ void napi_hash_add(struct napi_struct *napi); * Warning: caller must observe rcu grace period * before freeing memory containing @napi, if * this function returns true. + * Note: core networking stack automatically calls it + * from netif_napi_del() + * Drivers might want to call this helper to combine all + * the needed rcu grace periods into a single one. */ bool napi_hash_del(struct napi_struct *napi); diff --git a/net/core/dev.c b/net/core/dev.c index 59dddac1c2e70..41cef3e3f558b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4807,6 +4807,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); + napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); @@ -4826,8 +4827,12 @@ void napi_disable(struct napi_struct *n) } EXPORT_SYMBOL(napi_disable); +/* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { + might_sleep(); + if (napi_hash_del(napi)) + synchronize_net(); list_del_init(&napi->dev_list); napi_free_frags(napi); @@ -7227,11 +7232,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs); * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. * If this is the last reference then it will be freed. + * Must be called in process context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + might_sleep(); netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS kvfree(dev->_rx); -- cgit v1.2.3 From e2f9dc3bd213792ac006e83f50a5453f23b8c354 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Nov 2015 12:11:23 -0800 Subject: net: avoid NULL deref in napi_get_frags() napi_alloc_skb() can return NULL. We should not crash should this happen. Fixes: 93f93a440415 ("net: move skb_mark_napi_id() into core networking stack") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 41cef3e3f558b..5df6cbce727c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4390,8 +4390,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); - napi->skb = skb; - skb_mark_napi_id(skb, napi); + if (skb) { + napi->skb = skb; + skb_mark_napi_id(skb, napi); + } } return skb; } -- cgit v1.2.3