Diffstat (limited to 'net')
-rw-r--r--  net/bridge/br_forward.c | 3
-rw-r--r--  net/bridge/br_multicast.c | 9
-rw-r--r--  net/bridge/netfilter/ebtables.c | 14
-rw-r--r--  net/caif/cfctrl.c | 4
-rw-r--r--  net/core/Makefile | 1
-rw-r--r--  net/core/datagram.c | 2
-rw-r--r--  net/core/dev.c | 71
-rw-r--r--  net/core/devmem.c | 8
-rw-r--r--  net/core/devmem.h | 2
-rw-r--r--  net/core/dst.c | 2
-rw-r--r--  net/core/filter.c | 66
-rw-r--r--  net/core/lwt_bpf.c | 4
-rw-r--r--  net/core/net-sysfs.c | 4
-rw-r--r--  net/core/netdev-genl.c | 122
-rw-r--r--  net/core/netdev_queues.c | 27
-rw-r--r--  net/core/netdev_rx_queue.c | 9
-rw-r--r--  net/core/pktgen.c | 7
-rw-r--r--  net/core/skbuff.c | 4
-rw-r--r--  net/core/sock.c | 90
-rw-r--r--  net/core/xdp.c | 21
-rw-r--r--  net/devlink/health.c | 109
-rw-r--r--  net/devlink/netlink_gen.c | 5
-rw-r--r--  net/devlink/param.c | 5
-rw-r--r--  net/devlink/port.c | 33
-rw-r--r--  net/ethtool/ioctl.c | 25
-rw-r--r--  net/ethtool/rss.c | 27
-rw-r--r--  net/hsr/hsr_slave.c | 5
-rw-r--r--  net/ipv4/arp.c | 2
-rw-r--r--  net/ipv4/cipso_ipv4.c | 13
-rw-r--r--  net/ipv4/esp4.c | 4
-rw-r--r--  net/ipv4/fib_frontend.c | 7
-rw-r--r--  net/ipv4/fib_rules.c | 4
-rw-r--r--  net/ipv4/fou_nl.c | 4
-rw-r--r--  net/ipv4/icmp.c | 33
-rw-r--r--  net/ipv4/inet_connection_sock.c | 30
-rw-r--r--  net/ipv4/inet_diag.c | 570
-rw-r--r--  net/ipv4/inet_hashtables.c | 36
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 5
-rw-r--r--  net/ipv4/ip_fragment.c | 6
-rw-r--r--  net/ipv4/ip_gre.c | 4
-rw-r--r--  net/ipv4/ip_input.c | 11
-rw-r--r--  net/ipv4/ip_options.c | 5
-rw-r--r--  net/ipv4/ip_output.c | 3
-rw-r--r--  net/ipv4/ipmr.c | 9
-rw-r--r--  net/ipv4/netfilter.c | 9
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_dup_ipv4.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_reject_ipv4.c | 27
-rw-r--r--  net/ipv4/netfilter/nf_socket_ipv4.c | 3
-rw-r--r--  net/ipv4/netfilter/nf_tproxy_ipv4.c | 5
-rw-r--r--  net/ipv4/netfilter/nft_fib_ipv4.c | 4
-rw-r--r--  net/ipv4/nexthop.c | 42
-rw-r--r--  net/ipv4/ping.c | 68
-rw-r--r--  net/ipv4/proc.c | 65
-rw-r--r--  net/ipv4/raw.c | 7
-rw-r--r--  net/ipv4/raw_diag.c | 10
-rw-r--r--  net/ipv4/route.c | 28
-rw-r--r--  net/ipv4/tcp.c | 34
-rw-r--r--  net/ipv4/tcp_cdg.c | 2
-rw-r--r--  net/ipv4/tcp_diag.c | 461
-rw-r--r--  net/ipv4/tcp_fastopen.c | 7
-rw-r--r--  net/ipv4/tcp_input.c | 18
-rw-r--r--  net/ipv4/tcp_ipv4.c | 26
-rw-r--r--  net/ipv4/tcp_metrics.c | 6
-rw-r--r--  net/ipv4/tcp_minisocks.c | 1
-rw-r--r--  net/ipv4/tcp_offload.c | 3
-rw-r--r--  net/ipv4/tcp_output.c | 19
-rw-r--r--  net/ipv4/tcp_timer.c | 6
-rw-r--r--  net/ipv4/udp.c | 20
-rw-r--r--  net/ipv4/udp_diag.c | 10
-rw-r--r--  net/ipv4/udp_tunnel_core.c | 3
-rw-r--r--  net/ipv4/xfrm4_policy.c | 4
-rw-r--r--  net/ipv6/Kconfig | 7
-rw-r--r--  net/ipv6/addrconf.c | 4
-rw-r--r--  net/ipv6/anycast.c | 2
-rw-r--r--  net/ipv6/datagram.c | 2
-rw-r--r--  net/ipv6/esp6.c | 4
-rw-r--r--  net/ipv6/icmp.c | 9
-rw-r--r--  net/ipv6/inet6_hashtables.c | 51
-rw-r--r--  net/ipv6/ip6_gre.c | 10
-rw-r--r--  net/ipv6/ip6_output.c | 64
-rw-r--r--  net/ipv6/mcast.c | 67
-rw-r--r--  net/ipv6/ndisc.c | 4
-rw-r--r--  net/ipv6/netfilter.c | 5
-rw-r--r--  net/ipv6/netfilter/nf_reject_ipv6.c | 37
-rw-r--r--  net/ipv6/netfilter/nf_socket_ipv6.c | 3
-rw-r--r--  net/ipv6/netfilter/nf_tproxy_ipv6.c | 5
-rw-r--r--  net/ipv6/output_core.c | 8
-rw-r--r--  net/ipv6/ping.c | 1
-rw-r--r--  net/ipv6/proc.c | 91
-rw-r--r--  net/ipv6/raw.c | 9
-rw-r--r--  net/ipv6/route.c | 7
-rw-r--r--  net/ipv6/seg6.c | 7
-rw-r--r--  net/ipv6/seg6_hmac.c | 211
-rw-r--r--  net/ipv6/sit.c | 104
-rw-r--r--  net/ipv6/tcp_ipv6.c | 24
-rw-r--r--  net/ipv6/tcpv6_offload.c | 3
-rw-r--r--  net/ipv6/udp.c | 6
-rw-r--r--  net/iucv/af_iucv.c | 4
-rw-r--r--  net/mctp/af_mctp.c | 2
-rw-r--r--  net/mptcp/crypto.c | 35
-rw-r--r--  net/mptcp/mib.c | 12
-rw-r--r--  net/mptcp/mptcp_diag.c | 15
-rw-r--r--  net/mptcp/pm.c | 28
-rw-r--r--  net/mptcp/protocol.c | 26
-rw-r--r--  net/mptcp/protocol.h | 4
-rw-r--r--  net/mptcp/subflow.c | 11
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 39
-rw-r--r--  net/netfilter/nf_tables_api.c | 47
-rw-r--r--  net/netfilter/nft_flow_offload.c | 4
-rw-r--r--  net/netfilter/nft_payload.c | 20
-rw-r--r--  net/netfilter/nft_set_hash.c | 100
-rw-r--r--  net/netfilter/nft_set_pipapo.c | 89
-rw-r--r--  net/netfilter/nft_set_pipapo.h | 8
-rw-r--r--  net/netfilter/nft_set_pipapo_avx2.c | 137
-rw-r--r--  net/netfilter/nft_set_pipapo_avx2.h | 4
-rw-r--r--  net/netfilter/nft_set_rbtree.c | 35
-rw-r--r--  net/netlink/af_netlink.c | 4
-rw-r--r--  net/openvswitch/flow.c | 12
-rw-r--r--  net/openvswitch/flow_table.c | 7
-rw-r--r--  net/packet/af_packet.c | 2
-rw-r--r--  net/phonet/af_phonet.c | 4
-rw-r--r--  net/phonet/pep.c | 6
-rw-r--r--  net/phonet/socket.c | 25
-rw-r--r--  net/rds/af_rds.c | 2
-rw-r--r--  net/rds/connection.c | 9
-rw-r--r--  net/rds/ib_mr.h | 1
-rw-r--r--  net/rds/ib_recv.c | 2
-rw-r--r--  net/rds/message.c | 4
-rw-r--r--  net/rds/rds.h | 2
-rw-r--r--  net/rds/recv.c | 4
-rw-r--r--  net/rds/send.c | 4
-rw-r--r--  net/sched/act_api.c | 12
-rw-r--r--  net/sched/act_simple.c | 1
-rw-r--r--  net/sched/act_skbmod.c | 22
-rw-r--r--  net/sched/act_tunnel_key.c | 16
-rw-r--r--  net/sched/act_vlan.c | 16
-rw-r--r--  net/sched/sch_api.c | 4
-rw-r--r--  net/sctp/Kconfig | 47
-rw-r--r--  net/sctp/auth.c | 166
-rw-r--r--  net/sctp/chunk.c | 3
-rw-r--r--  net/sctp/diag.c | 2
-rw-r--r--  net/sctp/endpointola.c | 23
-rw-r--r--  net/sctp/proc.c | 12
-rw-r--r--  net/sctp/protocol.c | 14
-rw-r--r--  net/sctp/sm_make_chunk.c | 60
-rw-r--r--  net/sctp/sm_statefuns.c | 5
-rw-r--r--  net/sctp/socket.c | 41
-rw-r--r--  net/sctp/sysctl.c | 49
-rw-r--r--  net/smc/smc_ib.c | 18
-rw-r--r--  net/smc/smc_ism.c | 13
-rw-r--r--  net/smc/smc_pnet.c | 2
-rw-r--r--  net/tipc/socket.c | 6
-rw-r--r--  net/tls/tls_proc.c | 10
-rw-r--r--  net/vmw_vsock/af_vsock.c | 7
-rw-r--r--  net/xfrm/xfrm_policy.c | 16
-rw-r--r--  net/xfrm/xfrm_proc.c | 12
157 files changed, 2228 insertions, 2123 deletions
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 29097e984b4f..870bdf2e082c 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -148,7 +148,8 @@ void br_forward(const struct net_bridge_port *to,
goto out;
/* redirect to backup link if the destination port is down */
- if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
+ if (rcu_access_pointer(to->backup_port) &&
+ (!netif_carrier_ok(to->dev) || !netif_running(to->dev))) {
struct net_bridge_port *backup_port;
backup_port = rcu_dereference(to->backup_port);
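
The net effect of the widened test: the backup port is now used not only on carrier loss but also when the destination port is administratively down. A hedged summary of the new redirect decision, assuming backup_port is configured:

    /* carrier  netif_running  ->  redirect to backup port?
     * up       yes                no  (deliver to 'to' as usual)
     * down     yes                yes (previous behaviour)
     * any      no                 yes (new: admin-down also redirects)
     */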
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 8ce145938b02..22d12e545966 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4049,8 +4049,7 @@ int br_multicast_rcv(struct net_bridge_mcast **brmctx,
}
static void br_multicast_query_expired(struct net_bridge_mcast *brmctx,
- struct bridge_mcast_own_query *query,
- struct bridge_mcast_querier *querier)
+ struct bridge_mcast_own_query *query)
{
spin_lock(&brmctx->br->multicast_lock);
if (br_multicast_ctx_vlan_disabled(brmctx))
@@ -4069,8 +4068,7 @@ static void br_ip4_multicast_query_expired(struct timer_list *t)
struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
ip4_own_query.timer);
- br_multicast_query_expired(brmctx, &brmctx->ip4_own_query,
- &brmctx->ip4_querier);
+ br_multicast_query_expired(brmctx, &brmctx->ip4_own_query);
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -4079,8 +4077,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t)
struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
ip6_own_query.timer);
- br_multicast_query_expired(brmctx, &brmctx->ip6_own_query,
- &brmctx->ip6_querier);
+ br_multicast_query_expired(brmctx, &brmctx->ip6_own_query);
}
#endif
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 3e67d4aff419..5697e3949a36 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -920,8 +920,8 @@ static int translate_table(struct net *net, const char *name,
* if an error occurs
*/
newinfo->chainstack =
- vmalloc(array_size(nr_cpu_ids,
- sizeof(*(newinfo->chainstack))));
+ vmalloc_array(nr_cpu_ids,
+ sizeof(*(newinfo->chainstack)));
if (!newinfo->chainstack)
return -ENOMEM;
for_each_possible_cpu(i) {
@@ -938,7 +938,7 @@ static int translate_table(struct net *net, const char *name,
}
}
- cl_s = vmalloc(array_size(udc_cnt, sizeof(*cl_s)));
+ cl_s = vmalloc_array(udc_cnt, sizeof(*cl_s));
if (!cl_s)
return -ENOMEM;
i = 0; /* the i'th udc */
@@ -1018,8 +1018,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
* the check on the size is done later, when we have the lock
*/
if (repl->num_counters) {
- unsigned long size = repl->num_counters * sizeof(*counterstmp);
- counterstmp = vmalloc(size);
+ counterstmp = vmalloc_array(repl->num_counters,
+ sizeof(*counterstmp));
if (!counterstmp)
return -ENOMEM;
}
@@ -1386,7 +1386,7 @@ static int do_update_counters(struct net *net, const char *name,
if (num_counters == 0)
return -EINVAL;
- tmp = vmalloc(array_size(num_counters, sizeof(*tmp)));
+ tmp = vmalloc_array(num_counters, sizeof(*tmp));
if (!tmp)
return -ENOMEM;
@@ -1526,7 +1526,7 @@ static int copy_counters_to_user(struct ebt_table *t,
if (num_counters != nentries)
return -EINVAL;
- counterstmp = vmalloc(array_size(nentries, sizeof(*counterstmp)));
+ counterstmp = vmalloc_array(nentries, sizeof(*counterstmp));
if (!counterstmp)
return -ENOMEM;
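
The conversions in this file all follow one pattern: vmalloc_array(n, size) performs the n * size multiplication with overflow checking and returns NULL on overflow, so the open-coded array_size() (or the raw multiply in do_replace_finish()) can go away. A minimal sketch of the idiom:

    #include <linux/vmalloc.h>

    struct ebt_counter *tmp;

    /* overflow-checked: NULL instead of a short allocation if
     * num_counters * sizeof(*tmp) would wrap
     */
    tmp = vmalloc_array(num_counters, sizeof(*tmp));
    if (!tmp)
            return -ENOMEM;
    /* ... fill and use tmp ... */
    vfree(tmp);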
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index 06b604cf9d58..2aa1e7d46eb2 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -257,9 +257,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
cfpkt_add_body(pkt, &tmp16, 2);
tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs);
cfpkt_add_body(pkt, &tmp16, 2);
- memset(utility_name, 0, sizeof(utility_name));
- strscpy(utility_name, param->u.utility.name,
- UTILITY_NAME_LENGTH);
+ strscpy_pad(utility_name, param->u.utility.name);
cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH);
tmp8 = param->u.utility.paramlen;
cfpkt_add_body(pkt, &tmp8, 1);
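
strscpy_pad() copies like strscpy() and then zero-fills the remainder of the destination, so the separate memset() becomes redundant; with a fixed-size destination array the length argument can also be dropped, since the macro infers it. Roughly, under those assumptions:

    char utility_name[UTILITY_NAME_LENGTH];

    /* before:
     *     memset(utility_name, 0, sizeof(utility_name));
     *     strscpy(utility_name, src, UTILITY_NAME_LENGTH);
     * after, equivalent in a single call:
     */
    strscpy_pad(utility_name, src);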
diff --git a/net/core/Makefile b/net/core/Makefile
index b2a76ce33932..9ef2099c5426 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-y += hotdata.o
obj-y += netdev_rx_queue.o
+obj-y += netdev_queues.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f474b9b120f9..cb4b9ef2e4e3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
spin_unlock_bh(&sk_queue->lock);
}
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
return err;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 93a25d87b86b..1d1650d9ecff 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4849,9 +4849,40 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
return hash_32(hash, flow_table->log);
}
+#ifdef CONFIG_RFS_ACCEL
+/**
+ * rps_flow_is_active - check whether the flow was recently active.
+ * @rflow: Specific flow to check activity.
+ * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @cpu: CPU saved in @rflow.
+ *
+ * If the CPU has processed many packets since the flow's last activity
+ * (beyond 10 times the table size), the flow is considered stale.
+ *
+ * Return: true if flow was recently active.
+ */
+static bool rps_flow_is_active(struct rps_dev_flow *rflow,
+ struct rps_dev_flow_table *flow_table,
+ unsigned int cpu)
+{
+ unsigned int flow_last_active;
+ unsigned int sd_input_head;
+
+ if (cpu >= nr_cpu_ids)
+ return false;
+
+ sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
+ flow_last_active = READ_ONCE(rflow->last_qtail);
+
+ return (int)(sd_input_head - flow_last_active) <
+ (int)(10 << flow_table->log);
+}
+#endif
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow *rflow, u16 next_cpu)
+ struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
+ u32 flow_id)
{
if (next_cpu < nr_cpu_ids) {
u32 head;
@@ -4859,8 +4890,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
+ struct rps_dev_flow *tmp_rflow;
+ unsigned int tmp_cpu;
u16 rxq_index;
- u32 flow_id;
int rc;
/* Should we steer this flow to a different hardware queue? */
@@ -4875,14 +4907,29 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = rfs_slot(skb_get_hash(skb), flow_table);
+
+ tmp_rflow = &flow_table->flows[flow_id];
+ tmp_cpu = READ_ONCE(tmp_rflow->cpu);
+
+ if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
+ if (rps_flow_is_active(tmp_rflow, flow_table,
+ tmp_cpu)) {
+ if (hash != READ_ONCE(tmp_rflow->hash) ||
+ next_cpu == tmp_cpu)
+ goto out;
+ }
+ }
+
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
+
old_rflow = rflow;
- rflow = &flow_table->flows[flow_id];
+ rflow = tmp_rflow;
WRITE_ONCE(rflow->filter, rc);
+ WRITE_ONCE(rflow->hash, hash);
+
if (old_rflow->filter == rc)
WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out:
@@ -4908,6 +4955,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
+ u32 flow_id;
u32 tcpu;
u32 hash;
@@ -4954,7 +5002,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
+ flow_id = rfs_slot(hash, flow_table);
+ rflow = &flow_table->flows[flow_id];
tcpu = rflow->cpu;
/*
@@ -4973,7 +5022,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
- rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
+ flow_id);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
@@ -5017,17 +5067,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id < (1UL << flow_table->log)) {
+ unsigned int cpu;
+
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
- if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
- ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
- READ_ONCE(rflow->last_qtail)) <
- (int)(10 << flow_table->log)))
+ if (READ_ONCE(rflow->filter) == filter_id &&
+ rps_flow_is_active(rflow, flow_table, cpu))
expire = false;
}
rcu_read_unlock();
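
For reference, the staleness window now shared by set_rps_cpu() and rps_may_expire_flow() via rps_flow_is_active() is 10 << log packets, i.e. ten times the flow table size; the signed subtraction keeps the comparison correct across u32 wraparound of input_queue_head. A worked example, assuming flow_table->log = 9 (a 512-entry table):

    /* activity window = 10 << 9 = 5120 packets
     *
     * input_queue_head = 100000, last_qtail = 96000:
     *     (int)(100000 - 96000) = 4000  < 5120  -> flow active
     * input_queue_head = 100000, last_qtail = 90000:
     *     (int)(100000 - 90000) = 10000 >= 5120 -> flow stale
     */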
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 24c591ab38ae..d9de31a6cc7f 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -176,6 +176,7 @@ err_close_rxq:
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
@@ -188,6 +189,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
unsigned long virtual;
int err;
+ if (!dma_dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't support DMA");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
dmabuf = dma_buf_get(dmabuf_fd);
if (IS_ERR(dmabuf))
return ERR_CAST(dmabuf);
@@ -209,7 +215,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
binding->dmabuf = dmabuf;
binding->direction = direction;
- binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
+ binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev);
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 41cd6e1c9141..101150d761af 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -85,6 +85,7 @@ struct dmabuf_genpool_chunk_owner {
void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack);
@@ -170,6 +171,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
static inline struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd,
struct netdev_nl_sock *priv,
diff --git a/net/core/dst.c b/net/core/dst.c
index e2de8b68c41d..e9d35f49c9e7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->ops->ifdown(dst, dev);
WRITE_ONCE(dst->input, dst_discard);
WRITE_ONCE(dst->output, dst_discard_out);
- WRITE_ONCE(dst->dev, blackhole_netdev);
+ rcu_assign_pointer(dst->dev_rcu, blackhole_netdev);
netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
GFP_ATOMIC);
}
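
With the device published through dst->dev_rcu via rcu_assign_pointer(), readers are expected to fetch it inside an RCU read-side section rather than through a plain load; the sk_setup_caps() hunk later in this diff follows exactly that pattern. A minimal reader sketch:

    const struct net_device *dev;
    netdev_features_t features;

    rcu_read_lock();
    dev = dst_dev_rcu(dst);     /* may now be blackhole_netdev */
    features = dev->features;
    rcu_read_unlock();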
diff --git a/net/core/filter.c b/net/core/filter.c
index da391e2b0788..b005363f482c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2373,7 +2373,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct flowi4 fl4 = {
.flowi4_flags = FLOWI_FLAG_ANYSRC,
.flowi4_mark = skb->mark,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)),
+ .flowi4_dscp = ip4h_dscp(ip4h),
.flowi4_oif = dev->ifindex,
.flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
@@ -6020,7 +6020,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
fl4.flowi4_iif = params->ifindex;
fl4.flowi4_oif = 0;
}
- fl4.flowi4_tos = params->tos & INET_DSCP_MASK;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos);
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
@@ -6767,7 +6767,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
{
- struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
bool refcounted = false;
struct sock *sk = NULL;
@@ -6776,7 +6775,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
__be32 dst4 = tuple->ipv4.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet_lookup(net, hinfo, NULL, 0,
+ sk = __inet_lookup(net, NULL, 0,
src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &refcounted);
@@ -6790,7 +6789,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet6_lookup(net, hinfo, NULL, 0,
+ sk = __inet6_lookup(net, NULL, 0,
src6, tuple->ipv6.sport,
dst6, ntohs(tuple->ipv6.dport),
dif, sdif, &refcounted);
@@ -11990,6 +11989,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return func;
}
+/**
+ * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area.
+ * @skb: socket buffer carrying the metadata
+ * @offset: offset into the metadata area, must be <= skb_metadata_len()
+ */
+void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
+{
+ return skb_metadata_end(skb) - skb_metadata_len(skb) + offset;
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
struct bpf_dynptr *ptr__uninit)
@@ -12007,6 +12016,42 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
return 0;
}
+/**
+ * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area.
+ * @skb_: socket buffer carrying the metadata
+ * @flags: future use, must be zero
+ * @ptr__uninit: dynptr to initialize
+ *
+ * Set up a dynptr for access to the metadata area earlier allocated from the
+ * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
+ * &__sk_buff->data_meta.
+ *
+ * If passed @skb_ is a clone which shares the data with the original, the
+ * dynptr will be read-only. This limitation may be lifted in the future.
+ *
+ * Return:
+ * * %0 - dynptr ready to use
+ * * %-EINVAL - invalid flags, dynptr set to null
+ */
+__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)skb_;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb));
+
+ if (skb_cloned(skb))
+ bpf_dynptr_set_rdonly(ptr);
+
+ return 0;
+}
+
__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
struct bpf_dynptr *ptr__uninit)
{
@@ -12181,6 +12226,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)
+
BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
@@ -12202,6 +12251,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.set = &bpf_kfunc_check_set_skb,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_skb_meta,
+};
+
static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_xdp,
@@ -12237,6 +12291,8 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
&bpf_kfunc_set_sock_addr);
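
To show how the new kfunc is meant to be consumed, here is a hedged sketch of a tc BPF program reading metadata that an earlier XDP program reserved with bpf_xdp_adjust_meta(); the extern-declaration pattern and the 4-byte layout are assumptions for illustration, not part of this patch:

    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>

    extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
                                        struct bpf_dynptr *ptr__uninit) __ksym;

    SEC("tc")
    int read_meta(struct __sk_buff *skb)
    {
            struct bpf_dynptr meta;
            __u32 mark;

            if (bpf_dynptr_from_skb_meta(skb, 0, &meta))
                    return 0; /* TC_ACT_OK */

            /* read the 4 bytes the XDP program stored at offset 0 */
            if (!bpf_dynptr_read(&mark, sizeof(mark), &meta, 0, 0))
                    skb->mark = mark;

            return 0; /* TC_ACT_OK */
    }

    char LICENSE[] SEC("license") = "GPL";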
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index ae74634310a3..9f40be0c3e71 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -8,12 +8,12 @@
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
+#include <net/flow.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>
-#include <net/inet_dscp.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
fl4.flowi4_oif = oif;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_uid = sock_net_uid(net, sk);
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
fl4.flowi4_proto = iph->protocol;
fl4.daddr = iph->daddr;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c28cd6665444..5ea9f64adce3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
return -ENOMEM;
table->log = ilog2(mask) + 1;
- for (count = 0; count <= mask; count++)
+ for (count = 0; count <= mask; count++) {
table->flows[count].cpu = RPS_NO_CPU;
+ table->flows[count].filter = RPS_NO_FILTER;
+ }
} else {
table = NULL;
}
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 6314eb7bdf69..470fabbeacd9 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -869,16 +869,79 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
return err;
}
-int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+static int netdev_nl_read_rxq_bitmap(struct genl_info *info,
+ u32 rxq_bitmap_len,
+ unsigned long *rxq_bitmap)
{
+ const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+ struct nlattr *attr;
+ int rem, err = 0;
+ u32 rxq_idx;
+
+ nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
+ genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ err = nla_parse_nested(tb, maxtype, attr,
+ netdev_queue_id_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
+ NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE))
+ return -EINVAL;
+
+ if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
+ return -EINVAL;
+ }
+
+ rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+ if (rxq_idx >= rxq_bitmap_len) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]);
+ return -EINVAL;
+ }
+
+ bitmap_set(rxq_bitmap, rxq_idx, 1);
+ }
+
+ return 0;
+}
+
+static struct device *
+netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap,
+ struct netlink_ext_ack *extack)
+{
+ struct device *dma_dev = NULL;
+ u32 rxq_idx, prev_rxq_idx;
+
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
+ struct device *rxq_dma_dev;
+
+ rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);
+ if (dma_dev && rxq_dma_dev != dma_dev) {
+ NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)",
+ rxq_idx, prev_rxq_idx);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ dma_dev = rxq_dma_dev;
+ prev_rxq_idx = rxq_idx;
+ }
+
+ return dma_dev;
+}
+
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+{
struct net_devmem_dmabuf_binding *binding;
u32 ifindex, dmabuf_fd, rxq_idx;
struct netdev_nl_sock *priv;
struct net_device *netdev;
+ unsigned long *rxq_bitmap;
+ struct device *dma_dev;
struct sk_buff *rsp;
- struct nlattr *attr;
- int rem, err = 0;
+ int err = 0;
void *hdr;
if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
@@ -921,36 +984,31 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock;
}
- binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
- priv, info->extack);
- if (IS_ERR(binding)) {
- err = PTR_ERR(binding);
+ rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL);
+ if (!rxq_bitmap) {
+ err = -ENOMEM;
goto err_unlock;
}
- nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
- genlmsg_data(info->genlhdr),
- genlmsg_len(info->genlhdr), rem) {
- err = nla_parse_nested(
- tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr,
- netdev_queue_id_nl_policy, info->extack);
- if (err < 0)
- goto err_unbind;
-
- if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
- NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) {
- err = -EINVAL;
- goto err_unbind;
- }
+ err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues,
+ rxq_bitmap);
+ if (err)
+ goto err_rxq_bitmap;
- if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
- NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
- err = -EINVAL;
- goto err_unbind;
- }
+ dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack);
+ if (IS_ERR(dma_dev)) {
+ err = PTR_ERR(dma_dev);
+ goto err_rxq_bitmap;
+ }
- rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+ dmabuf_fd, priv, info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_rxq_bitmap;
+ }
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
info->extack);
if (err)
@@ -964,6 +1022,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_unbind;
+ bitmap_free(rxq_bitmap);
+
netdev_unlock(netdev);
mutex_unlock(&priv->lock);
@@ -972,6 +1032,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
err_unbind:
net_devmem_unbind_dmabuf(binding);
+err_rxq_bitmap:
+ bitmap_free(rxq_bitmap);
err_unlock:
netdev_unlock(netdev);
err_unlock_sock:
@@ -986,6 +1048,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
struct net_devmem_dmabuf_binding *binding;
struct netdev_nl_sock *priv;
struct net_device *netdev;
+ struct device *dma_dev;
u32 ifindex, dmabuf_fd;
struct sk_buff *rsp;
int err = 0;
@@ -1032,8 +1095,9 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_netdev;
}
- binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv,
- info->extack);
+ dma_dev = netdev_queue_get_dma_dev(netdev, 0);
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
+ dmabuf_fd, priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_unlock_netdev;
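
The refactor splits the old single-pass attribute walk in two: netdev_nl_read_rxq_bitmap() first collects the requested queue ids into a bitmap, then netdev_nl_get_dma_dev() verifies that every selected queue resolves to the same DMA device before the dmabuf is bound once and attached to each queue. Distilled, with error handling elided:

    rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL);
    netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues, rxq_bitmap);

    dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack);
    binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
                                     dmabuf_fd, priv, info->extack);
    for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues)
            net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
                                            info->extack);
    bitmap_free(rxq_bitmap);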
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
new file mode 100644
index 000000000000..251f27a8307f
--- /dev/null
+++ b/net/core/netdev_queues.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/netdev_queues.h>
+
+/**
+ * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
+ * @dev: net_device
+ * @idx: queue index
+ *
+ * Get dma device for zero-copy operations to be used for this queue.
+ * When such a device is not available or valid, the function returns NULL.
+ *
+ * Return: Device or NULL on error
+ */
+struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
+{
+ const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
+ struct device *dma_dev;
+
+ if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
+ dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
+ else
+ dma_dev = dev->dev.parent;
+
+ return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
+}
+
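
Callers treat a NULL result as "this queue cannot be DMA-mapped": the devmem binding path above returns -EOPNOTSUPP in that case, and additionally requires all selected queues to agree on one device. A condensed sketch of the contract:

    struct device *dma_dev;

    dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);
    if (!dma_dev)
            return -EOPNOTSUPP;     /* no DMA-capable device */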
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index 3bf1151d8061..c7d9341b7630 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -9,6 +9,15 @@
#include "page_pool_priv.h"
+/* See also page_pool_is_unreadable() */
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
+{
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
+
+ return !!rxq->mp_params.mp_ops;
+}
+EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+
int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
{
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 0ebe5461d4d9..d41b03fd1f63 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -114,6 +114,7 @@
#include <linux/sys.h>
#include <linux/types.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
@@ -2841,8 +2842,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
}
i = 0;
- frag_len = (datalen/frags) < PAGE_SIZE ?
- (datalen/frags) : PAGE_SIZE;
+ frag_len = min_t(int, datalen / frags, PAGE_SIZE);
while (datalen > 0) {
if (unlikely(!pkt_dev->page)) {
int node = numa_node_id();
@@ -2859,8 +2859,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
if (i == (frags - 1))
skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
pkt_dev->page, 0,
- (datalen < PAGE_SIZE ?
- datalen : PAGE_SIZE));
+ min(datalen, PAGE_SIZE));
else
skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
pkt_dev->page, 0, frag_len);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ee0274417948..23b776cd9879 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3112,7 +3112,9 @@ static bool __splice_segment(struct page *page, unsigned int poff,
poff += flen;
plen -= flen;
*len -= flen;
- } while (*len && plen);
+ if (!*len)
+ return true;
+ } while (plen);
return false;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 158bddd23134..1f8ef4d8bcd9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -281,12 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Run time adjustable parameters. */
-__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_wmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_wmem_max);
-__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_rmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_rmem_max);
-__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
-__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
@@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
struct sk_buff_head *list = &sk->sk_receive_queue;
if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
return -ENOBUFS;
}
@@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
skb->dev = NULL;
if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
goto discard_and_relse;
}
@@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
reason = SKB_DROP_REASON_PFMEMALLOC;
if (err == -ENOBUFS)
reason = SKB_DROP_REASON_SOCKET_BACKLOG;
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
goto discard_and_relse;
}
@@ -1032,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
bool charged;
int pages;
- if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
+ if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
return -EOPNOTSUPP;
if (!bytes)
@@ -1041,8 +1041,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
pages = sk_mem_pages(bytes);
/* pre-charge to memcg */
- charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ charged = mem_cgroup_sk_charge(sk, pages,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!charged)
return -ENOMEM;
@@ -1054,7 +1054,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
*/
if (allocated > sk_prot_mem_limits(sk, 1)) {
sk_memory_allocated_sub(sk, pages);
- mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
+ mem_cgroup_sk_uncharge(sk, pages);
return -ENOMEM;
}
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
@@ -2505,15 +2505,18 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
newsk->sk_reserved_mem = 0;
- atomic_set(&newsk->sk_drops, 0);
+ DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
+ sk_drops_reset(newsk);
newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE);
+#ifdef CONFIG_MEMCG
/* sk->sk_memcg will be populated at accept() time */
newsk->sk_memcg = NULL;
+#endif
cgroup_sk_clone(&newsk->sk_cgrp_data);
@@ -2584,7 +2587,7 @@ free:
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
-static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
+static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
{
bool is_ipv6 = false;
u32 max_size;
@@ -2594,8 +2597,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
#endif
/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
- max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) :
- READ_ONCE(dst_dev(dst)->gso_ipv4_max_size);
+ max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
+ READ_ONCE(dev->gso_ipv4_max_size);
if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
max_size = GSO_LEGACY_MAX_SIZE;
@@ -2604,9 +2607,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
+ const struct net_device *dev;
u32 max_segs = 1;
- sk->sk_route_caps = dst_dev(dst)->features;
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ sk->sk_route_caps = dev->features;
if (sk_is_tcp(sk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2622,13 +2628,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
+ sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
- max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1);
+ max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
sk_dst_set(sk, dst);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -3158,23 +3165,27 @@ void __release_sock(struct sock *sk)
__acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb, *next;
+ int nb = 0;
while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
spin_unlock_bh(&sk->sk_lock.slock);
- do {
+ while (1) {
next = skb->next;
prefetch(next);
DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
- cond_resched();
-
skb = next;
- } while (skb != NULL);
+ if (!skb)
+ break;
+
+ if (!(++nb & 15))
+ cond_resched();
+ }
spin_lock_bh(&sk->sk_lock.slock);
}
@@ -3241,16 +3252,16 @@ EXPORT_SYMBOL(sk_wait_data);
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
- struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
+ bool memcg_enabled = false, charged = false;
struct proto *prot = sk->sk_prot;
- bool charged = true;
long allocated;
sk_memory_allocated_add(sk, amt);
allocated = sk_memory_allocated(sk);
- if (memcg) {
- charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
+ if (mem_cgroup_sk_enabled(sk)) {
+ memcg_enabled = true;
+ charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
if (!charged)
goto suppress_allocation;
}
@@ -3324,21 +3335,19 @@ suppress_allocation:
*/
if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
/* Force charge with __GFP_NOFAIL */
- if (memcg && !charged) {
- mem_cgroup_charge_skmem(memcg, amt,
- gfp_memcg_charge() | __GFP_NOFAIL);
- }
+ if (memcg_enabled && !charged)
+ mem_cgroup_sk_charge(sk, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
return 1;
}
}
- if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
- trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
+ trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
sk_memory_allocated_sub(sk, amt);
- if (memcg && charged)
- mem_cgroup_uncharge_skmem(memcg, amt);
+ if (charged)
+ mem_cgroup_sk_uncharge(sk, amt);
return 0;
}
@@ -3376,8 +3385,8 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
sk_memory_allocated_sub(sk, amount);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_uncharge(sk, amount);
if (sk_under_global_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
@@ -3691,7 +3700,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
*/
smp_wmb();
refcount_set(&sk->sk_refcnt, 1);
- atomic_set(&sk->sk_drops, 0);
+ sk_drops_reset(sk);
}
EXPORT_SYMBOL(sock_init_data_uid);
@@ -3951,7 +3960,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
- mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+ mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
}
#ifdef CONFIG_PROC_FS
@@ -4432,7 +4441,10 @@ static int __init sock_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
+#ifdef CONFIG_MEMCG
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
+#endif
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
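
Among the sock.c hunks, __release_sock() is worth a second look: the per-packet cond_resched() becomes one resched opportunity per 16 backlog packets, since !(++nb & 15) is true exactly when the counter reaches a multiple of 16. A micro-sketch of the masking idiom, loop helpers hypothetical:

    int nb = 0;

    while ((skb = next_backlog_skb()) != NULL) {    /* hypothetical */
            process_skb(skb);                       /* hypothetical */
            if (!(++nb & 15))                       /* packets 16, 32, ... */
                    cond_resched();
    }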
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 491334b9b8be..9100e160113a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -663,9 +663,8 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
u32 tsize;
tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
- xdp_update_skb_shared_info(skb, nr_frags,
- sinfo->xdp_frags_size, tsize,
- xdp_buff_is_frag_pfmemalloc(xdp));
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ tsize, xdp_buff_get_skb_flags(xdp));
}
skb->protocol = eth_type_trans(skb, rxq->dev);
@@ -692,7 +691,7 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
struct skb_shared_info *sinfo = skb_shinfo(skb);
const struct skb_shared_info *xinfo;
u32 nr_frags, tsize = 0;
- bool pfmemalloc = false;
+ u32 flags = 0;
xinfo = xdp_get_shared_info_from_buff(xdp);
nr_frags = xinfo->nr_frags;
@@ -714,11 +713,12 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
__skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
tsize += truesize;
- pfmemalloc |= page_is_pfmemalloc(page);
+ if (page_is_pfmemalloc(page))
+ flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
}
- xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
- tsize, pfmemalloc);
+ xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize,
+ flags);
return true;
}
@@ -823,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
skb_metadata_set(skb, xdpf->metasize);
if (unlikely(xdp_frame_has_frags(xdpf)))
- xdp_update_skb_shared_info(skb, nr_frags,
- sinfo->xdp_frags_size,
- nr_frags * xdpf->frame_sz,
- xdp_frame_is_frag_pfmemalloc(xdpf));
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ nr_frags * xdpf->frame_sz,
+ xdp_frame_get_skb_flags(xdpf));
/* Essential SKB info: protocol and skb->dev */
skb->protocol = eth_type_trans(skb, dev);
diff --git a/net/devlink/health.c b/net/devlink/health.c
index b3ce8ecbb7fb..136a67c36a20 100644
--- a/net/devlink/health.c
+++ b/net/devlink/health.c
@@ -60,6 +60,7 @@ struct devlink_health_reporter {
struct devlink_port *devlink_port;
struct devlink_fmsg *dump_fmsg;
u64 graceful_period;
+ u64 burst_period;
bool auto_recover;
bool auto_dump;
u8 health_state;
@@ -108,11 +109,14 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port,
static struct devlink_health_reporter *
__devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
- if (WARN_ON(graceful_period && !ops->recover))
+ if (WARN_ON(ops->default_graceful_period && !ops->recover))
+ return ERR_PTR(-EINVAL);
+
+ if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period))
return ERR_PTR(-EINVAL);
reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
@@ -122,7 +126,8 @@ __devlink_health_reporter_create(struct devlink *devlink,
reporter->priv = priv;
reporter->ops = ops;
reporter->devlink = devlink;
- reporter->graceful_period = graceful_period;
+ reporter->graceful_period = ops->default_graceful_period;
+ reporter->burst_period = ops->default_burst_period;
reporter->auto_recover = !!ops->recover;
reporter->auto_dump = !!ops->dump;
return reporter;
@@ -134,13 +139,12 @@ __devlink_health_reporter_create(struct devlink *devlink,
*
* @port: devlink_port to which health reports will relate
* @ops: devlink health reporter ops
- * @graceful_period: min time (in msec) between recovery attempts
* @priv: driver priv pointer
*/
struct devlink_health_reporter *
devl_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
@@ -150,8 +154,7 @@ devl_port_health_reporter_create(struct devlink_port *port,
ops->name))
return ERR_PTR(-EEXIST);
- reporter = __devlink_health_reporter_create(port->devlink, ops,
- graceful_period, priv);
+ reporter = __devlink_health_reporter_create(port->devlink, ops, priv);
if (IS_ERR(reporter))
return reporter;
@@ -164,14 +167,13 @@ EXPORT_SYMBOL_GPL(devl_port_health_reporter_create);
struct devlink_health_reporter *
devlink_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
struct devlink *devlink = port->devlink;
devl_lock(devlink);
- reporter = devl_port_health_reporter_create(port, ops,
- graceful_period, priv);
+ reporter = devl_port_health_reporter_create(port, ops, priv);
devl_unlock(devlink);
return reporter;
}
@@ -182,13 +184,12 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create);
*
* @devlink: devlink instance which the health reports will relate
* @ops: devlink health reporter ops
- * @graceful_period: min time (in msec) between recovery attempts
* @priv: driver priv pointer
*/
struct devlink_health_reporter *
devl_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
@@ -197,8 +198,7 @@ devl_health_reporter_create(struct devlink *devlink,
if (devlink_health_reporter_find_by_name(devlink, ops->name))
return ERR_PTR(-EEXIST);
- reporter = __devlink_health_reporter_create(devlink, ops,
- graceful_period, priv);
+ reporter = __devlink_health_reporter_create(devlink, ops, priv);
if (IS_ERR(reporter))
return reporter;
@@ -210,13 +210,12 @@ EXPORT_SYMBOL_GPL(devl_health_reporter_create);
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
devl_lock(devlink);
- reporter = devl_health_reporter_create(devlink, ops,
- graceful_period, priv);
+ reporter = devl_health_reporter_create(devlink, ops, priv);
devl_unlock(devlink);
return reporter;
}
@@ -298,6 +297,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
reporter->graceful_period))
goto reporter_nest_cancel;
if (reporter->ops->recover &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+ reporter->burst_period))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
reporter->auto_recover))
goto reporter_nest_cancel;
@@ -462,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
if (!reporter->ops->recover &&
(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
- info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] ||
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]))
return -EOPNOTSUPP;
if (!reporter->ops->dump &&
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
return -EOPNOTSUPP;
- if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) {
reporter->graceful_period =
nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
+ if (!reporter->graceful_period)
+ reporter->burst_period = 0;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) {
+ u64 burst_period =
+ nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]);
+
+ if (!reporter->graceful_period && burst_period) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Cannot set burst period without a grace period.");
+ return -EINVAL;
+ }
+
+ reporter->burst_period = burst_period;
+ }
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
reporter->auto_recover =
@@ -514,11 +534,25 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter,
devlink_nl_notify_send_desc(devlink, msg, &desc);
}
+static bool
+devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter)
+{
+ unsigned long burst_threshold = reporter->last_recovery_ts +
+ msecs_to_jiffies(reporter->burst_period);
+
+ return time_is_after_jiffies(burst_threshold);
+}
+
void
devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter)
{
reporter->recovery_count++;
- reporter->last_recovery_ts = jiffies;
+ if (!devlink_health_reporter_in_burst(reporter))
+ /* When burst period is set, last_recovery_ts marks the first
+ * recovery within the burst period, not necessarily the last
+ * one.
+ */
+ reporter->last_recovery_ts = jiffies;
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done);
@@ -592,12 +626,37 @@ dump_err:
return err;
}
+static bool
+devlink_health_recover_abort(struct devlink_health_reporter *reporter,
+ enum devlink_health_reporter_state prev_state)
+{
+ unsigned long recover_ts_threshold;
+
+ if (!reporter->auto_recover)
+ return false;
+
+ /* abort if the previous error wasn't recovered */
+ if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
+ return true;
+
+ if (devlink_health_reporter_in_burst(reporter))
+ return false;
+
+ recover_ts_threshold = reporter->last_recovery_ts +
+ msecs_to_jiffies(reporter->burst_period) +
+ msecs_to_jiffies(reporter->graceful_period);
+ if (reporter->last_recovery_ts && reporter->recovery_count &&
+ time_is_after_jiffies(recover_ts_threshold))
+ return true;
+
+ return false;
+}
+
int devlink_health_report(struct devlink_health_reporter *reporter,
const char *msg, void *priv_ctx)
{
enum devlink_health_reporter_state prev_health_state;
struct devlink *devlink = reporter->devlink;
- unsigned long recover_ts_threshold;
int ret;
/* write a log message of the current error */
@@ -608,13 +667,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
- /* abort if the previous error wasn't recovered */
- recover_ts_threshold = reporter->last_recovery_ts +
- msecs_to_jiffies(reporter->graceful_period);
- if (reporter->auto_recover &&
- (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
- (reporter->last_recovery_ts && reporter->recovery_count &&
- time_is_after_jiffies(recover_ts_threshold)))) {
+ if (devlink_health_recover_abort(reporter, prev_health_state)) {
trace_devlink_health_recover_aborted(devlink,
reporter->ops->name,
reporter->health_state,
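
Putting the two new helpers together: recoveries landing within burst_period of last_recovery_ts count as one burst and do not advance the timestamp, while auto-recovery is aborted only for errors arriving after the burst window but before burst_period + graceful_period has elapsed. A worked timeline, assuming graceful_period = 1000 ms and burst_period = 200 ms:

    /* t0            recovery completes; last_recovery_ts = t0
     * t0 + 150 ms   error: inside burst window -> recover again,
     *               last_recovery_ts stays at t0
     * t0 + 500 ms   error: past burst, before t0 + 200 + 1000 ms
     *               -> devlink_health_recover_abort() returns true
     * t0 + 1300 ms  error: threshold passed -> recovery proceeds
     */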
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index d97c326a9045..9fd00977d59e 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN
};
/* DEVLINK_CMD_HEALTH_REPORTER_SET - do */
-static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = {
+static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
@@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT
[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, },
};
/* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */
@@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_health_reporter_set_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_health_reporter_set_nl_policy,
- .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
diff --git a/net/devlink/param.c b/net/devlink/param.c
index 41dcc86cfd94..33134940c266 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -102,6 +102,11 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME,
.type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS,
+ .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
diff --git a/net/devlink/port.c b/net/devlink/port.c
index cb8d4df61619..93d8a25bb920 100644
--- a/net/devlink/port.c
+++ b/net/devlink/port.c
@@ -1333,8 +1333,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb,
return NOTIFY_OK;
}
-static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
- enum devlink_port_flavour flavour)
+static void __devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
@@ -1347,7 +1347,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
} else {
devlink_port->switch_port = false;
}
- return 0;
}
/**
@@ -1357,17 +1356,13 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
* @attrs: devlink port attrs
*/
void devlink_port_attrs_set(struct devlink_port *devlink_port,
- struct devlink_port_attrs *attrs)
+ const struct devlink_port_attrs *attrs)
{
- int ret;
-
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+ WARN_ON(attrs->splittable && attrs->split);
devlink_port->attrs = *attrs;
- ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
- if (ret)
- return;
- WARN_ON(attrs->splittable && attrs->split);
+ __devlink_port_attrs_set(devlink_port, attrs->flavour);
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
@@ -1383,14 +1378,10 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro
u16 pf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
- ret = __devlink_port_attrs_set(devlink_port,
- DEVLINK_PORT_FLAVOUR_PCI_PF);
- if (ret)
- return;
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF);
attrs->pci_pf.controller = controller;
attrs->pci_pf.pf = pf;
attrs->pci_pf.external = external;
@@ -1411,14 +1402,10 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
u16 pf, u16 vf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
- ret = __devlink_port_attrs_set(devlink_port,
- DEVLINK_PORT_FLAVOUR_PCI_VF);
- if (ret)
- return;
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF);
attrs->pci_vf.controller = controller;
attrs->pci_vf.pf = pf;
attrs->pci_vf.vf = vf;
@@ -1439,14 +1426,10 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
u16 pf, u32 sf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
- ret = __devlink_port_attrs_set(devlink_port,
- DEVLINK_PORT_FLAVOUR_PCI_SF);
- if (ret)
- return;
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF);
attrs->pci_sf.controller = controller;
attrs->pci_sf.pf = pf;
attrs->pci_sf.sf = sf;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 43a7854e784e..0b2a4d0573b3 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1014,6 +1014,28 @@ static bool flow_type_hashable(u32 flow_type)
return false;
}
+static bool flow_type_v6(u32 flow_type)
+{
+ switch (flow_type) {
+ case TCP_V6_FLOW:
+ case UDP_V6_FLOW:
+ case SCTP_V6_FLOW:
+ case AH_ESP_V6_FLOW:
+ case AH_V6_FLOW:
+ case ESP_V6_FLOW:
+ case IPV6_FLOW:
+ case GTPU_V6_FLOW:
+ case GTPC_V6_FLOW:
+ case GTPC_TEID_V6_FLOW:
+ case GTPU_EH_V6_FLOW:
+ case GTPU_UL_V6_FLOW:
+ case GTPU_DL_V6_FLOW:
+ return true;
+ }
+
+ return false;
+}
+
/* When adding a new type, update the assert and, if it's hashable, add it to
* the flow_type_hashable switch case.
*/
@@ -1077,6 +1099,9 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
if (rc)
return rc;
+ if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type))
+ return -EINVAL;
+
if (info.flow_type & FLOW_RSS && info.rss_context &&
!ops->rxfh_per_ctx_fields)
return -EINVAL;
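
RXH_IP6_FL asks the device to fold the IPv6 flow label into the RSS hash, which is meaningless for IPv4 flow types, hence the flow_type_v6() gate. A worked example of the ioctl-level outcome, field values hypothetical:

    /* ETHTOOL_SRXFH, flow_type = TCP_V6_FLOW,
     *     data = RXH_IP_SRC | RXH_IP_DST | RXH_IP6_FL  -> accepted
     * ETHTOOL_SRXFH, flow_type = TCP_V4_FLOW,
     *     data = RXH_IP_SRC | RXH_IP6_FL               -> -EINVAL
     */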
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index 992e98abe9dd..202d95e8bf3e 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -536,35 +536,36 @@ void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context)
#define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \
RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \
RXH_GTP_TEID | RXH_DISCARD)
+#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL)
static const struct nla_policy ethnl_rss_flows_policy[] = {
[ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
};
const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = {
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 102eccf5ead7..8177ac6c2d26 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -143,6 +143,7 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
struct netlink_ext_ack *extack)
{
+ struct netdev_lag_upper_info lag_upper_info;
struct net_device *hsr_dev;
struct hsr_port *master;
int res;
@@ -159,7 +160,9 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
hsr_dev = master->dev;
- res = netdev_upper_dev_link(dev, hsr_dev, extack);
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST;
+ lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN;
+ res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack);
if (res)
goto fail_upper_dev_link;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 5cfc1c939673..833f2cf97178 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -170,7 +170,7 @@ struct neigh_table arp_tbl = {
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 740af8541d2f..709021197e1c 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1715,8 +1715,7 @@ validate_return:
*/
void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
- unsigned char optbuf[sizeof(struct ip_options) + 40];
- struct ip_options *opt = (struct ip_options *)optbuf;
+ struct inet_skb_parm parm;
int res;
if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
@@ -1727,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
* so we can not use icmp_send and IPCB here.
*/
- memset(opt, 0, sizeof(struct ip_options));
- opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
+ memset(&parm, 0, sizeof(parm));
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
- res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL);
+ res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL);
rcu_read_unlock();
if (res)
return;
if (gateway)
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm);
else
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm);
}
/**
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f14a41ee4aa1..2c922afadb8f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -132,8 +132,8 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
dport = encap->encap_dport;
spin_unlock_bh(&x->lock);
- sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4,
- dport, x->props.saddr.a4, sport, 0);
+ sk = inet_lookup_established(net, x->id.daddr.a4, dport,
+ x->props.saddr.a4, sport, 0);
if (!sk)
return ERR_PTR(-ENOENT);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6e1b94796f67..1dab44e13d3b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -32,6 +32,7 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -293,7 +294,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))),
+ .flowi4_dscp = ip4h_dscp(ip_hdr(skb)),
.flowi4_scope = scope,
.flowi4_mark = vmark ? skb->mark : 0,
};
@@ -358,7 +359,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_flags = 0;
@@ -1372,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
struct flowi4 fl4 = {
.flowi4_mark = frn->fl_mark,
.daddr = frn->fl_addr,
- .flowi4_tos = frn->fl_tos & INET_DSCP_MASK,
+ .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos),
.flowi4_scope = frn->fl_scope,
};
struct fib_table *tb;
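
All three hunks in this file apply the same conversion: struct flowi4 now carries a typed dscp_t in flowi4_dscp instead of a raw dsfield byte in flowi4_tos. A minimal sketch of the two directions, using the helpers seen in this series:

    struct flowi4 fl4 = {};

    /* from a received packet: take the dscp_t straight off the header */
    fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb));

    /* from a legacy u8 tos/dsfield value: convert, dropping the ECN bits */
    fl4.flowi4_dscp = inet_dsfield_to_dscp(tos);
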
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index fa58d6620ed6..51f0193092f0 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -23,6 +23,7 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -193,8 +194,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
* to mask the upper three DSCP bits prior to matching to maintain
* legacy behavior.
*/
- if (r->dscp_full &&
- (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask)
+ if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask)
return 0;
else if (!r->dscp_full && r->dscp &&
!fib_dscp_masked_match(r->dscp, fl4))
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
index 3d9614609b2d..506260b4a4dc 100644
--- a/net/ipv4/fou_nl.c
+++ b/net/ipv4/fou_nl.c
@@ -18,9 +18,9 @@ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = {
[FOU_ATTR_TYPE] = { .type = NLA_U8, },
[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
[FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, },
- [FOU_ATTR_LOCAL_V6] = { .len = 16, },
+ [FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16),
[FOU_ATTR_PEER_V4] = { .type = NLA_U32, },
- [FOU_ATTR_PEER_V6] = { .len = 16, },
+ [FOU_ATTR_PEER_V6] = NLA_POLICY_EXACT_LEN(16),
[FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, },
[FOU_ATTR_IFINDEX] = { .type = NLA_S32, },
};
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c48c572f024d..1b7fb5d935ed 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -72,6 +72,7 @@
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -318,17 +319,17 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
return true;
/* No rate limit on loopback */
- dev = dst_dev(dst);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
if (dev && (dev->flags & IFF_LOOPBACK))
goto out;
- rcu_read_lock();
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
l3mdev_master_ifindex_rcu(dev));
rc = inet_peer_xrlim_allow(peer,
READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
- rcu_read_unlock();
out:
+ rcu_read_unlock();
if (!rc)
__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
else
@@ -444,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = sock_net_uid(net, NULL);
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb)));
+ fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb));
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
@@ -495,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
fl4->flowi4_uid = sock_net_uid(net, NULL);
- fl4->flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4->flowi4_dscp = dscp;
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
@@ -544,14 +545,15 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
goto relookup_failed;
}
/* Ugh! */
- orefdst = skb_in->_skb_refdst; /* save old refdst */
- skb_dst_set(skb_in, NULL);
+ orefdst = skb_dstref_steal(skb_in);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
dscp, rt2->dst.dev) ? -EINVAL : 0;
dst_release(&rt2->dst);
rt2 = skb_rtable(skb_in);
- skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ /* steal dst entry from skb_in, don't drop refcnt */
+ skb_dstref_steal(skb_in);
+ skb_dstref_restore(skb_in, orefdst);
}
if (err)
@@ -592,7 +594,7 @@ relookup_failed:
*/
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
- const struct ip_options *opt)
+ const struct inet_skb_parm *parm)
{
struct iphdr *iph;
int room;
@@ -708,7 +710,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
rcu_read_lock();
if (rt_is_input_route(rt) &&
READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
- dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
+ dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif :
+ inet_iif(skb_in));
if (dev)
saddr = inet_select_addr(dev, iph->saddr,
@@ -723,7 +726,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
+ if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
+ &parm->opt))
goto out_unlock;
@@ -797,15 +801,16 @@ EXPORT_SYMBOL(__icmp_send);
void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct sk_buff *cloned_skb = NULL;
- struct ip_options opts = { 0 };
enum ip_conntrack_info ctinfo;
enum ip_conntrack_dir dir;
+ struct inet_skb_parm parm;
struct nf_conn *ct;
__be32 orig_ip;
+ memset(&parm, 0, sizeof(parm));
ct = nf_ct_get(skb_in, &ctinfo);
if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
- __icmp_send(skb_in, type, code, info, &opts);
+ __icmp_send(skb_in, type, code, info, &parm);
return;
}
@@ -821,7 +826,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
orig_ip = ip_hdr(skb_in)->saddr;
dir = CTINFO2DIR(ctinfo);
ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip;
- __icmp_send(skb_in, type, code, info, &opts);
+ __icmp_send(skb_in, type, code, info, &parm);
ip_hdr(skb_in)->saddr = orig_ip;
out:
consume_skb(cloned_skb);
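
Both call sites above illustrate the new __icmp_send() contract: it takes a zeroed struct inet_skb_parm instead of bare IP options, so a caller can hand over compiled options in parm.opt and, when it knows one, an input interface in parm.iif (used by the inbound-ifaddr source selection hunk above). A condensed caller sketch, assembled from the cipso_ipv4.c and icmp.c hunks; the parm.iif line is an optional step, not taken by either caller here:

    struct inet_skb_parm parm;
    int res;

    memset(&parm, 0, sizeof(parm));
    parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
    rcu_read_lock();
    res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL);
    rcu_read_unlock();
    if (res)
            return;
    parm.iif = skb->skb_iif;        /* assumption: ingress ifindex wanted */
    __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm);
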
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1e2df51427fe..142ff8d86fc2 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -706,9 +706,9 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
spin_unlock_bh(&queue->fastopenq.lock);
}
-out:
release_sock(sk);
- if (newsk && mem_cgroup_sockets_enabled) {
+
+ if (mem_cgroup_sockets_enabled) {
gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
int amt = 0;
@@ -718,7 +718,7 @@ out:
lock_sock(newsk);
mem_cgroup_sk_alloc(newsk);
- if (newsk->sk_memcg) {
+ if (mem_cgroup_from_sk(newsk)) {
/* The socket has not been accepted yet, no need
* to look at newsk->sk_wmem_queued.
*/
@@ -727,23 +727,22 @@ out:
}
if (amt)
- mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp);
+ mem_cgroup_sk_charge(newsk, amt, gfp);
kmem_cache_charge(newsk, gfp);
release_sock(newsk);
}
+
if (req)
reqsk_put(req);
- if (newsk)
- inet_init_csk_locks(newsk);
-
+ inet_init_csk_locks(newsk);
return newsk;
+
out_err:
- newsk = NULL;
- req = NULL;
+ release_sock(sk);
arg->err = error;
- goto out;
+ return NULL;
}
EXPORT_SYMBOL(inet_csk_accept);
@@ -1297,12 +1296,19 @@ void inet_csk_destroy_sock(struct sock *sk)
xfrm_sk_free_policy(sk);
- this_cpu_dec(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_dec();
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
+void inet_csk_prepare_for_destroy_sock(struct sock *sk)
+{
+ /* The below has to be done to allow calling inet_csk_destroy_sock */
+ sock_set_flag(sk, SOCK_DEAD);
+ tcp_orphan_count_inc();
+}
+
/* This function allows to force a closure of a socket after the call to
* tcp_create_openreq_child().
*/
@@ -1370,7 +1376,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
sock_orphan(child);
- this_cpu_inc(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
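
inet_csk_prepare_for_destroy_sock() packages the two pieces of state that inet_csk_destroy_sock() expects to find: the SOCK_DEAD flag and a raised orphan count. A sketch of the intended pairing, for a socket that will never be exposed to userspace:

    /* sketch: abort a child socket before it is ever accepted */
    inet_csk_prepare_for_destroy_sock(newsk);
    inet_csk_destroy_sock(newsk);
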
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 2fa53b16fe77..f0b6c5a411a2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -20,9 +20,6 @@
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_connection_sock.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_timewait_sock.h>
-#include <net/inet6_hashtables.h>
#include <net/bpf_sk_storage.h>
#include <net/netlink.h>
@@ -74,54 +71,29 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
{
- r->idiag_family = sk->sk_family;
+ r->idiag_family = READ_ONCE(sk->sk_family);
- r->id.idiag_sport = htons(sk->sk_num);
- r->id.idiag_dport = sk->sk_dport;
- r->id.idiag_if = sk->sk_bound_dev_if;
+ r->id.idiag_sport = htons(READ_ONCE(sk->sk_num));
+ r->id.idiag_dport = READ_ONCE(sk->sk_dport);
+ r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if);
sock_diag_save_cookie(sk, r->id.idiag_cookie);
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ if (r->idiag_family == AF_INET6) {
+ data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr);
+ data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr);
} else
#endif
{
memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
- r->id.idiag_src[0] = sk->sk_rcv_saddr;
- r->id.idiag_dst[0] = sk->sk_daddr;
+ r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr);
+ r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr);
}
}
EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
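
inet_diag_msg_common_fill() may now run against sockets the dumper does not lock, so the scalar fields are read with READ_ONCE() and the 16-byte IPv6 address copies, which cannot be single-copy atomic, are wrapped in data_race() to mark the potential tear as accepted. The general pattern, as a sketch (assuming the writer side uses WRITE_ONCE() for the scalars):

    /* writer, under the socket lock */
    WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

    /* lockless diag reader */
    r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if);

    /* multi-word copy: a torn value is tolerable in diag output */
    data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr);
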
-static size_t inet_sk_attr_size(struct sock *sk,
- const struct inet_diag_req_v2 *req,
- bool net_admin)
-{
- const struct inet_diag_handler *handler;
- size_t aux = 0;
-
- rcu_read_lock();
- handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]);
- DEBUG_NET_WARN_ON_ONCE(!handler);
- if (handler && handler->idiag_get_aux_size)
- aux = handler->idiag_get_aux_size(sk, net_admin);
- rcu_read_unlock();
-
- return nla_total_size(sizeof(struct tcp_info))
- + nla_total_size(sizeof(struct inet_diag_msg))
- + inet_diag_msg_attrs_size()
- + nla_total_size(sizeof(struct inet_diag_meminfo))
- + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
- + nla_total_size(TCP_CA_NAME_MAX)
- + nla_total_size(sizeof(struct tcpvegas_info))
- + aux
- + 64;
-}
-
int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_msg *r, int ext,
struct user_namespace *user_ns,
@@ -313,17 +285,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
- r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
r->idiag_expires =
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
- r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
r->idiag_timer = 2;
- r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
}
@@ -422,183 +394,6 @@ errout:
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
-static int inet_twsk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb,
- u16 nlmsg_flags, bool net_admin)
-{
- struct inet_timewait_sock *tw = inet_twsk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
- sizeof(*r), nlmsg_flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- BUG_ON(tw->tw_state != TCP_TIME_WAIT);
-
- inet_diag_msg_common_fill(r, sk);
- r->idiag_retrans = 0;
-
- r->idiag_state = READ_ONCE(tw->tw_substate);
- r->idiag_timer = 3;
- tmo = tw->tw_timer.expires - jiffies;
- r->idiag_expires = jiffies_delta_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- tw->tw_mark)) {
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
- }
-
- nlmsg_end(skb, nlh);
- return 0;
-}
-
-static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
- struct netlink_callback *cb,
- u16 nlmsg_flags, bool net_admin)
-{
- struct request_sock *reqsk = inet_reqsk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
- cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- inet_diag_msg_common_fill(r, sk);
- r->idiag_state = TCP_SYN_RECV;
- r->idiag_timer = 1;
- r->idiag_retrans = reqsk->num_retrans;
-
- BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
- offsetof(struct sock, sk_cookie));
-
- tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
- r->idiag_expires = jiffies_delta_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- inet_rsk(reqsk)->ir_mark)) {
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
- }
-
- nlmsg_end(skb, nlh);
- return 0;
-}
-
-static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- u16 nlmsg_flags, bool net_admin)
-{
- if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
-
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
-
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
- net_admin);
-}
-
-struct sock *inet_diag_find_one_icsk(struct net *net,
- struct inet_hashinfo *hashinfo,
- const struct inet_diag_req_v2 *req)
-{
- struct sock *sk;
-
- rcu_read_lock();
- if (req->sdiag_family == AF_INET)
- sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
- req->id.idiag_dport, req->id.idiag_src[0],
- req->id.idiag_sport, req->id.idiag_if);
-#if IS_ENABLED(CONFIG_IPV6)
- else if (req->sdiag_family == AF_INET6) {
- if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
- ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
- sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
- req->id.idiag_dport, req->id.idiag_src[3],
- req->id.idiag_sport, req->id.idiag_if);
- else
- sk = inet6_lookup(net, hashinfo, NULL, 0,
- (struct in6_addr *)req->id.idiag_dst,
- req->id.idiag_dport,
- (struct in6_addr *)req->id.idiag_src,
- req->id.idiag_sport,
- req->id.idiag_if);
- }
-#endif
- else {
- rcu_read_unlock();
- return ERR_PTR(-EINVAL);
- }
- rcu_read_unlock();
- if (!sk)
- return ERR_PTR(-ENOENT);
-
- if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
- sock_gen_put(sk);
- return ERR_PTR(-ENOENT);
- }
-
- return sk;
-}
-EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
-
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *req)
-{
- struct sk_buff *in_skb = cb->skb;
- bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
- struct net *net = sock_net(in_skb->sk);
- struct sk_buff *rep;
- struct sock *sk;
- int err;
-
- sk = inet_diag_find_one_icsk(net, hashinfo, req);
- if (IS_ERR(sk))
- return PTR_ERR(sk);
-
- rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL);
- if (!rep) {
- err = -ENOMEM;
- goto out;
- }
-
- err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
- if (err < 0) {
- WARN_ON(err == -EMSGSIZE);
- nlmsg_free(rep);
- goto out;
- }
- err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
-
-out:
- if (sk)
- sock_gen_put(sk);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
-
static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
int hdrlen,
@@ -785,7 +580,7 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
+ if (entry->family == AF_INET6) {
entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
entry->daddr = sk->sk_v6_daddr.s6_addr32;
} else
@@ -796,31 +591,36 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
}
}
-int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct nlattr *bc = cb_data->inet_diag_nla_bc;
+ const struct inet_sock *inet = inet_sk(sk);
struct inet_diag_entry entry;
if (!bc)
return 1;
- entry.family = sk->sk_family;
+ entry.family = READ_ONCE(sk->sk_family);
entry_fill_addrs(&entry, sk);
- entry.sport = inet->inet_num;
- entry.dport = ntohs(inet->inet_dport);
- entry.ifindex = sk->sk_bound_dev_if;
- entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
- if (sk_fullsock(sk))
- entry.mark = READ_ONCE(sk->sk_mark);
- else if (sk->sk_state == TCP_NEW_SYN_RECV)
- entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
- else if (sk->sk_state == TCP_TIME_WAIT)
- entry.mark = inet_twsk(sk)->tw_mark;
- else
- entry.mark = 0;
+ entry.sport = READ_ONCE(inet->inet_num);
+ entry.dport = ntohs(READ_ONCE(inet->inet_dport));
+ entry.ifindex = READ_ONCE(sk->sk_bound_dev_if);
+ if (cb_data->userlocks_needed)
+ entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0;
+ if (cb_data->mark_needed) {
+ if (sk_fullsock(sk))
+ entry.mark = READ_ONCE(sk->sk_mark);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+ else if (sk->sk_state == TCP_TIME_WAIT)
+ entry.mark = inet_twsk(sk)->tw_mark;
+ else
+ entry.mark = 0;
+ }
#ifdef CONFIG_SOCK_CGROUP_DATA
- entry.cgroup_id = sk_fullsock(sk) ?
- cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
+ if (cb_data->cgroup_needed)
+ entry.cgroup_id = sk_fullsock(sk) ?
+ cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
#endif
return inet_diag_bc_run(bc, &entry);
@@ -920,16 +720,21 @@ static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len,
}
#endif
-static int inet_diag_bc_audit(const struct nlattr *attr,
+static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data,
const struct sk_buff *skb)
{
- bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+ const struct nlattr *attr = cb_data->inet_diag_nla_bc;
const void *bytecode, *bc;
int bytecode_len, len;
+ bool net_admin;
+
+ if (!attr)
+ return 0;
- if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+ if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
return -EINVAL;
+ net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
bytecode = bc = nla_data(attr);
len = bytecode_len = nla_len(attr);
@@ -961,14 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return -EPERM;
if (!valid_markcond(bc, len, &min_len))
return -EINVAL;
+ cb_data->mark_needed = true;
break;
#ifdef CONFIG_SOCK_CGROUP_DATA
case INET_DIAG_BC_CGROUP_COND:
if (!valid_cgroupcond(bc, len, &min_len))
return -EINVAL;
+ cb_data->cgroup_needed = true;
break;
#endif
case INET_DIAG_BC_AUTO:
+ cb_data->userlocks_needed = true;
+ fallthrough;
case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
break;
@@ -992,280 +801,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return len == 0 ? 0 : -EINVAL;
}
-static void twsk_build_assert(void)
-{
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
- offsetof(struct sock, sk_family));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
- offsetof(struct inet_sock, inet_num));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
- offsetof(struct inet_sock, inet_dport));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
- offsetof(struct inet_sock, inet_rcv_saddr));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
- offsetof(struct inet_sock, inet_daddr));
-
-#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
- offsetof(struct sock, sk_v6_rcv_saddr));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
- offsetof(struct sock, sk_v6_daddr));
-#endif
-}
-
-void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r)
-{
- bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
- struct inet_diag_dump_data *cb_data = cb->data;
- struct net *net = sock_net(skb->sk);
- u32 idiag_states = r->idiag_states;
- int i, num, s_i, s_num;
- struct nlattr *bc;
- struct sock *sk;
-
- bc = cb_data->inet_diag_nla_bc;
- if (idiag_states & TCPF_SYN_RECV)
- idiag_states |= TCPF_NEW_SYN_RECV;
- s_i = cb->args[1];
- s_num = num = cb->args[2];
-
- if (cb->args[0] == 0) {
- if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
- goto skip_listen_ht;
-
- for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
- struct inet_listen_hashbucket *ilb;
- struct hlist_nulls_node *node;
-
- num = 0;
- ilb = &hashinfo->lhash2[i];
-
- if (hlist_nulls_empty(&ilb->nulls_head)) {
- s_num = 0;
- continue;
- }
- spin_lock(&ilb->lock);
- sk_nulls_for_each(sk, node, &ilb->nulls_head) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (!net_eq(sock_net(sk), net))
- continue;
-
- if (num < s_num) {
- num++;
- continue;
- }
-
- if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
- goto next_listen;
-
- if (r->id.idiag_sport != inet->inet_sport &&
- r->id.idiag_sport)
- goto next_listen;
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_listen;
-
- if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
- cb, r, NLM_F_MULTI,
- net_admin) < 0) {
- spin_unlock(&ilb->lock);
- goto done;
- }
-
-next_listen:
- ++num;
- }
- spin_unlock(&ilb->lock);
-
- s_num = 0;
- }
-skip_listen_ht:
- cb->args[0] = 1;
- s_i = num = s_num = 0;
- }
-
-/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
- * with bh disabled.
- */
-#define SKARR_SZ 16
-
- /* Dump bound but inactive (not listening, connecting, etc.) sockets */
- if (cb->args[0] == 1) {
- if (!(idiag_states & TCPF_BOUND_INACTIVE))
- goto skip_bind_ht;
-
- for (i = s_i; i < hashinfo->bhash_size; i++) {
- struct inet_bind_hashbucket *ibb;
- struct inet_bind2_bucket *tb2;
- struct sock *sk_arr[SKARR_SZ];
- int num_arr[SKARR_SZ];
- int idx, accum, res;
-
-resume_bind_walk:
- num = 0;
- accum = 0;
- ibb = &hashinfo->bhash2[i];
-
- if (hlist_empty(&ibb->chain)) {
- s_num = 0;
- continue;
- }
- spin_lock_bh(&ibb->lock);
- inet_bind_bucket_for_each(tb2, &ibb->chain) {
- if (!net_eq(ib2_net(tb2), net))
- continue;
-
- sk_for_each_bound(sk, &tb2->owners) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_bind;
-
- if (sk->sk_state != TCP_CLOSE ||
- !inet->inet_num)
- goto next_bind;
-
- if (r->sdiag_family != AF_UNSPEC &&
- r->sdiag_family != sk->sk_family)
- goto next_bind;
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_bind;
-
- sock_hold(sk);
- num_arr[accum] = num;
- sk_arr[accum] = sk;
- if (++accum == SKARR_SZ)
- goto pause_bind_walk;
-next_bind:
- num++;
- }
- }
-pause_bind_walk:
- spin_unlock_bh(&ibb->lock);
-
- res = 0;
- for (idx = 0; idx < accum; idx++) {
- if (res >= 0) {
- res = inet_sk_diag_fill(sk_arr[idx],
- NULL, skb, cb,
- r, NLM_F_MULTI,
- net_admin);
- if (res < 0)
- num = num_arr[idx];
- }
- sock_put(sk_arr[idx]);
- }
- if (res < 0)
- goto done;
-
- cond_resched();
-
- if (accum == SKARR_SZ) {
- s_num = num + 1;
- goto resume_bind_walk;
- }
-
- s_num = 0;
- }
-skip_bind_ht:
- cb->args[0] = 2;
- s_i = num = s_num = 0;
- }
-
- if (!(idiag_states & ~TCPF_LISTEN))
- goto out;
-
- for (i = s_i; i <= hashinfo->ehash_mask; i++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[i];
- spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
- struct hlist_nulls_node *node;
- struct sock *sk_arr[SKARR_SZ];
- int num_arr[SKARR_SZ];
- int idx, accum, res;
-
- if (hlist_nulls_empty(&head->chain))
- continue;
-
- if (i > s_i)
- s_num = 0;
-
-next_chunk:
- num = 0;
- accum = 0;
- spin_lock_bh(lock);
- sk_nulls_for_each(sk, node, &head->chain) {
- int state;
-
- if (!net_eq(sock_net(sk), net))
- continue;
- if (num < s_num)
- goto next_normal;
- state = (sk->sk_state == TCP_TIME_WAIT) ?
- READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
- if (!(idiag_states & (1 << state)))
- goto next_normal;
- if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
- goto next_normal;
- if (r->id.idiag_sport != htons(sk->sk_num) &&
- r->id.idiag_sport)
- goto next_normal;
- if (r->id.idiag_dport != sk->sk_dport &&
- r->id.idiag_dport)
- goto next_normal;
- twsk_build_assert();
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_normal;
-
- if (!refcount_inc_not_zero(&sk->sk_refcnt))
- goto next_normal;
-
- num_arr[accum] = num;
- sk_arr[accum] = sk;
- if (++accum == SKARR_SZ)
- break;
-next_normal:
- ++num;
- }
- spin_unlock_bh(lock);
- res = 0;
- for (idx = 0; idx < accum; idx++) {
- if (res >= 0) {
- res = sk_diag_fill(sk_arr[idx], skb, cb, r,
- NLM_F_MULTI, net_admin);
- if (res < 0)
- num = num_arr[idx];
- }
- sock_gen_put(sk_arr[idx]);
- }
- if (res < 0)
- break;
- cond_resched();
- if (accum == SKARR_SZ) {
- s_num = num + 1;
- goto next_chunk;
- }
- }
-
-done:
- cb->args[1] = i;
- cb->args[2] = num;
-out:
- ;
-}
-EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
-
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
@@ -1319,13 +854,10 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
kfree(cb_data);
return err;
}
- nla = cb_data->inet_diag_nla_bc;
- if (nla) {
- err = inet_diag_bc_audit(nla, skb);
- if (err) {
- kfree(cb_data);
- return err;
- }
+ err = inet_diag_bc_audit(cb_data, skb);
+ if (err) {
+ kfree(cb_data);
+ return err;
}
nla = cb_data->inet_diag_nla_bpf_stgs;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ceeeec9b7290..ef4ccfd46ff6 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -425,19 +425,18 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net,
}
struct sock *__inet_lookup_listener(const struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
struct inet_listen_hashbucket *ilb2;
+ struct inet_hashinfo *hashinfo;
struct sock *result = NULL;
unsigned int hash2;
/* Lookup redirect from BPF */
- if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
- hashinfo == net->ipv4.tcp_death_row.hashinfo) {
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
saddr, sport, daddr, hnum, dif,
inet_ehashfn);
@@ -445,6 +444,7 @@ struct sock *__inet_lookup_listener(const struct net *net,
goto done;
}
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
@@ -490,21 +490,22 @@ void sock_edemux(struct sk_buff *skb)
EXPORT_SYMBOL(sock_edemux);
struct sock *__inet_lookup_established(const struct net *net,
- struct inet_hashinfo *hashinfo,
- const __be32 saddr, const __be16 sport,
- const __be32 daddr, const u16 hnum,
- const int dif, const int sdif)
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif, const int sdif)
{
- INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- struct sock *sk;
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const struct hlist_nulls_node *node;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & hashinfo->ehash_mask;
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
@@ -579,8 +580,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
- if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_twsk_unique(sk, sk2, twp))
+ if (tcp_twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
@@ -707,7 +707,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
if (ok) {
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} else {
- this_cpu_inc(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
inet_sk_set_state(sk, TCP_CLOSE);
sock_set_flag(sk, SOCK_DEAD);
inet_csk_destroy_sock(sk);
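
The lookup-helper changes in this file drop the hashinfo argument: with a single TCP hash table per namespace, __inet_lookup_listener() and __inet_lookup_established() fetch net->ipv4.tcp_death_row.hashinfo themselves. A caller sketch matching the new signatures used in esp4.c and nf_tproxy_ipv4.c above (ports are __be16 in these wrappers; the double-underscore variants take the local port in host order):

    sk = inet_lookup_established(net, saddr, sport, daddr, dport, dif);
    sk = inet_lookup_listener(net, skb, doff, saddr, sport,
                              daddr, dport, dif, sdif);
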
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 875ff923a8ed..5b5426b8ee92 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -15,7 +15,7 @@
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
-
+#include <net/tcp.h>
/**
* inet_twsk_bind_unhash - unhash a timewait socket from bind hash
@@ -74,7 +74,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
void inet_twsk_free(struct inet_timewait_sock *tw)
{
struct module *owner = tw->tw_prot->owner;
- twsk_destructor((struct sock *)tw);
+
+ tcp_twsk_destructor((struct sock *)tw);
kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
module_put(owner);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b2584cce90ae..f7012479713b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -476,14 +476,16 @@ out_fail:
/* Process an incoming IP datagram fragment. */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
- struct net_device *dev = skb->dev ? : skb_dst_dev(skb);
- int vif = l3mdev_master_ifindex_rcu(dev);
+ struct net_device *dev;
struct ipq *qp;
+ int vif;
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
/* Lookup (or create) queue header */
rcu_read_lock();
+ dev = skb->dev ? : skb_dst_dev_rcu(skb);
+ vif = l3mdev_master_ifindex_rcu(dev);
qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
int ret, refs = 0;
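
Same discipline as the icmp.c hunk earlier: dst_dev_rcu()/skb_dst_dev_rcu() return a device pointer that is only guaranteed to stay alive inside the RCU read-side section, so both the lookup and every use of the result move under rcu_read_lock(). Reduced to a sketch:

    rcu_read_lock();
    dev = skb->dev ? : skb_dst_dev_rcu(skb);
    vif = l3mdev_master_ifindex_rcu(dev);
    /* ... use dev and vif; do not cache dev past the unlock ... */
    rcu_read_unlock();
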
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f5b9004d6938..761a53c6a89a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -28,6 +28,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
+#include <net/flow.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -44,7 +45,6 @@
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>
-#include <net/inet_dscp.h>
/*
Problems & solutions
@@ -930,7 +930,7 @@ static int ipgre_open(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr)) {
struct flowi4 fl4 = {
.flowi4_oif = t->parms.link,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)),
+ .flowi4_dscp = ip4h_dscp(&t->parms.iph),
.flowi4_scope = RT_SCOPE_UNIVERSE,
.flowi4_proto = IPPROTO_GRE,
.saddr = t->parms.iph.saddr,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fc323994b1fa..a09aca2c8567 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -587,9 +587,13 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
static struct sk_buff *ip_extract_route_hint(const struct net *net,
- struct sk_buff *skb, int rt_type)
+ struct sk_buff *skb)
{
- if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (fib4_has_custom_rules(net) ||
+ ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_zeronet(iph->daddr) ||
IPCB(skb)->flags & IPSKB_MULTIPATH)
return NULL;
@@ -618,8 +622,7 @@ static void ip_list_rcv_finish(struct net *net, struct list_head *head)
dst = skb_dst(skb);
if (curr_dst != dst) {
- hint = ip_extract_route_hint(net, skb,
- dst_rtable(dst)->rt_type);
+ hint = ip_extract_route_hint(net, skb);
/* dispatch old sublist */
if (!list_empty(&sublist))
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index e3321932bec0..be8815ce3ac2 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -615,14 +615,13 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev)
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
- orefdst = skb->_skb_refdst;
- skb_dst_set(skb, NULL);
+ orefdst = skb_dstref_steal(skb);
err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph),
dev) ? -EINVAL : 0;
rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
skb_dst_drop(skb);
- skb->_skb_refdst = orefdst;
+ skb_dstref_restore(skb, orefdst);
return -EINVAL;
}
refdst_drop(orefdst);
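
This hunk, like the icmp.c one earlier, replaces open-coded _skb_refdst saves with the skb_dstref_steal()/skb_dstref_restore() pair: steal detaches the dst without touching its refcount and hands back the old refdst word, restore reattaches it. The round trip above, as a sketch:

    unsigned long orefdst;

    orefdst = skb_dstref_steal(skb);        /* detach, refcnt kept */
    err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph),
                         dev) ? -EINVAL : 0;
    if (err) {
            skb_dst_drop(skb);              /* drop the trial route */
            skb_dstref_restore(skb, orefdst);
            return -EINVAL;
    }
    refdst_drop(orefdst);                   /* keep the trial route */
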
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 84e7f8a2f50f..2b96651d719b 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -63,6 +63,7 @@
#include <linux/stat.h>
#include <linux/init.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -485,7 +486,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
inet_sk_init_flowi4(inet, fl4);
/* sctp_v4_xmit() uses its own DSCP value */
- fl4->flowi4_tos = tos & INET_DSCP_MASK;
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos);
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e86a8a862c41..ca9eaee4c2ef 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -42,6 +42,7 @@
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -1904,7 +1905,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
return -1;
}
- encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
ip_rt_put(rt);
@@ -1957,7 +1958,7 @@ static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt,
* result in receiving multiple packets.
*/
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, rt->dst.dev,
+ net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst),
ipmr_forward_finish);
return;
@@ -2120,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
+ .flowi4_dscp = ip4h_dscp(iph),
.flowi4_oif = (rt_is_output_route(rt) ?
skb->dev->ifindex : 0),
.flowi4_iif = (rt_is_output_route(rt) ?
@@ -2301,7 +2302,7 @@ int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
guard(rcu)();
- dev = rt->dst.dev;
+ dev = dst_dev_rcu(&rt->dst);
if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto mc_output;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 0565f001120d..ce310eb779e0 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -11,10 +11,10 @@
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <linux/export.h>
+#include <net/flow.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
-#include <net/inet_dscp.h>
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
@@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
*/
fl4.daddr = iph->daddr;
fl4.saddr = saddr;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
@@ -65,7 +65,10 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
- skb_dst_set(skb, NULL);
+ /* ignore return value from skb_dstref_steal, xfrm_lookup takes
+ * care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index a27782d7653e..6d9bf5106868 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -8,8 +8,8 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
-#include <net/inet_dscp.h>
#include <linux/ip.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
- flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ flow.flowi4_dscp = ip4h_dscp(iph);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index ed08fb78cfa8..9a773502f10a 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -12,10 +12,10 @@
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <net/checksum.h>
+#include <net/flow.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
-#include <net/inet_dscp.h>
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
@@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
fl4.flowi4_oif = oif;
fl4.daddr = gw->s_addr;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 0d3cb2ba6fc8..05631abe3f0d 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -12,6 +12,15 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl);
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth);
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook);
+
static int nf_reject_iphdr_validate(struct sk_buff *skb)
{
struct iphdr *iph;
@@ -136,8 +145,9 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach);
-const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *_oth, int hook)
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook)
{
const struct tcphdr *oth;
@@ -163,11 +173,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
return oth;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
-struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int ttl)
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl)
{
struct iphdr *niph, *oiph = ip_hdr(oldskb);
@@ -188,10 +197,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
return niph;
}
-EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
-void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
- const struct tcphdr *oth)
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth)
{
struct iphdr *niph = ip_hdr(nskb);
struct tcphdr *tcph;
@@ -218,7 +226,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
nskb->csum_start = (unsigned char *)tcph - nskb->head;
nskb->csum_offset = offsetof(struct tcphdr, check);
}
-EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
{
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index a1350fc25838..5080fa5fbf6a 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -71,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo,
- skb, doff, saddr, sport, daddr, dport,
+ return inet_lookup(net, skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp4_lib_lookup(net, saddr, sport, daddr, dport,
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 73e66a088e25..041c3f37f237 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -81,7 +81,6 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
- struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
@@ -95,7 +94,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet_lookup_listener(net, hinfo, skb,
+ sk = inet_lookup_listener(net, skb,
ip_hdrlen(skb) + __tcp_hdrlen(hp),
saddr, sport, daddr, dport,
in->ifindex, 0);
@@ -109,7 +108,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = inet_lookup_established(net, hinfo, saddr, sport,
+ sk = inet_lookup_established(net, saddr, sport,
daddr, dport, in->ifindex);
break;
default:
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 7e7c49535e3f..82af6cd76d13 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -10,7 +10,7 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_fib.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -114,7 +114,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (priv->flags & NFTA_FIB_F_MARK)
fl4.flowi4_mark = pkt->skb->mark;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
if (priv->flags & NFTA_FIB_F_DADDR) {
fl4.daddr = iph->daddr;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 29118c43ebf5..0a20625f5ffb 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -2087,6 +2087,12 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
{
struct nh_grp_entry *nhge, *tmp;
+ /* If there is nothing to do, let's avoid the costly call to
+ * synchronize_net()
+ */
+ if (list_empty(&nh->grp_list))
+ return;
+
list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
remove_nh_grp_entry(net, nhge, nlinfo);
@@ -3511,12 +3517,42 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb,
int err;
s_idx = ctx->idx;
- for (node = rb_first(root); node; node = rb_next(node)) {
+
+ /* If this is not the first invocation, ctx->idx will contain the id of
+ * the last nexthop we processed. Instead of starting from the very
+ * first element of the red/black tree again and linearly skipping the
+ * (potentially large) set of nodes with an id smaller than s_idx, walk
+ * the tree and find the left-most node whose id is >= s_idx. This
+ * provides an efficient O(log n) starting point for the dump
+ * continuation.
+ */
+ if (s_idx != 0) {
+ struct rb_node *tmp = root->rb_node;
+
+ node = NULL;
+ while (tmp) {
+ struct nexthop *nh;
+
+ nh = rb_entry(tmp, struct nexthop, rb_node);
+ if (nh->id < s_idx) {
+ tmp = tmp->rb_right;
+ } else {
+ /* Track current candidate and keep looking on
+ * the left side to find the left-most
+ * (smallest id) that is still >= s_idx.
+ */
+ node = tmp;
+ tmp = tmp->rb_left;
+ }
+ }
+ } else {
+ node = rb_first(root);
+ }
+
+ for (; node; node = rb_next(node)) {
struct nexthop *nh;
nh = rb_entry(node, struct nexthop, rb_node);
- if (nh->id < s_idx)
- continue;
ctx->idx = nh->id;
err = nh_cb(skb, cb, nh, data);
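
The resume walk above is a textbook rb-tree lower bound: descend from the root, going right while the node's id is below the resume point and remembering the last node passed on a left turn. The same search as a self-contained sketch (struct item and its key are illustrative, not from this patch):

    #include <linux/rbtree.h>

    struct item {
            struct rb_node rb_node;
            u32 id;
    };

    /* Left-most node with id >= from, or NULL if none. O(log n). */
    static struct item *item_lower_bound(struct rb_root *root, u32 from)
    {
            struct rb_node *tmp = root->rb_node;
            struct item *best = NULL;

            while (tmp) {
                    struct item *it = rb_entry(tmp, struct item, rb_node);

                    if (it->id < from) {
                            tmp = tmp->rb_right;
                    } else {
                            best = it;      /* candidate; keep looking left */
                            tmp = tmp->rb_left;
                    }
            }
            return best;
    }
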
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 031df4c19fcc..5321c5801c64 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -56,9 +56,7 @@ struct ping_table {
static struct ping_table ping_table;
struct pingv6_ops pingv6_ops;
-EXPORT_SYMBOL_GPL(pingv6_ops);
-
-static u16 ping_port_rover;
+EXPORT_IPV6_MOD_GPL(pingv6_ops);
static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
{
@@ -67,7 +65,6 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
pr_debug("hash(%u) = %u\n", num, res);
return res;
}
-EXPORT_SYMBOL_GPL(ping_hash);
static inline struct hlist_head *ping_hashslot(struct ping_table *table,
struct net *net, unsigned int num)
@@ -77,6 +74,7 @@ static inline struct hlist_head *ping_hashslot(struct ping_table *table,
int ping_get_port(struct sock *sk, unsigned short ident)
{
+ struct net *net = sock_net(sk);
struct inet_sock *isk, *isk2;
struct hlist_head *hlist;
struct sock *sk2 = NULL;
@@ -84,15 +82,16 @@ int ping_get_port(struct sock *sk, unsigned short ident)
isk = inet_sk(sk);
spin_lock(&ping_table.lock);
if (ident == 0) {
+ u16 result = net->ipv4.ping_port_rover + 1;
u32 i;
- u16 result = ping_port_rover + 1;
for (i = 0; i < (1L << 16); i++, result++) {
if (!result)
- result++; /* avoid zero */
- hlist = ping_hashslot(&ping_table, sock_net(sk),
- result);
+ continue; /* avoid zero */
+ hlist = ping_hashslot(&ping_table, net, result);
sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
isk2 = inet_sk(sk2);
if (isk2->inet_num == result)
@@ -100,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident)
}
/* found */
- ping_port_rover = ident = result;
+ net->ipv4.ping_port_rover = ident = result;
break;
next_port:
;
@@ -108,8 +107,10 @@ next_port:
if (i >= (1L << 16))
goto fail;
} else {
- hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+ hlist = ping_hashslot(&ping_table, net, ident);
sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
isk2 = inet_sk(sk2);
/* BUG? Why is this reuse and not reuseaddr? ping.c
@@ -129,7 +130,7 @@ next_port:
pr_debug("was not hashed\n");
sk_add_node_rcu(sk, hlist);
sock_set_flag(sk, SOCK_RCU_FREE);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
}
spin_unlock(&ping_table.lock);
return 0;
@@ -138,15 +139,7 @@ fail:
spin_unlock(&ping_table.lock);
return -EADDRINUSE;
}
-EXPORT_SYMBOL_GPL(ping_get_port);
-
-int ping_hash(struct sock *sk)
-{
- pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
- BUG(); /* "Please do not press this button again." */
-
- return 0;
-}
+EXPORT_IPV6_MOD_GPL(ping_get_port);
void ping_unhash(struct sock *sk)
{
@@ -161,7 +154,7 @@ void ping_unhash(struct sock *sk)
}
spin_unlock(&ping_table.lock);
}
-EXPORT_SYMBOL_GPL(ping_unhash);
+EXPORT_IPV6_MOD_GPL(ping_unhash);
/* Called under rcu_read_lock() */
static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
@@ -188,6 +181,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
}
sk_for_each_rcu(sk, hslot) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
isk = inet_sk(sk);
pr_debug("iterate\n");
@@ -279,7 +274,7 @@ out_release_group:
put_group_info(group_info);
return ret;
}
-EXPORT_SYMBOL_GPL(ping_init_sock);
+EXPORT_IPV6_MOD_GPL(ping_init_sock);
void ping_close(struct sock *sk, long timeout)
{
@@ -289,7 +284,7 @@ void ping_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
-EXPORT_SYMBOL_GPL(ping_close);
+EXPORT_IPV6_MOD_GPL(ping_close);
static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
@@ -467,7 +462,7 @@ out:
pr_debug("ping_v4_bind -> %d\n", err);
return err;
}
-EXPORT_SYMBOL_GPL(ping_bind);
+EXPORT_IPV6_MOD_GPL(ping_bind);
/*
* Is this a supported type of ICMP message?
@@ -600,7 +595,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
out:
return;
}
-EXPORT_SYMBOL_GPL(ping_err);
+EXPORT_IPV6_MOD_GPL(ping_err);
/*
* Copy and checksum an ICMP Echo packet from user space into a buffer
@@ -630,7 +625,7 @@ int ping_getfrag(void *from, char *to,
return 0;
}
-EXPORT_SYMBOL_GPL(ping_getfrag);
+EXPORT_IPV6_MOD_GPL(ping_getfrag);
static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
struct flowi4 *fl4)
@@ -691,7 +686,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
return 0;
}
-EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);
static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
@@ -936,7 +931,7 @@ out:
pr_debug("ping_recvmsg -> %d\n", err);
return err;
}
-EXPORT_SYMBOL_GPL(ping_recvmsg);
+EXPORT_IPV6_MOD_GPL(ping_recvmsg);
static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
struct sk_buff *skb)
@@ -957,7 +952,7 @@ int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
return __ping_queue_rcv_skb(sk, skb) ? -1 : 0;
}
-EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
+EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb);
/*
@@ -985,7 +980,7 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb)
kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
return SKB_DROP_REASON_NO_SOCKET;
}
-EXPORT_SYMBOL_GPL(ping_rcv);
+EXPORT_IPV6_MOD_GPL(ping_rcv);
struct proto ping_prot = {
.name = "PING",
@@ -1002,13 +997,12 @@ struct proto ping_prot = {
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
- .hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
.put_port = ping_unhash,
.obj_size = sizeof(struct inet_sock),
};
-EXPORT_SYMBOL(ping_prot);
+EXPORT_IPV6_MOD(ping_prot);
#ifdef CONFIG_PROC_FS
@@ -1073,7 +1067,7 @@ void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
-EXPORT_SYMBOL_GPL(ping_seq_start);
+EXPORT_IPV6_MOD_GPL(ping_seq_start);
static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -1092,14 +1086,14 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
-EXPORT_SYMBOL_GPL(ping_seq_next);
+EXPORT_IPV6_MOD_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v)
__releases(ping_table.lock)
{
spin_unlock(&ping_table.lock);
}
-EXPORT_SYMBOL_GPL(ping_seq_stop);
+EXPORT_IPV6_MOD_GPL(ping_seq_stop);
static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
int bucket)
@@ -1119,7 +1113,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
static int ping_v4_seq_show(struct seq_file *seq, void *v)
@@ -1150,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net)
if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops,
sizeof(struct ping_iter_state)))
return -ENOMEM;
+
+ net->ipv4.ping_port_rover = get_random_u16();
return 0;
}
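
Two behavioural points in the ping.c hunks above: ping_lookup() now skips sockets that belong to other network namespaces, and each namespace seeds its own ping_port_rover so ident allocation no longer starts from a fixed point. A userspace sketch of a rover-style search of that general shape (all names illustrative, not the kernel's):

    /* Sketch: rover-based ident allocation, modelling the search loop
     * in ping_get_port(). Illustrative only. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define IDENT_SPACE 65536

    static bool in_use[IDENT_SPACE];
    static unsigned short rover;            /* per-netns in the kernel */

    static int alloc_ident(void)
    {
            for (int i = 0; i < IDENT_SPACE; i++) {
                    unsigned short id = (unsigned short)(rover + i);

                    if (!in_use[id]) {
                            in_use[id] = true;
                            rover = id + 1; /* next search starts here */
                            return id;
                    }
            }
            return -1;                      /* -EADDRINUSE in the kernel */
    }

    int main(void)
    {
            rover = (unsigned short)(rand() & 0xffff); /* get_random_u16() stand-in */
            printf("ident %d\n", alloc_ident());
            return 0;
    }

Randomizing the seed only perturbs where the linear search begins; subsequent allocations still advance the rover as before.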
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 65b0d0ab0084..974afc4ecbe2 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -95,7 +95,6 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS),
- SNMP_MIB_SENTINEL
};
/* Following items are displayed in /proc/net/netstat */
@@ -119,7 +118,6 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
- SNMP_MIB_SENTINEL
};
static const struct {
@@ -157,7 +155,6 @@ static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp4_udp_list[] = {
@@ -170,7 +167,6 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp4_net_list[] = {
@@ -309,7 +305,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND),
SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD),
SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS),
- SNMP_MIB_SENTINEL
};
static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
@@ -389,14 +384,15 @@ static void icmp_put(struct seq_file *seq)
*/
static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
{
+ const int cnt = ARRAY_SIZE(snmp4_ipstats_list);
+ u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)];
struct net *net = seq->private;
- u64 buff64[IPSTATS_MIB_MAX];
int i;
- memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
+ memset(buff64, 0, sizeof(buff64));
seq_puts(seq, "Ip: Forwarding DefaultTTL");
- for (i = 0; snmp4_ipstats_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d",
@@ -404,10 +400,10 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
- snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
- net->mib.ip_statistics,
- offsetof(struct ipstats_mib, syncp));
- for (i = 0; snmp4_ipstats_list[i].name; i++)
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %llu", buff64[i]);
return 0;
@@ -415,20 +411,23 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
{
+ const int udp_cnt = ARRAY_SIZE(snmp4_udp_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list);
unsigned long buff[TCPUDP_MIB_MAX];
struct net *net = seq->private;
int i;
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, tcp_cnt * sizeof(unsigned long));
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name; i++)
+ for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %s", snmp4_tcp_list[i].name);
seq_puts(seq, "\nTcp:");
- snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
- net->mib.tcp_statistics);
- for (i = 0; snmp4_tcp_list[i].name; i++) {
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list,
+ tcp_cnt,
+ net->mib.tcp_statistics);
+ for (i = 0; i < tcp_cnt; i++) {
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld", buff[i]);
@@ -436,27 +435,29 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
seq_printf(seq, " %lu", buff[i]);
}
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
- snmp_get_cpu_field_batch(buff, snmp4_udp_list,
- net->mib.udp_statistics);
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udp_statistics);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
/* the UDP and UDP-Lite MIBs are the same */
seq_puts(seq, "\nUdpLite:");
- snmp_get_cpu_field_batch(buff, snmp4_udp_list,
- net->mib.udplite_statistics);
- for (i = 0; snmp4_udp_list[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udplite_statistics);
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
seq_putc(seq, '\n');
@@ -480,8 +481,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
*/
static int netstat_seq_show(struct seq_file *seq, void *v)
{
- const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1;
- const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1;
+ const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_net_list);
struct net *net = seq->private;
unsigned long *buff;
int i;
@@ -494,8 +495,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),
GFP_KERNEL);
if (buff) {
- snmp_get_cpu_field_batch(buff, snmp4_net_list,
- net->mib.net_statistics);
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt,
+ net->mib.net_statistics);
for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
} else {
@@ -513,7 +514,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
u64 *buff64 = (u64 *)buff;
memset(buff64, 0, ip_cnt * sizeof(u64));
- snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list,
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt,
net->mib.ip_statistics,
offsetof(struct ipstats_mib, syncp));
for (i = 0; i < ip_cnt; i++)
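
For context, the proc.c conversion above replaces sentinel-terminated walks (for (i = 0; list[i].name; i++)) with explicit counts, so the SNMP_MIB_SENTINEL entry and the array slot it cost can be dropped, and the *_batch_cnt helpers receive the bound directly. A compilable sketch of the counted form (illustrative types only):

    #include <stdio.h>

    struct snmp_mib { const char *name; int entry; };

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    static const struct snmp_mib mibs[] = {
            { "InReceives",  0 },
            { "OutRequests", 1 },
            /* no sentinel entry needed in the counted form */
    };

    int main(void)
    {
            const int cnt = ARRAY_SIZE(mibs);

            for (int i = 0; i < cnt; i++)   /* bound comes from ARRAY_SIZE */
                    printf(" %s", mibs[i].name);
            putchar('\n');
            return 0;
    }

The same count also sizes the on-stack buff64[] array, so the buffers shrink from the MIB_MAX ceilings to exactly what each list needs.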
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1d2c89d63cc7..d54ebb7df966 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb,
if (atomic_read(&sk->sk_rmem_alloc) >=
READ_ONCE(sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
continue;
}
@@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
@@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
+ sk->sk_drop_counters = &rp->drop_counters;
if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
memset(&rp->filter, 0, sizeof(rp->filter));
return 0;
@@ -1045,7 +1046,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
0, 0L, 0,
from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
0, sock_i_ino(sp),
- refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+ refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp));
}
static int raw_seq_show(struct seq_file *seq, void *v)
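
The atomic_inc()/atomic_read() calls above become sk_drops_inc()/sk_drops_read() so the drop count can sit behind an accessor, and raw_sk_init() points sk->sk_drop_counters at counters embedded in the raw socket. A sketch of that indirection, with C11 atomics standing in for the kernel's primitives and not reflecting the kernel's exact layout:

    #include <stdatomic.h>
    #include <stdio.h>

    struct drop_counters { atomic_long drops; };

    struct sock_model {
            atomic_long drops;                   /* legacy field */
            struct drop_counters *drop_counters; /* optional per-socket override */
    };

    static atomic_long *sk_drops_ptr(struct sock_model *sk)
    {
            return sk->drop_counters ? &sk->drop_counters->drops : &sk->drops;
    }

    static void sk_drops_inc(struct sock_model *sk)
    {
            atomic_fetch_add_explicit(sk_drops_ptr(sk), 1, memory_order_relaxed);
    }

    static long sk_drops_read(struct sock_model *sk)
    {
            return atomic_load_explicit(sk_drops_ptr(sk), memory_order_relaxed);
    }

    int main(void)
    {
            struct sock_model sk = { .drop_counters = NULL };

            sk_drops_inc(&sk);
            printf("%ld\n", sk_drops_read(&sk));
            return 0;
    }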
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index cc793bd8de25..943e5998e0ad 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -126,9 +126,9 @@ static int raw_diag_dump_one(struct netlink_callback *cb,
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
@@ -140,17 +140,13 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
struct hlist_head *hlist;
struct sock *sk = NULL;
- struct nlattr *bc;
if (IS_ERR(hashinfo))
return;
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -174,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
if (r->id.idiag_dport != inet->inet_dport &&
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0)
goto out_unlock;
next:
num++;
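
inet_diag_bc_sk() now takes the dump state (cb->data) directly, so raw_diag no longer unpacks inet_diag_nla_bc by hand before every match. A sketch of the pattern of passing the container and letting the callee find the filter (illustrative types, not the inet_diag structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct filter { int min_port; };
    struct dump_state { struct filter *bc; };

    /* After the change: the callee unpacks the filter itself. */
    static bool bc_match(const struct dump_state *st, int port)
    {
            return !st->bc || port >= st->bc->min_port;
    }

    int main(void)
    {
            struct filter f = { .min_port = 1024 };
            struct dump_state st = { .bc = &f };

            printf("%d\n", bc_match(&st, 80));   /* 0: filtered out */
            printf("%d\n", bc_match(&st, 8080)); /* 1: passes */
            return 0;
    }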
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index baa43e5966b1..6d27d3610c1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -84,6 +84,7 @@
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
@@ -413,11 +414,11 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
const void *daddr)
{
const struct rtable *rt = container_of(dst, struct rtable, dst);
- struct net_device *dev = dst_dev(dst);
+ struct net_device *dev;
struct neighbour *n;
rcu_read_lock();
-
+ dev = dst_dev_rcu(dst);
if (likely(rt->rt_gw_family == AF_INET)) {
n = ip_neigh_gw4(dev, rt->rt_gw4);
} else if (rt->rt_gw_family == AF_INET6) {
@@ -1026,7 +1027,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
return;
rcu_read_lock();
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
if (mtu < net->ipv4.ip_rt_min_pmtu) {
lock = true;
mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
@@ -1221,8 +1222,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
+ struct inet_skb_parm parm;
struct net_device *dev;
- struct ip_options opt;
int res;
/* Recompile ip options since IPCB may not be valid anymore.
@@ -1232,21 +1233,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb)
ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
return;
- memset(&opt, 0, sizeof(opt));
+ memset(&parm, 0, sizeof(parm));
if (ip_hdr(skb)->ihl > 5) {
if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
return;
- opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
- res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
+ res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL);
rcu_read_unlock();
if (res)
return;
}
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm);
}
static void ipv4_link_failure(struct sk_buff *skb)
@@ -1291,7 +1292,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
+ .flowi4_dscp = ip4h_dscp(iph),
.flowi4_oif = rt->dst.dev->ifindex,
.flowi4_iif = skb->dev->ifindex,
.flowi4_mark = skb->mark,
@@ -1326,7 +1327,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
struct net *net;
rcu_read_lock();
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
net->ipv4.ip_rt_min_advmss);
rcu_read_unlock();
@@ -2210,7 +2211,7 @@ ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto martian_source;
}
- if (rt->rt_type != RTN_LOCAL)
+ if (!(rt->rt_flags & RTCF_LOCAL))
goto skip_validate_source;
reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev,
@@ -2331,7 +2332,7 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.flowi4_oif = 0;
fl4.flowi4_iif = dev->ifindex;
fl4.flowi4_mark = skb->mark;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
fl4.daddr = daddr;
@@ -2694,7 +2695,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
struct rtable *rth;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
- fl4->flowi4_tos &= INET_DSCP_MASK;
rcu_read_lock();
rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
@@ -3337,7 +3337,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
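
The route.c hunks (and the later udp_tunnel_core.c and xfrm4_policy.c ones) stop storing a raw ToS byte in flowi4_tos and carry a typed DSCP value in flowi4_dscp instead, which is why the INET_DSCP_MASK masking at the output-route entry point can go away. The representational difference, as a compilable sketch (the kernel's dscp_t is a sparse-checked __bitwise type, modelled here as a plain byte):

    #include <assert.h>

    typedef unsigned char dscp_t;   /* kernel: __bitwise, checked by sparse */

    #define INET_DSCP_MASK 0xfc     /* top six bits of the ToS byte */

    static dscp_t inet_dsfield_to_dscp(unsigned char dsfield)
    {
            return (dscp_t)(dsfield & INET_DSCP_MASK); /* ECN bits dropped */
    }

    int main(void)
    {
            /* DSCP 0xb8 with ECN bit 0x01 set: only the DSCP bits survive */
            assert(inet_dsfield_to_dscp(0xb9) == 0xb8);
            return 0;
    }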
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 71a956fbfc55..588932c3cf1d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3099,8 +3099,8 @@ bool tcp_check_oom(const struct sock *sk, int shift)
void __tcp_close(struct sock *sk, long timeout)
{
+ bool data_was_unread = false;
struct sk_buff *skb;
- int data_was_unread = 0;
int state;
WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
@@ -3118,13 +3118,14 @@ void __tcp_close(struct sock *sk, long timeout)
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- len--;
- data_was_unread += len;
- __kfree_skb(skb);
+ end_seq--;
+ if (after(end_seq, tcp_sk(sk)->copied_seq))
+ data_was_unread = true;
+ tcp_eat_recv_skb(sk, skb);
}
/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
@@ -3195,7 +3196,7 @@ adjudge_to_death:
/* remove backlog if any, without releasing ownership. */
__release_sock(sk);
- this_cpu_inc(tcp_orphan_count);
+ tcp_orphan_count_inc();
/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -3376,7 +3377,7 @@ int tcp_disconnect(struct sock *sk, int flags)
WRITE_ONCE(tp->write_seq, seq);
icsk->icsk_backoff = 0;
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_probes_tstamp = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
@@ -3760,7 +3761,7 @@ int tcp_sock_set_maxseg(struct sock *sk, int val)
if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW))
return -EINVAL;
- tcp_sk(sk)->rx_opt.user_mss = val;
+ WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val);
return 0;
}
@@ -3890,15 +3891,13 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
return 0;
}
+ case TCP_MAXSEG:
+ return tcp_sock_set_maxseg(sk, val);
}
sockopt_lock_sock(sk);
switch (optname) {
- case TCP_MAXSEG:
- err = tcp_sock_set_maxseg(sk, val);
- break;
-
case TCP_NODELAY:
__tcp_sock_set_nodelay(sk, val);
break;
@@ -4348,7 +4347,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
- nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+ nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
+ READ_ONCE(inet_csk(sk)->icsk_retransmits));
nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
@@ -4383,6 +4383,7 @@ int do_tcp_getsockopt(struct sock *sk, int level,
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
+ int user_mss;
int val, len;
if (copy_from_sockptr(&len, optlen, sizeof(int)))
@@ -4396,9 +4397,10 @@ int do_tcp_getsockopt(struct sock *sk, int level,
switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache;
- if (tp->rx_opt.user_mss &&
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss &&
((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- val = tp->rx_opt.user_mss;
+ val = user_mss;
if (tp->repair)
val = tp->rx_opt.mss_clamp;
break;
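
The __tcp_close() rewrite above stops summing unread bytes and instead flags unread data whenever a queued segment ends beyond copied_seq, using the wraparound-safe sequence comparison; the old byte count was only ever tested for non-zero. A model of that sequence-space test:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Wraparound-safe "seq1 is after seq2" on 32-bit sequence numbers,
     * matching the kernel's before()/after() idiom. */
    static bool after(uint32_t seq1, uint32_t seq2)
    {
            return (int32_t)(seq2 - seq1) < 0;
    }

    int main(void)
    {
            uint32_t copied_seq = 0xffffff00u;

            assert(!after(copied_seq, copied_seq));      /* fully read */
            assert(after(copied_seq + 512, copied_seq)); /* unread; wraps 2^32 */
            return 0;
    }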
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index ba4d98e510e0..fbad6c35dee9 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -379,7 +379,7 @@ static void tcp_cdg_init(struct sock *sk)
/* We silently fall back to window = 1 if allocation fails. */
if (window > 1)
ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
ca->rtt_seq = tp->snd_nxt;
ca->shadow_wnd = tcp_snd_cwnd(tp);
}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 45e174b8cd22..d83efd91f461 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -12,6 +12,9 @@
#include <linux/tcp.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/inet_timewait_sock.h>
#include <net/netlink.h>
#include <net/tcp.h>
@@ -174,27 +177,465 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
size += ulp_ops->get_info_size(sk, net_admin);
}
}
- return size;
+
+ return size
+ + nla_total_size(sizeof(struct tcp_info))
+ + nla_total_size(sizeof(struct inet_diag_msg))
+ + inet_diag_msg_attrs_size()
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ + nla_total_size(TCP_CA_NAME_MAX)
+ + nla_total_size(sizeof(struct tcpvegas_info))
+ + 64;
+}
+
+static int tcp_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
+ sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT);
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_retrans = 0;
+
+ r->idiag_state = READ_ONCE(tw->tw_substate);
+ r->idiag_timer = 3;
+ tmo = tw->tw_timer.expires - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ tw->tw_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct request_sock *reqsk = inet_reqsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = READ_ONCE(reqsk->num_retrans);
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = READ_ONCE(inet_reqsk(sk)->rsk_timer.expires) - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ u16 nlmsg_flags, bool net_admin)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
+ net_admin);
+}
+
+static void twsk_build_assert(void)
+{
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
+#endif
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
- struct inet_hashinfo *hinfo;
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct inet_diag_dump_data *cb_data = cb->data;
+ struct net *net = sock_net(skb->sk);
+ u32 idiag_states = r->idiag_states;
+ struct inet_hashinfo *hashinfo;
+ int i, num, s_i, s_num;
+ struct sock *sk;
- hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo;
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
+ goto skip_listen_ht;
+
+ for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+
+ num = 0;
+ ilb = &hashinfo->lhash2[i];
+
+ if (hlist_nulls_empty(&ilb->nulls_head)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_listen;
+
+ if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
+ cb, r, NLM_F_MULTI,
+ net_admin) < 0) {
+ spin_unlock(&ilb->lock);
+ goto done;
+ }
+
+next_listen:
+ ++num;
+ }
+ spin_unlock(&ilb->lock);
+
+ s_num = 0;
+ }
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
+ * with bh disabled.
+ */
+#define SKARR_SZ 16
+
+ /* Dump bound but inactive (not listening, connecting, etc.) sockets */
+ if (cb->args[0] == 1) {
+ if (!(idiag_states & TCPF_BOUND_INACTIVE))
+ goto skip_bind_ht;
+
+ for (i = s_i; i < hashinfo->bhash_size; i++) {
+ struct inet_bind_hashbucket *ibb;
+ struct inet_bind2_bucket *tb2;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+resume_bind_walk:
+ num = 0;
+ accum = 0;
+ ibb = &hashinfo->bhash2[i];
+
+ if (hlist_empty(&ibb->chain)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock_bh(&ibb->lock);
+ inet_bind_bucket_for_each(tb2, &ibb->chain) {
+ if (!net_eq(ib2_net(tb2), net))
+ continue;
+
+ sk_for_each_bound(sk, &tb2->owners) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_bind;
+
+ if (sk->sk_state != TCP_CLOSE ||
+ !inet->inet_num)
+ goto next_bind;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ r->sdiag_family != sk->sk_family)
+ goto next_bind;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_bind;
+
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ goto pause_bind_walk;
+next_bind:
+ num++;
+ }
+ }
+pause_bind_walk:
+ spin_unlock_bh(&ibb->lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = inet_sk_diag_fill(sk_arr[idx],
+ NULL, skb, cb,
+ r, NLM_F_MULTI,
+ net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ goto done;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto resume_bind_walk;
+ }
+
+ s_num = 0;
+ }
+skip_bind_ht:
+ cb->args[0] = 2;
+ s_i = num = s_num = 0;
+ }
- inet_diag_dump_icsk(hinfo, skb, cb, r);
+ if (!(idiag_states & ~TCPF_LISTEN))
+ goto out;
+
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+ struct hlist_nulls_node *node;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
+ if (i > s_i)
+ s_num = 0;
+
+next_chunk:
+ num = 0;
+ accum = 0;
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &head->chain) {
+ int state;
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next_normal;
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
+ if (!(idiag_states & (1 << state)))
+ goto next_normal;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != sk->sk_dport &&
+ r->id.idiag_dport)
+ goto next_normal;
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_normal;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_normal;
+
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ break;
+next_normal:
+ ++num;
+ }
+ spin_unlock_bh(lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = sk_diag_fill(sk_arr[idx], skb, cb, r,
+ NLM_F_MULTI, net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_gen_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ break;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto next_chunk;
+ }
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+out:
+ ;
+}
+
+static struct sock *tcp_diag_find_one_icsk(struct net *net,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sock *sk;
+
+ rcu_read_lock();
+ if (req->sdiag_family == AF_INET) {
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3],
+ req->id.idiag_dport, req->id.idiag_src[3],
+ req->id.idiag_sport, req->id.idiag_if);
+ else
+ sk = inet6_lookup(net, NULL, 0,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+#endif
+ } else {
+ rcu_read_unlock();
+ return ERR_PTR(-EINVAL);
+ }
+ rcu_read_unlock();
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_gen_put(sk);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return sk;
}
static int tcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- struct inet_hashinfo *hinfo;
+ struct sk_buff *in_skb = cb->skb;
+ struct sk_buff *rep;
+ struct sock *sk;
+ struct net *net;
+ bool net_admin;
+ int err;
- hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo;
+ net = sock_net(in_skb->sk);
+ sk = tcp_diag_find_one_icsk(net, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
- return inet_diag_dump_one_icsk(hinfo, cb, req);
+ net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
+ rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ nlmsg_free(rep);
+ goto out;
+ }
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+ if (sk)
+ sock_gen_put(sk);
+
+ return err;
}
#ifdef CONFIG_INET_DIAG_DESTROY
@@ -202,13 +643,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
struct net *net = sock_net(in_skb->sk);
- struct inet_hashinfo *hinfo;
struct sock *sk;
int err;
- hinfo = net->ipv4.tcp_death_row.hashinfo;
- sk = inet_diag_find_one_icsk(net, hinfo, req);
-
+ sk = tcp_diag_find_one_icsk(net, req);
if (IS_ERR(sk))
return PTR_ERR(sk);
@@ -226,7 +664,6 @@ static const struct inet_diag_handler tcp_diag_handler = {
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_get_aux = tcp_diag_get_aux,
- .idiag_get_aux_size = tcp_diag_get_aux_size,
.idiag_type = IPPROTO_TCP,
.idiag_info_size = sizeof(struct tcp_info),
#ifdef CONFIG_INET_DIAG_DESTROY
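
The dump loops moved into tcp_diag.c keep their hold-then-fill shape: take references on at most SKARR_SZ sockets while the bucket lock is held with BHs off, then drop the lock and do the netlink fill for the whole batch, resuming the walk from the recorded position. A compilable model of that pattern, with a plain mutex standing in for the bucket spinlock:

    #include <pthread.h>
    #include <stdio.h>

    #define SKARR_SZ 16

    static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
    static int bucket[100];                 /* stand-in for a hash chain */

    static void process(int v) { printf("%d ", v); } /* the "fill" step */

    static void walk_bucket(int len)
    {
            int s_num = 0;                  /* resume point across batches */

            for (;;) {
                    int batch[SKARR_SZ], accum = 0, num = 0;

                    pthread_mutex_lock(&bucket_lock);
                    for (int i = 0; i < len; i++, num++) {
                            if (num < s_num)
                                    continue;           /* already emitted */
                            batch[accum++] = bucket[i]; /* "sock_hold()" */
                            if (accum == SKARR_SZ)
                                    break;
                    }
                    pthread_mutex_unlock(&bucket_lock);

                    for (int i = 0; i < accum; i++)     /* lock dropped: */
                            process(batch[i]);          /* fill may sleep */

                    if (accum < SKARR_SZ)
                            break;                      /* chain exhausted */
                    s_num = num + 1;
            }
    }

    int main(void)
    {
            for (int i = 0; i < 40; i++)
                    bucket[i] = i;
            walk_bucket(40);
            putchar('\n');
            return 0;
    }

Bounding the batch keeps the bh-disabled critical section short while still making forward progress through arbitrarily long chains.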
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f1884f0c9e52..7d945a527daf 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -576,11 +576,12 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
}
} else if (tp->syn_fastopen_ch &&
atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
- dst = sk_dst_get(sk);
- dev = dst ? dst_dev(dst) : NULL;
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
if (!(dev && (dev->flags & IFF_LOOPBACK)))
atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
- dst_release(dst);
+ rcu_read_unlock();
}
}
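
In the tcp_fastopen.c hunk above, the sk_dst_get()/dst_release() refcount round-trip becomes __sk_dst_get() under rcu_read_lock(): the dst and its device only need to stay valid for the duration of the IFF_LOOPBACK test. A model of borrowing an RCU-style shared pointer without taking a reference, with C11 acquire/release standing in for the kernel's RCU primitives:

    #include <stdatomic.h>
    #include <stdio.h>

    struct dst { int flags; };
    #define IFF_LOOPBACK 0x1

    static _Atomic(struct dst *) sk_dst;    /* sk->sk_dst_cache stand-in */

    int main(void)
    {
            static struct dst lo = { .flags = IFF_LOOPBACK };

            atomic_store_explicit(&sk_dst, &lo, memory_order_release);

            /* "rcu_read_lock()" section: borrow the pointer, no refcount. */
            struct dst *d = atomic_load_explicit(&sk_dst, memory_order_acquire);
            if (d && (d->flags & IFF_LOOPBACK))
                    puts("loopback dst");
            /* "rcu_read_unlock()": nothing to release. */
            return 0;
    }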
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 71b76e98371a..f1be65af1a77 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2569,7 +2569,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
if (frto_undo)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
if (tcp_is_non_sack_preventing_reopen(sk))
return true;
if (frto_undo || tcp_is_sack(tp)) {
@@ -3851,7 +3851,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
- icsk->icsk_retransmits = 0;
+ WRITE_ONCE(icsk->icsk_retransmits, 0);
#if IS_ENABLED(CONFIG_TLS_DEVICE)
if (static_branch_unlikely(&clean_acked_data_enabled.key))
@@ -3913,7 +3913,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* log. Something worked...
*/
WRITE_ONCE(sk->sk_err_soft, 0);
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
goto no_queue;
@@ -4830,7 +4830,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
noinline_for_tracing static void
tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
sk_skb_reason_drop(sk, skb, reason);
}
@@ -6297,7 +6297,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
- if (mss == tp->rx_opt.user_mss) {
+ if (mss == READ_ONCE(tp->rx_opt.user_mss)) {
struct tcp_options_received opt;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
@@ -6636,7 +6636,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
tcp_try_undo_recovery(sk);
tcp_update_rto_time(tp);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
/* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set
* retrans_stamp but don't enter CA_Loss, so in case that happened we
* need to zero retrans_stamp here to prevent spurious
@@ -7117,7 +7117,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
return 0;
}
- mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+ mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss));
if (!mss)
mss = af_ops->mss_clamp;
@@ -7131,7 +7131,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
{
struct tcp_fastopen_cookie foc = { .len = -1 };
struct tcp_options_received tmp_opt;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
@@ -7182,7 +7182,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
- tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss);
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
want_cookie ? NULL : &foc);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 84d3d556ed80..1e58a8a9ff7a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -506,8 +506,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
struct sock *sk;
int err;
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->daddr, th->dest, iph->saddr,
+ sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
ntohs(th->source), inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
@@ -823,8 +822,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- NULL, 0, ip_hdr(skb)->saddr,
+ sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
@@ -1992,8 +1990,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (th->doff < sizeof(struct tcphdr) / 4)
return 0;
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->saddr, th->source,
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
skb->skb_iif, inet_sdif(skb));
if (sk) {
@@ -2236,8 +2233,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
lookup:
- sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th), th->source,
+ sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -2258,7 +2254,7 @@ lookup:
&iph->saddr, &iph->daddr,
AF_INET, dif, sdif);
if (unlikely(drop_reason)) {
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -2403,7 +2399,7 @@ discard_it:
return 0;
discard_and_relse:
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
@@ -2426,9 +2422,7 @@ do_time_wait:
&drop_reason);
switch (tw_status) {
case TCP_TW_SYN: {
- struct sock *sk2 = inet_lookup_listener(net,
- net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th),
+ struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
@@ -2459,7 +2453,6 @@ do_time_wait:
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
- .twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
@@ -2958,9 +2951,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
+ READ_ONCE(icsk->icsk_retransmits),
from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
- icsk->icsk_probes_out,
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
jiffies_to_clock_t(icsk->icsk_rto),
@@ -3524,7 +3517,6 @@ struct proto tcp_prot = {
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
- .orphan_count = &tcp_orphan_count,
.memory_allocated = &net_aligned_data.tcp_memory_allocated,
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 03c068ea27b6..10e86f1008e9 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
struct net *net;
spin_lock_bh(&tcp_metrics_lock);
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
/* While waiting for the spin-lock the cache might have been populated
* with this entry and so we have to check again.
@@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
return NULL;
}
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
@@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
else
return NULL;
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2994c9222c9c..d1c9e4088646 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -401,7 +401,6 @@ void tcp_twsk_destructor(struct sock *sk)
#endif
tcp_ao_destroy_sock(sk, true);
}
-EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor);
void tcp_twsk_purge(struct list_head *net_exit_list)
{
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index be5c2294610e..e6612bd84d09 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -434,8 +434,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
inet_get_iif_sdif(skb, &iif, &sdif);
iph = skb_gro_network_header(skb);
net = dev_net_rcu(skb->dev);
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->saddr, th->source,
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
iif, sdif);
NAPI_GRO_CB(skb)->is_flist = !sk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index caf11920a878..e180364b8dda 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3578,9 +3578,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
sk_memory_allocated_add(sk, amt);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_charge_skmem(sk->sk_memcg, amt,
- gfp_memcg_charge() | __GFP_NOFAIL);
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
}
/* Send a FIN. The caller locks the socket for us.
@@ -3891,6 +3890,7 @@ static void tcp_connect_init(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+ u16 user_mss;
u32 rcv_wnd;
/* We'll fix this up when we get a response from the other end.
@@ -3903,8 +3903,9 @@ static void tcp_connect_init(struct sock *sk)
tcp_ao_connect_init(sk);
/* If user gave his TCP_MAXSEG, record it to clamp */
- if (tp->rx_opt.user_mss)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss)
+ tp->rx_opt.mss_clamp = user_mss;
tp->max_window = 0;
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
@@ -3955,7 +3956,7 @@ static void tcp_connect_init(struct sock *sk)
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
tcp_clear_retrans(tp);
}
@@ -4393,13 +4394,13 @@ void tcp_send_probe0(struct sock *sk)
if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_backoff = 0;
icsk->icsk_probes_tstamp = 0;
return;
}
- icsk->icsk_probes_out++;
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
if (err <= 0) {
if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
icsk->icsk_backoff++;
@@ -4437,7 +4438,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
tcp_sk_rw(sk)->total_retrans++;
}
trace_tcp_retransmit_synack(sk, req);
- req->num_retrans++;
+ WRITE_ONCE(req->num_retrans, req->num_retrans + 1);
}
return res;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a207877270fb..2dd73a4e8e51 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -392,7 +392,7 @@ static void tcp_probe_timer(struct sock *sk)
int max_probes;
if (tp->packets_out || !skb) {
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_probes_tstamp = 0;
return;
}
@@ -444,7 +444,7 @@ static void tcp_update_rto_stats(struct sock *sk)
tp->total_rto_recoveries++;
tp->rto_stamp = tcp_time_stamp_ms(tp);
}
- icsk->icsk_retransmits++;
+ WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1);
tp->total_rto++;
}
@@ -839,7 +839,7 @@ static void tcp_keepalive_timer(struct timer_list *t)
goto out;
}
if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
- icsk->icsk_probes_out++;
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
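
The icsk_probes_out, icsk_retransmits, user_mss and num_retrans stores in the tcp_input/tcp_output/tcp_timer hunks gain WRITE_ONCE() because procfs, diag and getsockopt now read those fields locklessly with READ_ONCE(); the pairing keeps the compiler from tearing, fusing or re-reading the accesses. A minimal model of the annotation, in its commonly cited simplified form:

    #include <stdio.h>

    /* Simplified forms of the kernel's accessors: force a single,
     * untorn access through a volatile-qualified lvalue. */
    #define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
    #define READ_ONCE(x)       (*(volatile const __typeof__(x) *)&(x))

    static int icsk_probes_out;     /* written by timer, read by /proc */

    int main(void)
    {
            WRITE_ONCE(icsk_probes_out, READ_ONCE(icsk_probes_out) + 1);
            printf("%d\n", READ_ONCE(icsk_probes_out));
            return 0;
    }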
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cc3ce0f762ec..cca41c569f37 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -68,7 +68,7 @@
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
* Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
- * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
+ * Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
* James Chapman : Add L2TP encapsulation type.
*/
@@ -509,7 +509,7 @@ rescore:
/* compute_score is too long of a function to be
* inlined, and calling it again here yields
- * measureable overhead for some
+ * measurable overhead for some
* workloads. Work around it by jumping
* backwards to rescore 'result'.
*/
@@ -1787,7 +1787,7 @@ uncharge_drop:
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
drop:
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
busylock_release(busy);
return err;
}
@@ -1852,7 +1852,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
IS_UDPLITE(sk));
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
IS_UDPLITE(sk));
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
__skb_unlink(skb, rcvq);
*total += skb->truesize;
kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
@@ -2008,7 +2008,7 @@ try_again:
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite);
__UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
goto try_again;
}
@@ -2078,7 +2078,7 @@ try_again:
if (unlikely(err)) {
if (!peeking) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
@@ -2449,7 +2449,7 @@ csum_error:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
@@ -2534,7 +2534,7 @@ start_lookup:
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP_INC_STATS(net, UDP_MIB_INERRORS,
@@ -2609,7 +2609,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return 0;
}
-/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
* return code conversion for ip layer consumption
*/
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
@@ -3386,7 +3386,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
int udp4_seq_show(struct seq_file *seq, void *v)
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 38cb3a28e4ed..6e491c720c90 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -16,9 +16,9 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
@@ -92,12 +92,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
- struct nlattr *bc;
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -130,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) {
spin_unlock_bh(&hslot->lock);
goto done;
}
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index fce945f23069..54386e06a813 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -4,6 +4,7 @@
#include <linux/socket.h>
#include <linux/kernel.h>
#include <net/dst_metadata.h>
+#include <net/flow.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
#include <net/inet_dscp.h>
@@ -253,7 +254,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
fl4.saddr = key->u.ipv4.src;
fl4.fl4_dport = dport;
fl4.fl4_sport = sport;
- fl4.flowi4_tos = tos & INET_DSCP_MASK;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(tos);
fl4.flowi4_flags = key->flow_flags;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 7fb6205619e7..58faf1ddd2b1 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -14,7 +14,7 @@
#include <linux/inetdevice.h>
#include <net/dst.h>
#include <net/xfrm.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/l3mdev.h>
@@ -25,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = params->daddr->a4;
- fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp);
+ fl4->flowi4_dscp = params->dscp;
fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net,
params->oif);
fl4->flowi4_mark = params->mark;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 1c9c686d9522..b8f9a8c0302e 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -304,10 +304,9 @@ config IPV6_SEG6_LWTUNNEL
config IPV6_SEG6_HMAC
bool "IPv6: Segment Routing HMAC support"
depends on IPV6
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_SHA1
- select CRYPTO_SHA256
+ select CRYPTO_LIB_SHA1
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_UTILS
help
Support for HMAC signature generation and verification
of SR-enabled packets.
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f17a5dd4789f..40e9c336f6c5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -7238,7 +7238,9 @@ static const struct ctl_table addrconf_sysctl[] = {
.data = &ipv6_devconf.rpl_seg_enabled,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "ioam6_enabled",
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index f8a8e46286b8..52599584422b 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -104,7 +104,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
rcu_read_lock();
rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
if (rt) {
- dev = dst_dev(&rt->dst);
+ dev = dst_dev_rcu(&rt->dst);
netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
ip6_rt_put(rt);
} else if (ishost) {
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 972bf0426d59..33ebe93d80e3 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -1068,5 +1068,5 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
0,
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 72adfc107b55..e75da98f5283 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -149,8 +149,8 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
dport = encap->encap_dport;
spin_unlock_bh(&x->lock);
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6,
- dport, &x->props.saddr.in6, ntohs(sport), 0, 0);
+ sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport,
+ &x->props.saddr.in6, ntohs(sport), 0, 0);
if (!sk)
return ERR_PTR(-ENOENT);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 44550957fd4e..56c974cf75d1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -209,7 +209,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
* this lookup should be more aggressive (not longer than timeout).
*/
dst = ip6_route_output(net, sk, fl6);
- dev = dst_dev(dst);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
if (dst->error) {
IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_OUTNOROUTES);
@@ -224,14 +225,12 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
- rcu_read_lock();
peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr);
res = inet_peer_xrlim_allow(peer, tmo);
- rcu_read_unlock();
}
+ rcu_read_unlock();
if (!res)
- __ICMP6_INC_STATS(net, ip6_dst_idev(dst),
- ICMP6_MIB_RATELIMITHOST);
+ __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST);
else
icmp_global_consume(net);
dst_release(dst);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 76ee521189eb..a3a9ea49fee2 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -47,24 +47,23 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn);
* The sockhash lock must be held as a reader here.
*/
struct sock *__inet6_lookup_established(const struct net *net,
- struct inet_hashinfo *hashinfo,
- const struct in6_addr *saddr,
- const __be16 sport,
- const struct in6_addr *daddr,
- const u16 hnum,
- const int dif, const int sdif)
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const u16 hnum,
+ const int dif, const int sdif)
{
- struct sock *sk;
- const struct hlist_nulls_node *node;
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & hashinfo->ehash_mask;
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
-
+ const struct hlist_nulls_node *node;
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
@@ -200,19 +199,20 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net,
EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup);
struct sock *inet6_lookup_listener(const struct net *net,
- struct inet_hashinfo *hashinfo,
- struct sk_buff *skb, int doff,
- const struct in6_addr *saddr,
- const __be16 sport, const struct in6_addr *daddr,
- const unsigned short hnum, const int dif, const int sdif)
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const unsigned short hnum,
+ const int dif, const int sdif)
{
struct inet_listen_hashbucket *ilb2;
+ struct inet_hashinfo *hashinfo;
struct sock *result = NULL;
unsigned int hash2;
/* Lookup redirect from BPF */
- if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
- hashinfo == net->ipv4.tcp_death_row.hashinfo) {
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
saddr, sport, daddr, hnum, dif,
inet6_ehashfn);
@@ -220,6 +220,7 @@ struct sock *inet6_lookup_listener(const struct net *net,
goto done;
}
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
@@ -244,7 +245,6 @@ done:
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
struct sock *inet6_lookup(const struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport,
@@ -253,7 +253,7 @@ struct sock *inet6_lookup(const struct net *net,
struct sock *sk;
bool refcounted;
- sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+ sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr,
ntohs(dport), dif, 0, &refcounted);
if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
@@ -305,8 +305,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
- if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_twsk_unique(sk, sk2, twp))
+ if (tcp_twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
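
With TCP the only remaining user, the IPv6 lookup helpers above drop their hashinfo argument and resolve net->ipv4.tcp_death_row.hashinfo internally, so every call site (tcp_ipv6, esp6, GRO, diag) loses a parameter. A sketch of deriving a per-namespace table from the context argument instead of threading it through each caller (illustrative types only):

    #include <stdio.h>

    struct hashtable { const char *name; };
    struct net { struct hashtable *tcp_hashinfo; };

    /* Before: struct sock *lookup(struct net *, struct hashtable *, ...);
     * After:  the helper fetches the table from @net itself. */
    static struct hashtable *lookup_table(const struct net *net)
    {
            return net->tcp_hashinfo;
    }

    int main(void)
    {
            struct hashtable ehash = { .name = "tcp_ehash" };
            struct net init_net = { .tcp_hashinfo = &ehash };

            printf("%s\n", lookup_table(&init_net)->name);
            return 0;
    }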
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 74d49dd6124d..c82a75510c0e 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -329,9 +329,9 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
return NULL;
- strscpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name);
} else {
- strcpy(name, "ip6gre%d");
+ strscpy(name, "ip6gre%d");
}
dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
ip6gre_tunnel_setup);
@@ -1469,7 +1469,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
tunnel = netdev_priv(dev);
tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (ret)
@@ -1529,7 +1529,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
tunnel->dev = dev;
tunnel->net = dev_net(dev);
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
tunnel->hlen = sizeof(struct ipv6hdr) + 4;
}
@@ -1842,7 +1842,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
tunnel = netdev_priv(dev);
tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (ret)
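
The strcpy() → strscpy() conversions in ip6_gre.c bound the copy by the destination array's size (the two-argument form infers it from the array type) and guarantee NUL termination. A model of the semantics:

    #include <stdio.h>
    #include <string.h>

    /* Model of strscpy(): copy at most size-1 bytes, always terminate,
     * return -E2BIG on truncation, else the copied length. */
    static long strscpy_model(char *dst, const char *src, size_t size)
    {
            size_t len = strnlen(src, size);

            if (len == size) {              /* would not fit */
                    if (size) {
                            memcpy(dst, src, size - 1);
                            dst[size - 1] = '\0';
                    }
                    return -7;              /* -E2BIG */
            }
            memcpy(dst, src, len + 1);
            return (long)len;
    }

    int main(void)
    {
            char name[16];                  /* IFNAMSIZ in the kernel */

            printf("%ld %s\n",
                   strscpy_model(name, "ip6gre%d", sizeof(name)), name);
            return 0;
    }

Unlike strcpy(), a too-long source cannot overrun parms.name; the caller sees the truncation in the return value instead.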
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1e1410237b6e..9d64c13bab5e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -60,7 +60,7 @@
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct net_device *dev = dst_dev(dst);
+ struct net_device *dev = dst_dev_rcu(dst);
struct inet6_dev *idev = ip6_dst_idev(dst);
unsigned int hh_len = LL_RESERVED_SPACE(dev);
const struct in6_addr *daddr, *nexthop;
@@ -70,15 +70,12 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
/* Be paranoid, rather than too clever. */
if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
- /* Make sure idev stays alive */
- rcu_read_lock();
+ /* idev stays alive because we hold rcu_read_lock(). */
skb = skb_expand_head(skb, hh_len);
if (!skb) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
- rcu_read_unlock();
return -ENOMEM;
}
- rcu_read_unlock();
}
hdr = ipv6_hdr(skb);
@@ -123,7 +120,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
- rcu_read_lock();
nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
@@ -131,7 +127,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
if (IS_ERR(neigh)) {
- rcu_read_unlock();
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
return -EINVAL;
@@ -139,7 +134,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
}
sock_confirm_neigh(skb, neigh);
ret = neigh_output(neigh, skb, false);
- rcu_read_unlock();
return ret;
}
@@ -233,22 +227,29 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct net_device *dev = dst_dev(dst), *indev = skb->dev;
- struct inet6_dev *idev = ip6_dst_idev(dst);
+ struct net_device *dev, *indev = skb->dev;
+ struct inet6_dev *idev;
+ int ret;
skb->protocol = htons(ETH_P_IPV6);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ idev = ip6_dst_idev(dst);
skb->dev = dev;
if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ rcu_read_unlock();
kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
return 0;
}
- return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- net, sk, skb, indev, dev,
- ip6_finish_output,
- !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ net, sk, skb, indev, dev,
+ ip6_finish_output,
+ !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(ip6_output);
@@ -268,35 +269,36 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
__u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
- struct net *net = sock_net(sk);
const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
- struct net_device *dev = dst_dev(dst);
struct inet6_dev *idev = ip6_dst_idev(dst);
struct hop_jumbo_hdr *hop_jumbo;
int hoplen = sizeof(*hop_jumbo);
+ struct net *net = sock_net(sk);
unsigned int head_room;
+ struct net_device *dev;
struct ipv6hdr *hdr;
u8 proto = fl6->flowi6_proto;
int seg_len = skb->len;
- int hlimit = -1;
+ int ret, hlimit = -1;
u32 mtu;
+ rcu_read_lock();
+
+ dev = dst_dev_rcu(dst);
head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
if (opt)
head_room += opt->opt_nflen + opt->opt_flen;
if (unlikely(head_room > skb_headroom(skb))) {
- /* Make sure idev stays alive */
- rcu_read_lock();
+ /* idev stays alive while we hold rcu_read_lock(). */
skb = skb_expand_head(skb, head_room);
if (!skb) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
- rcu_read_unlock();
- return -ENOBUFS;
+ ret = -ENOBUFS;
+ goto unlock;
}
- rcu_read_unlock();
}
if (opt) {
@@ -358,17 +360,21 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
* skb to its handler for processing
*/
skb = l3mdev_ip6_out((struct sock *)sk, skb);
- if (unlikely(!skb))
- return 0;
+ if (unlikely(!skb)) {
+ ret = 0;
+ goto unlock;
+ }
/* hooks should never assume socket lock is held.
* we promote our socket to non const
*/
- return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
- net, (struct sock *)sk, skb, NULL, dev,
- dst_output);
+ ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, (struct sock *)sk, skb, NULL, dev,
+ dst_output);
+ goto unlock;
}
+ ret = -EMSGSIZE;
skb->dev = dev;
/* ipv6_local_error() does not require socket lock,
* we promote our socket to non const
@@ -377,7 +383,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
- return -EMSGSIZE;
+unlock:
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(ip6_xmit);
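
A sketch (not kernel code) of the locking restructure applied in ip6_output() and ip6_xmit() above: instead of bracketing each dev/idev use with its own rcu_read_lock()/rcu_read_unlock() pair, the lock is taken once around the whole transmit path and every exit funnels through a single unlock label. Stub functions stand in for the RCU primitives:

#include <stdio.h>

static void rcu_read_lock_stub(void)   { /* begin read-side section */ }
static void rcu_read_unlock_stub(void) { /* end read-side section */ }

static int xmit_sketch(int headroom_ok, int hook_ok)
{
	int ret;

	rcu_read_lock_stub();		/* covers dev + idev lifetimes */

	if (!headroom_ok) {		/* e.g. skb_expand_head() failed */
		ret = -1;		/* -ENOBUFS in the real function */
		goto unlock;
	}
	if (!hook_ok) {			/* e.g. l3mdev hook consumed skb */
		ret = 0;
		goto unlock;
	}
	ret = 0;			/* NF_HOOK() result in the kernel */
unlock:
	rcu_read_unlock_stub();
	return ret;
}

int main(void)
{
	printf("%d\n", xmit_sketch(1, 1));
	return 0;
}
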
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 36ca27496b3c..016b572e7d6f 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -169,6 +169,29 @@ static int unsolicited_report_interval(struct inet6_dev *idev)
return iv > 0 ? iv : 1;
}
+static struct net_device *ip6_mc_find_dev(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
+{
+ struct net_device *dev = NULL;
+ struct rt6_info *rt;
+
+ if (ifindex == 0) {
+ rcu_read_lock();
+ rt = rt6_lookup(net, group, NULL, 0, NULL, 0);
+ if (rt) {
+ dev = dst_dev_rcu(&rt->dst);
+ dev_hold(dev);
+ ip6_rt_put(rt);
+ }
+ rcu_read_unlock();
+ } else {
+ dev = dev_get_by_index(net, ifindex);
+ }
+
+ return dev;
+}
+
/*
* socket join on multicast group
*/
@@ -191,28 +214,13 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
}
mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
-
if (!mc_lst)
return -ENOMEM;
mc_lst->next = NULL;
mc_lst->addr = *addr;
- if (ifindex == 0) {
- struct rt6_info *rt;
-
- rcu_read_lock();
- rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
- if (rt) {
- dev = dst_dev(&rt->dst);
- dev_hold(dev);
- ip6_rt_put(rt);
- }
- rcu_read_unlock();
- } else {
- dev = dev_get_by_index(net, ifindex);
- }
-
+ dev = ip6_mc_find_dev(net, addr, ifindex);
if (!dev) {
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return -ENODEV;
@@ -302,27 +310,14 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
}
EXPORT_SYMBOL(ipv6_sock_mc_drop);
-static struct inet6_dev *ip6_mc_find_dev(struct net *net,
- const struct in6_addr *group,
- int ifindex)
+static struct inet6_dev *ip6_mc_find_idev(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
{
- struct net_device *dev = NULL;
+ struct net_device *dev;
struct inet6_dev *idev;
- if (ifindex == 0) {
- struct rt6_info *rt;
-
- rcu_read_lock();
- rt = rt6_lookup(net, group, NULL, 0, NULL, 0);
- if (rt) {
- dev = dst_dev(&rt->dst);
- dev_hold(dev);
- ip6_rt_put(rt);
- }
- rcu_read_unlock();
- } else {
- dev = dev_get_by_index(net, ifindex);
- }
+ dev = ip6_mc_find_dev(net, group, ifindex);
if (!dev)
return NULL;
@@ -374,7 +369,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (!ipv6_addr_is_multicast(group))
return -EINVAL;
- idev = ip6_mc_find_dev(net, group, pgsr->gsr_interface);
+ idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface);
if (!idev)
return -ENODEV;
@@ -509,7 +504,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
gsf->gf_fmode != MCAST_EXCLUDE)
return -EINVAL;
- idev = ip6_mc_find_dev(net, group, gsf->gf_interface);
+ idev = ip6_mc_find_idev(net, group, gsf->gf_interface);
if (!idev)
return -ENODEV;
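
A toy model (hypothetical helpers) of the rule the new ip6_mc_find_dev() centralizes for both the join and source-filter paths: ifindex == 0 means "use the device a route lookup on the group address would pick", anything else is a direct index lookup:

#include <stdio.h>

static int dev_from_route(const char *group) { (void)group; return 7; }
static int dev_from_index(int ifindex)       { return ifindex; }

static int mc_find_dev_sketch(const char *group, int ifindex)
{
	return ifindex ? dev_from_index(ifindex) : dev_from_route(group);
}

int main(void)
{
	printf("%d %d\n", mc_find_dev_sketch("ff02::1", 0),
	       mc_find_dev_sketch("ff02::1", 3));
	return 0;
}
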
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7d5abb3158ec..f427e41e9c49 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -130,7 +130,7 @@ struct neigh_table nd_tbl = {
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
@@ -505,7 +505,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
- dev = dst_dev(dst);
+ dev = dst_dev_rcu(dst);
idev = __in6_dev_get(dev);
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 45f9105f9ac1..46540a5a4331 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -63,7 +63,10 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff
#ifdef CONFIG_XFRM
if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
- skb_dst_set(skb, NULL);
+ /* Ignore the return value from skb_dstref_steal(); xfrm_lookup()
+ * takes care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index cb2d38e80de9..6b022449f867 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -12,6 +12,19 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
+static struct ipv6hdr *
+nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit);
+static void
+nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen);
+static const struct tcphdr *
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook);
+
static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
{
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
@@ -146,9 +159,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach);
-const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *otcph,
- unsigned int *otcplen, int hook)
+static const struct tcphdr *
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook)
{
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
u8 proto;
@@ -192,11 +206,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
return otcph;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get);
-struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int hoplimit)
+static struct ipv6hdr *
+nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit)
{
struct ipv6hdr *ip6h;
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
@@ -216,11 +230,11 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
return ip6h;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put);
-void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- const struct tcphdr *oth, unsigned int otcplen)
+static void
+nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen)
{
struct tcphdr *tcph;
@@ -248,7 +262,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
csum_partial(tcph,
sizeof(struct tcphdr), 0));
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put);
static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
{
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index 9ea5ef56cb27..ced8bd44828e 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -83,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo,
- skb, doff, saddr, sport, daddr, dport,
+ return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp6_lib_lookup(net, saddr, sport, daddr, dport,
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index 52f828bb5a83..b2f59ed9d7cc 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -80,7 +80,6 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
- struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
@@ -94,7 +93,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet6_lookup_listener(net, hinfo, skb,
+ sk = inet6_lookup_listener(net, skb,
thoff + __tcp_hdrlen(hp),
saddr, sport,
daddr, ntohs(dport),
@@ -109,7 +108,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr,
+ sk = __inet6_lookup_established(net, saddr, sport, daddr,
ntohs(dport), in->ifindex, 0);
break;
default:
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index d21fe27fe21e..1c9b283a4132 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -104,18 +104,20 @@ EXPORT_SYMBOL(ip6_find_1stfragopt);
int ip6_dst_hoplimit(struct dst_entry *dst)
{
int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+
+ rcu_read_lock();
if (hoplimit == 0) {
- struct net_device *dev = dst_dev(dst);
+ struct net_device *dev = dst_dev_rcu(dst);
struct inet6_dev *idev;
- rcu_read_lock();
idev = __in6_dev_get(dev);
if (idev)
hoplimit = READ_ONCE(idev->cnf.hop_limit);
else
hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
- rcu_read_unlock();
}
+ rcu_read_unlock();
+
return hoplimit;
}
EXPORT_SYMBOL(ip6_dst_hoplimit);
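
A sketch of the hop-limit fallback chain ip6_dst_hoplimit() implements, now evaluated entirely under one RCU section: a per-route metric wins; otherwise the per-device setting; otherwise the namespace-wide default. Plain C model with illustrative values:

#include <stdio.h>

static int hoplimit_sketch(int route_metric, int has_idev,
			   int dev_conf, int net_default)
{
	if (route_metric != 0)	/* RTAX_HOPLIMIT metric present on route */
		return route_metric;
	return has_idev ? dev_conf : net_default;
}

int main(void)
{
	printf("%d\n", hoplimit_sketch(0, 1, 64, 255));	/* -> 64 */
	return 0;
}
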
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 82b0492923d4..d7a2cdaa2631 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -208,7 +208,6 @@ struct proto pingv6_prot = {
.recvmsg = ping_recvmsg,
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
- .hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
.put_port = ping_unhash,
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 752327b10dde..73296f38c252 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -85,7 +85,6 @@ static const struct snmp_mib snmp6_ipstats_list[] = {
SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp6_icmp6_list[] = {
@@ -95,30 +94,10 @@ static const struct snmp_mib snmp6_icmp6_list[] = {
SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS),
SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS),
+/* ICMP6_MIB_RATELIMITHOST needs to be last, see snmp6_dev_seq_show(). */
SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST),
- SNMP_MIB_SENTINEL
};
-/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
-static const char *const icmp6type2name[256] = {
- [ICMPV6_DEST_UNREACH] = "DestUnreachs",
- [ICMPV6_PKT_TOOBIG] = "PktTooBigs",
- [ICMPV6_TIME_EXCEED] = "TimeExcds",
- [ICMPV6_PARAMPROB] = "ParmProblems",
- [ICMPV6_ECHO_REQUEST] = "Echos",
- [ICMPV6_ECHO_REPLY] = "EchoReplies",
- [ICMPV6_MGM_QUERY] = "GroupMembQueries",
- [ICMPV6_MGM_REPORT] = "GroupMembResponses",
- [ICMPV6_MGM_REDUCTION] = "GroupMembReductions",
- [ICMPV6_MLD2_REPORT] = "MLDv2Reports",
- [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements",
- [NDISC_ROUTER_SOLICITATION] = "RouterSolicits",
- [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements",
- [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits",
- [NDISC_REDIRECT] = "Redirects",
-};
-
-
static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
@@ -129,7 +108,6 @@ static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI),
SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp6_udplite6_list[] = {
@@ -141,7 +119,6 @@ static const struct snmp_mib snmp6_udplite6_list[] = {
SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS),
- SNMP_MIB_SENTINEL
};
static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
@@ -151,11 +128,31 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
/* print by name -- deprecated items */
for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+ const char *p = NULL;
int icmptype;
- const char *p;
+
+#define CASE(TYP, STR) case TYP: p = STR; break;
icmptype = i & 0xff;
- p = icmp6type2name[icmptype];
+ switch (icmptype) {
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
+ CASE(ICMPV6_DEST_UNREACH, "DestUnreachs")
+ CASE(ICMPV6_PKT_TOOBIG, "PktTooBigs")
+ CASE(ICMPV6_TIME_EXCEED, "TimeExcds")
+ CASE(ICMPV6_PARAMPROB, "ParmProblems")
+ CASE(ICMPV6_ECHO_REQUEST, "Echos")
+ CASE(ICMPV6_ECHO_REPLY, "EchoReplies")
+ CASE(ICMPV6_MGM_QUERY, "GroupMembQueries")
+ CASE(ICMPV6_MGM_REPORT, "GroupMembResponses")
+ CASE(ICMPV6_MGM_REDUCTION, "GroupMembReductions")
+ CASE(ICMPV6_MLD2_REPORT, "MLDv2Reports")
+ CASE(NDISC_ROUTER_ADVERTISEMENT, "RouterAdvertisements")
+ CASE(NDISC_ROUTER_SOLICITATION, "RouterSolicits")
+ CASE(NDISC_NEIGHBOUR_ADVERTISEMENT, "NeighborAdvertisements")
+ CASE(NDISC_NEIGHBOUR_SOLICITATION, "NeighborSolicits")
+ CASE(NDISC_REDIRECT, "Redirects")
+ }
+#undef CASE
if (!p) /* don't print un-named types here */
continue;
snprintf(name, sizeof(name), "Icmp6%s%s",
@@ -182,35 +179,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
*/
static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
atomic_long_t *smib,
- const struct snmp_mib *itemlist)
+ const struct snmp_mib *itemlist,
+ int cnt)
{
unsigned long buff[SNMP_MIB_MAX];
int i;
if (pcpumib) {
- memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
+ memset(buff, 0, sizeof(unsigned long) * cnt);
- snmp_get_cpu_field_batch(buff, itemlist, pcpumib);
- for (i = 0; itemlist[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%lu\n",
itemlist[i].name, buff[i]);
} else {
- for (i = 0; itemlist[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
atomic_long_read(smib + itemlist[i].entry));
}
}
static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
- const struct snmp_mib *itemlist, size_t syncpoff)
+ const struct snmp_mib *itemlist,
+ int cnt, size_t syncpoff)
{
u64 buff64[SNMP_MIB_MAX];
int i;
- memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX);
+ memset(buff64, 0, sizeof(u64) * cnt);
- snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
- for (i = 0; itemlist[i].name; i++)
+ snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]);
}
@@ -219,14 +218,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
struct net *net = (struct net *)seq->private;
snmp6_seq_show_item64(seq, net->mib.ipv6_statistics,
- snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+ snmp6_ipstats_list,
+ ARRAY_SIZE(snmp6_ipstats_list),
+ offsetof(struct ipstats_mib, syncp));
snmp6_seq_show_item(seq, net->mib.icmpv6_statistics,
- NULL, snmp6_icmp6_list);
+ NULL, snmp6_icmp6_list,
+ ARRAY_SIZE(snmp6_icmp6_list));
snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs);
snmp6_seq_show_item(seq, net->mib.udp_stats_in6,
- NULL, snmp6_udp6_list);
+ NULL, snmp6_udp6_list,
+ ARRAY_SIZE(snmp6_udp6_list));
snmp6_seq_show_item(seq, net->mib.udplite_stats_in6,
- NULL, snmp6_udplite6_list);
+ NULL, snmp6_udplite6_list,
+ ARRAY_SIZE(snmp6_udplite6_list));
return 0;
}
@@ -236,9 +240,14 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
snmp6_seq_show_item64(seq, idev->stats.ipv6,
- snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+ snmp6_ipstats_list,
+ ARRAY_SIZE(snmp6_ipstats_list),
+ offsetof(struct ipstats_mib, syncp));
+
+ /* Per-idev ICMP stats do not include ICMP6_MIB_RATELIMITHOST. */
snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs,
- snmp6_icmp6_list);
+ snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list) - 1);
+
snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs);
return 0;
}
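
A toy model of the sentinel-to-count conversion above: the MIB lists drop SNMP_MIB_SENTINEL and callers pass ARRAY_SIZE() instead, which also lets the per-device path pass ARRAY_SIZE() - 1 to cheaply exclude the trailing entry (here, Icmp6OutRateLimitHost):

#include <stdio.h>

struct mib_item { const char *name; };

static const struct mib_item list[] = {
	{ "Icmp6InMsgs" },
	{ "Icmp6OutMsgs" },
	{ "Icmp6OutRateLimitHost" },	/* must stay last */
};

static void show(const struct mib_item *items, int cnt)
{
	for (int i = 0; i < cnt; i++)
		printf("%s\n", items[i].name);
}

int main(void)
{
	show(list, sizeof(list) / sizeof(list[0]));	/* all entries */
	show(list, sizeof(list) / sizeof(list[0]) - 1);	/* skip last */
	return 0;
}
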
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4c3f8245c40f..4ae07a67b4d4 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -163,7 +163,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
if (atomic_read(&sk->sk_rmem_alloc) >=
READ_ONCE(sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
continue;
}
@@ -361,7 +361,7 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
skb_checksum_complete(skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
@@ -389,7 +389,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
struct raw6_sock *rp = raw6_sk(sk);
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
@@ -414,7 +414,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (inet_test_bit(HDRINCL, sk)) {
if (skb_checksum_complete(skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
@@ -1175,6 +1175,7 @@ static int rawv6_init_sk(struct sock *sk)
{
struct raw6_sock *rp = raw6_sk(sk);
+ sk->sk_drop_counters = &rp->drop_counters;
switch (inet_sk(sk)->inet_num) {
case IPPROTO_ICMPV6:
rp->checksum = 1;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3299cfa12e21..3371f16b7a3e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2943,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (res.f6i->nh) {
struct fib6_nh_match_arg arg = {
- .dev = dst_dev(dst),
+ .dev = dst_dev_rcu(dst),
.gw = &rt6->rt6i_gateway,
};
@@ -3238,7 +3238,6 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
- struct net_device *dev = dst_dev(dst);
unsigned int mtu = dst_mtu(dst);
struct net *net;
@@ -3246,7 +3245,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
rcu_read_lock();
- net = dev_net_rcu(dev);
+ net = dst_dev_net_rcu(dst);
if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
@@ -4301,7 +4300,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
if (res.f6i->nh) {
struct fib6_nh_match_arg arg = {
- .dev = dst_dev(dst),
+ .dev = dst_dev_rcu(dst),
.gw = &rt->rt6i_gateway,
};
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 180da19c148c..a5c4c629b788 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -522,16 +522,10 @@ int __init seg6_init(void)
if (err)
goto out_unregister_iptun;
- err = seg6_hmac_init();
- if (err)
- goto out_unregister_seg6;
-
pr_info("Segment Routing with IPv6\n");
out:
return err;
-out_unregister_seg6:
- seg6_local_exit();
out_unregister_iptun:
seg6_iptunnel_exit();
out_unregister_genl:
@@ -543,7 +537,6 @@ out_unregister_pernet:
void seg6_exit(void)
{
- seg6_hmac_exit();
seg6_local_exit();
seg6_iptunnel_exit();
genl_unregister_family(&seg6_genl_family);
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index fd58426f222b..ee6bac0160ac 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -16,7 +16,6 @@
#include <linux/in6.h>
#include <linux/icmpv6.h>
#include <linux/mroute6.h>
-#include <linux/slab.h>
#include <linux/rhashtable.h>
#include <linux/netfilter.h>
@@ -34,7 +33,8 @@
#include <net/addrconf.h>
#include <net/xfrm.h>
-#include <crypto/hash.h>
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
#include <crypto/utils.h>
#include <net/seg6.h>
#include <net/genetlink.h>
@@ -78,17 +78,6 @@ static const struct rhashtable_params rht_params = {
.obj_cmpfn = seg6_hmac_cmpfn,
};
-static struct seg6_hmac_algo hmac_algos[] = {
- {
- .alg_id = SEG6_HMAC_ALGO_SHA1,
- .name = "hmac(sha1)",
- },
- {
- .alg_id = SEG6_HMAC_ALGO_SHA256,
- .name = "hmac(sha256)",
- },
-};
-
static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
{
struct sr6_tlv_hmac *tlv;
@@ -108,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
return tlv;
}
-static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id)
-{
- struct seg6_hmac_algo *algo;
- int i, alg_count;
-
- alg_count = ARRAY_SIZE(hmac_algos);
- for (i = 0; i < alg_count; i++) {
- algo = &hmac_algos[i];
- if (algo->alg_id == alg_id)
- return algo;
- }
-
- return NULL;
-}
-
-static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize,
- u8 *output, int outlen)
-{
- struct seg6_hmac_algo *algo;
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int ret, dgsize;
-
- algo = __hmac_get_algo(hinfo->alg_id);
- if (!algo)
- return -ENOENT;
-
- tfm = *this_cpu_ptr(algo->tfms);
-
- dgsize = crypto_shash_digestsize(tfm);
- if (dgsize > outlen) {
- pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n",
- dgsize, outlen);
- return -ENOMEM;
- }
-
- ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen);
- if (ret < 0) {
- pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret);
- goto failed;
- }
-
- shash = *this_cpu_ptr(algo->shashs);
- shash->tfm = tfm;
-
- ret = crypto_shash_digest(shash, text, psize, output);
- if (ret < 0) {
- pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret);
- goto failed;
- }
-
- return dgsize;
-
-failed:
- return ret;
-}
-
int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
struct in6_addr *saddr, u8 *output)
{
__be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
- u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE];
- int plen, i, dgsize, wrsize;
+ int plen, i, ret = 0;
char *ring, *off;
- /* a 160-byte buffer for digest output allows to store highest known
- * hash function (RadioGatun) with up to 1216 bits
- */
-
/* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
@@ -219,22 +146,25 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
off += 16;
}
- dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
- SEG6_HMAC_MAX_DIGESTSIZE);
+ switch (hinfo->alg_id) {
+ case SEG6_HMAC_ALGO_SHA1:
+ hmac_sha1(&hinfo->key.sha1, ring, plen, output);
+ static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE);
+ memset(&output[SHA1_DIGEST_SIZE], 0,
+ SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE);
+ break;
+ case SEG6_HMAC_ALGO_SHA256:
+ hmac_sha256(&hinfo->key.sha256, ring, plen, output);
+ static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ break;
+ }
local_unlock_nested_bh(&hmac_storage.bh_lock);
local_bh_enable();
-
- if (dgsize < 0)
- return dgsize;
-
- wrsize = SEG6_HMAC_FIELD_LEN;
- if (wrsize > dgsize)
- wrsize = dgsize;
-
- memset(output, 0, SEG6_HMAC_FIELD_LEN);
- memcpy(output, tmp_out, wrsize);
-
- return 0;
+ return ret;
}
EXPORT_SYMBOL(seg6_hmac_compute);
@@ -305,8 +235,18 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
struct seg6_pernet_data *sdata = seg6_pernet(net);
int err;
- if (!__hmac_get_algo(hinfo->alg_id))
+ switch (hinfo->alg_id) {
+ case SEG6_HMAC_ALGO_SHA1:
+ hmac_sha1_preparekey(&hinfo->key.sha1,
+ hinfo->secret, hinfo->slen);
+ break;
+ case SEG6_HMAC_ALGO_SHA256:
+ hmac_sha256_preparekey(&hinfo->key.sha256,
+ hinfo->secret, hinfo->slen);
+ break;
+ default:
return -EINVAL;
+ }
err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
rht_params);
@@ -363,65 +303,6 @@ out:
}
EXPORT_SYMBOL(seg6_push_hmac);
-static int seg6_hmac_init_algo(void)
-{
- struct seg6_hmac_algo *algo;
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int i, alg_count, cpu;
- int ret = -ENOMEM;
-
- alg_count = ARRAY_SIZE(hmac_algos);
-
- for (i = 0; i < alg_count; i++) {
- struct crypto_shash **p_tfm;
- int shsize;
-
- algo = &hmac_algos[i];
- algo->tfms = alloc_percpu(struct crypto_shash *);
- if (!algo->tfms)
- goto error_out;
-
- for_each_possible_cpu(cpu) {
- tfm = crypto_alloc_shash(algo->name, 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto error_out;
- }
- p_tfm = per_cpu_ptr(algo->tfms, cpu);
- *p_tfm = tfm;
- }
-
- p_tfm = raw_cpu_ptr(algo->tfms);
- tfm = *p_tfm;
-
- shsize = sizeof(*shash) + crypto_shash_descsize(tfm);
-
- algo->shashs = alloc_percpu(struct shash_desc *);
- if (!algo->shashs)
- goto error_out;
-
- for_each_possible_cpu(cpu) {
- shash = kzalloc_node(shsize, GFP_KERNEL,
- cpu_to_node(cpu));
- if (!shash)
- goto error_out;
- *per_cpu_ptr(algo->shashs, cpu) = shash;
- }
- }
-
- return 0;
-
-error_out:
- seg6_hmac_exit();
- return ret;
-}
-
-int __init seg6_hmac_init(void)
-{
- return seg6_hmac_init_algo();
-}
-
int __net_init seg6_hmac_net_init(struct net *net)
{
struct seg6_pernet_data *sdata = seg6_pernet(net);
@@ -429,36 +310,6 @@ int __net_init seg6_hmac_net_init(struct net *net)
return rhashtable_init(&sdata->hmac_infos, &rht_params);
}
-void seg6_hmac_exit(void)
-{
- struct seg6_hmac_algo *algo = NULL;
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int i, alg_count, cpu;
-
- alg_count = ARRAY_SIZE(hmac_algos);
- for (i = 0; i < alg_count; i++) {
- algo = &hmac_algos[i];
-
- if (algo->shashs) {
- for_each_possible_cpu(cpu) {
- shash = *per_cpu_ptr(algo->shashs, cpu);
- kfree(shash);
- }
- free_percpu(algo->shashs);
- }
-
- if (algo->tfms) {
- for_each_possible_cpu(cpu) {
- tfm = *per_cpu_ptr(algo->tfms, cpu);
- crypto_free_shash(tfm);
- }
- free_percpu(algo->tfms);
- }
- }
-}
-EXPORT_SYMBOL(seg6_hmac_exit);
-
void __net_exit seg6_hmac_net_exit(struct net *net)
{
struct seg6_pernet_data *sdata = seg6_pernet(net);
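
A toy model of the split introduced above (stand-in types, not the kernel lib API): the secret is expanded into a prepared key once, when the HMAC info is added, and every per-packet computation then reuses it, which is what lets the per-cpu crypto_shash machinery be deleted:

#include <stdio.h>
#include <string.h>

struct prepared_key { unsigned char expanded[64]; };

/* one-time, slow path: derive whatever the compute step needs */
static void preparekey_demo(struct prepared_key *k,
			    const char *secret, size_t slen)
{
	memset(k->expanded, 0, sizeof(k->expanded));
	memcpy(k->expanded, secret, slen < 64 ? slen : 64);
}

/* per-packet, fast path: only reads the prepared key */
static unsigned char compute_demo(const struct prepared_key *k,
				  const char *msg, size_t len)
{
	unsigned char acc = 0;

	for (size_t i = 0; i < len; i++)
		acc ^= (unsigned char)msg[i] ^ k->expanded[i % 64];
	return acc;	/* placeholder mix, not a real MAC */
}

int main(void)
{
	struct prepared_key k;

	preparekey_demo(&k, "secret", 6);
	printf("%02x\n", compute_demo(&k, "segment list", 12));
	return 0;
}
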
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 12496ba1b7d4..cf37ad9686e6 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -848,6 +848,49 @@ static inline __be32 try_6rd(struct ip_tunnel *tunnel,
return dst;
}
+static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst,
+ bool is_isatap)
+{
+ const struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ struct neighbour *neigh = NULL;
+ const struct in6_addr *addr6;
+ bool found = false;
+ int addr_type;
+
+ if (skb_dst(skb))
+ neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
+
+ if (!neigh) {
+ net_dbg_ratelimited("nexthop == NULL\n");
+ return false;
+ }
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (is_isatap) {
+ if ((addr_type & IPV6_ADDR_UNICAST) &&
+ ipv6_addr_is_isatap(addr6)) {
+ *dst = addr6->s6_addr32[3];
+ found = true;
+ }
+ } else {
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) != 0) {
+ *dst = addr6->s6_addr32[3];
+ found = true;
+ }
+ }
+
+ neigh_release(neigh);
+
+ return found;
+}
+
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
@@ -867,8 +910,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
__be32 dst = tiph->daddr;
struct flowi4 fl4;
int mtu;
- const struct in6_addr *addr6;
- int addr_type;
u8 ttl;
u8 protocol = IPPROTO_IPV6;
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
@@ -877,64 +918,15 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
tos = ipv6_get_dsfield(iph6);
/* ISATAP (RFC4214) - must come before 6to4 */
- if (dev->priv_flags & IFF_ISATAP) {
- struct neighbour *neigh = NULL;
- bool do_tx_error = false;
-
- if (skb_dst(skb))
- neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
-
- if (!neigh) {
- net_dbg_ratelimited("nexthop == NULL\n");
- goto tx_error;
- }
-
- addr6 = (const struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
-
- if ((addr_type & IPV6_ADDR_UNICAST) &&
- ipv6_addr_is_isatap(addr6))
- dst = addr6->s6_addr32[3];
- else
- do_tx_error = true;
-
- neigh_release(neigh);
- if (do_tx_error)
- goto tx_error;
- }
+ if ((dev->priv_flags & IFF_ISATAP) &&
+ !ipip6_tunnel_dst_find(skb, &dst, true))
+ goto tx_error;
if (!dst)
dst = try_6rd(tunnel, &iph6->daddr);
- if (!dst) {
- struct neighbour *neigh = NULL;
- bool do_tx_error = false;
-
- if (skb_dst(skb))
- neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
-
- if (!neigh) {
- net_dbg_ratelimited("nexthop == NULL\n");
- goto tx_error;
- }
-
- addr6 = (const struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
-
- if (addr_type == IPV6_ADDR_ANY) {
- addr6 = &ipv6_hdr(skb)->daddr;
- addr_type = ipv6_addr_type(addr6);
- }
-
- if ((addr_type & IPV6_ADDR_COMPATv4) != 0)
- dst = addr6->s6_addr32[3];
- else
- do_tx_error = true;
-
- neigh_release(neigh);
- if (do_tx_error)
- goto tx_error;
- }
+ if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false))
+ goto tx_error;
flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark,
tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e885629312a4..0562e939b2e3 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -388,8 +388,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bool fatal;
int err;
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- &hdr->daddr, th->dest,
+ sk = __inet6_lookup_established(net, &hdr->daddr, th->dest,
&hdr->saddr, ntohs(th->source),
skb->dev->ifindex, inet6_sdif(skb));
@@ -1073,8 +1072,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- NULL, 0, &ipv6h->saddr, th->source,
+ sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source,
&ipv6h->daddr, ntohs(th->source),
dif, sdif);
if (!sk1)
@@ -1787,7 +1785,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
lookup:
- sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th),
+ sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th),
th->source, th->dest, inet6_iif(skb), sdif,
&refcounted);
if (!sk)
@@ -1809,7 +1807,7 @@ lookup:
&hdr->saddr, &hdr->daddr,
AF_INET6, dif, sdif);
if (drop_reason) {
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -1948,7 +1946,7 @@ discard_it:
return 0;
discard_and_relse:
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
@@ -1974,8 +1972,7 @@ do_time_wait:
{
struct sock *sk2;
- sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th),
+ sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
ntohs(th->dest),
@@ -2027,8 +2024,7 @@ void tcp_v6_early_demux(struct sk_buff *skb)
return;
/* Note : We use inet6_iif() here, not tcp_v6_iif() */
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- &hdr->saddr, th->source,
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
inet6_iif(skb), inet6_sdif(skb));
if (sk) {
@@ -2048,7 +2044,6 @@ void tcp_v6_early_demux(struct sk_buff *skb)
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp6_timewait_sock),
- .twsk_destructor = tcp_twsk_destructor,
};
INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
@@ -2228,9 +2223,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
+ READ_ONCE(icsk->icsk_retransmits),
from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
- icsk->icsk_probes_out,
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
jiffies_to_clock_t(icsk->icsk_rto),
@@ -2356,7 +2351,6 @@ struct proto tcpv6_prot = {
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
.memory_pressure = &tcp_memory_pressure,
- .orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index a8a04f441e78..effeba58630b 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -36,8 +36,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
inet6_get_iif_sdif(skb, &iif, &sdif);
hdr = skb_gro_network_header(skb);
net = dev_net_rcu(skb->dev);
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- &hdr->saddr, th->source,
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
iif, sdif);
NAPI_GRO_CB(skb)->is_flist = !sk;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6a68f77da44b..a35ee6d693a8 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -524,7 +524,7 @@ try_again:
}
if (unlikely(err)) {
if (!peeking) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
kfree_skb(skb);
@@ -908,7 +908,7 @@ csum_error:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
@@ -1013,7 +1013,7 @@ start_lookup:
}
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
__UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP6_INC_STATS(net, UDP_MIB_INERRORS,
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index cc2b3c44bc05..6c717a7ef292 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1187,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
IUCV_SKB_CB(skb)->offset = 0;
if (sk_filter(sk, skb)) {
- atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ sk_drops_inc(sk); /* skb rejected by filter */
kfree_skb(skb);
return;
}
@@ -2011,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
skb_reset_network_header(skb);
IUCV_SKB_CB(skb)->offset = 0;
if (sk_filter(sk, skb)) {
- atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ sk_drops_inc(sk); /* skb rejected by filter */
kfree_skb(skb);
return NET_RX_SUCCESS;
}
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 685524800d70..b99ba14f39d2 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -256,7 +256,7 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
skb_reserve(skb, hlen);
- /* set type as fist byte in payload */
+ /* set type as first byte in payload */
*(u8 *)skb_put(skb, 1) = addr->smctp_type;
rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len);
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index b08ba959ac4f..31948e18d97d 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -22,7 +22,6 @@
#include <linux/kernel.h>
#include <crypto/sha2.h>
-#include <linux/unaligned.h>
#include "protocol.h"
@@ -43,39 +42,9 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
{
- u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
- u8 key1be[8];
- u8 key2be[8];
- int i;
+ __be64 key[2] = { cpu_to_be64(key1), cpu_to_be64(key2) };
- if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
- len = SHA256_DIGEST_SIZE;
-
- put_unaligned_be64(key1, key1be);
- put_unaligned_be64(key2, key2be);
-
- /* Generate key xored with ipad */
- memset(input, 0x36, SHA256_BLOCK_SIZE);
- for (i = 0; i < 8; i++)
- input[i] ^= key1be[i];
- for (i = 0; i < 8; i++)
- input[i + 8] ^= key2be[i];
-
- memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
-
- /* emit sha256(K1 || msg) on the second input block, so we can
- * reuse 'input' for the last hashing
- */
- sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]);
-
- /* Prepare second part of hmac */
- memset(input, 0x5C, SHA256_BLOCK_SIZE);
- for (i = 0; i < 8; i++)
- input[i] ^= key1be[i];
- for (i = 0; i < 8; i++)
- input[i + 8] ^= key2be[i];
-
- sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac);
+ hmac_sha256_usingrawkey((const u8 *)key, sizeof(key), msg, len, hmac);
}
#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST)
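
A userspace analogue (OpenSSL, assumed available) of the simplification above: the removed code built HMAC by hand from two sha256() passes over ipad/opad blocks; a single library call computes the same MAC over the two big-endian keys concatenated. Illustrative only, not kernel code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>

int main(void)
{
	uint64_t key1 = 0x0102030405060708ULL, key2 = 0x1112131415161718ULL;
	uint8_t key[16], mac[32];
	const uint8_t msg[] = "token material";
	unsigned int maclen = 0;

	/* big-endian serialization, matching cpu_to_be64() in the patch */
	for (int i = 0; i < 8; i++) {
		key[i] = key1 >> (56 - 8 * i);
		key[8 + i] = key2 >> (56 - 8 * i);
	}
	HMAC(EVP_sha256(), key, sizeof(key), msg, sizeof(msg) - 1,
	     mac, &maclen);
	printf("maclen=%u first=%02x\n", maclen, mac[0]);
	return 0;
}
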
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index cf879c188ca2..6003e47c770a 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -85,7 +85,6 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK),
SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
- SNMP_MIB_SENTINEL
};
/* mptcp_mib_alloc - allocate percpu mib counters
@@ -108,22 +107,23 @@ bool mptcp_mib_alloc(struct net *net)
void mptcp_seq_show(struct seq_file *seq)
{
- unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1];
+ unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)];
+ const int cnt = ARRAY_SIZE(mptcp_snmp_list);
struct net *net = seq->private;
int i;
seq_puts(seq, "MPTcpExt:");
- for (i = 0; mptcp_snmp_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %s", mptcp_snmp_list[i].name);
seq_puts(seq, "\nMPTcpExt:");
memset(sum, 0, sizeof(sum));
if (net->mib.mptcp_statistics)
- snmp_get_cpu_field_batch(sum, mptcp_snmp_list,
- net->mib.mptcp_statistics);
+ snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt,
+ net->mib.mptcp_statistics);
- for (i = 0; mptcp_snmp_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %lu", sum[i]);
seq_putc(seq, '\n');
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 0566dd793810..ac974299de71 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -15,9 +15,9 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
@@ -76,9 +76,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
const struct inet_diag_req_v2 *r,
bool net_admin)
{
- struct inet_diag_dump_data *cb_data = cb->data;
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
- struct nlattr *bc = cb_data->inet_diag_nla_bc;
struct net *net = sock_net(skb->sk);
struct inet_hashinfo *hinfo;
int i;
@@ -121,7 +119,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
if (!refcount_inc_not_zero(&sk->sk_refcnt))
goto next_listen;
- ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+ ret = sk_diag_dump(sk, skb, cb, r, net_admin);
sock_put(sk);
@@ -154,15 +152,10 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
struct mptcp_sock *msk;
- struct nlattr *bc;
BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx));
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
-
while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot,
&diag_ctx->s_num)) != NULL) {
struct inet_sock *inet = (struct inet_sock *)msk;
@@ -181,7 +174,7 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
r->id.idiag_dport)
goto next;
- ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+ ret = sk_diag_dump(sk, skb, cb, r, net_admin);
next:
sock_put(sk);
if (ret < 0) {
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 136a380602ca..204e1f61212e 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -268,6 +268,27 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
return -EINVAL;
}
+static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk)
+{
+ const struct net *net = sock_net((struct sock *)msk);
+ unsigned int rto = mptcp_get_add_addr_timeout(net);
+ struct mptcp_subflow_context *subflow;
+ unsigned int max = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct inet_connection_sock *icsk = inet_csk(ssk);
+
+ if (icsk->icsk_rto > max)
+ max = icsk->icsk_rto;
+ }
+
+ if (max && max < rto)
+ rto = max;
+
+ return rto;
+}
+
static void mptcp_pm_add_timer(struct timer_list *timer)
{
struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer,
@@ -292,7 +313,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
goto out;
}
- timeout = mptcp_get_add_addr_timeout(sock_net(sk));
+ timeout = mptcp_adjust_add_addr_timeout(msk);
if (!timeout)
goto out;
@@ -307,7 +328,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
sk_reset_timer(sk, timer,
- jiffies + timeout);
+ jiffies + (timeout << entry->retrans_times));
spin_unlock_bh(&msk->pm.lock);
@@ -348,7 +369,6 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
{
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
- struct net *net = sock_net(sk);
unsigned int timeout;
lockdep_assert_held(&msk->pm.lock);
@@ -374,7 +394,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
reset_timer:
- timeout = mptcp_get_add_addr_timeout(net);
+ timeout = mptcp_adjust_add_addr_timeout(msk);
if (timeout)
sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);
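
A sketch of the retransmission schedule change above: the base ADD_ADDR timeout is clamped to the largest subflow RTO (when subflows exist) and then doubled per retransmission via 'timeout << retrans_times'. Toy values:

#include <stdio.h>

static unsigned int adjust_timeout(unsigned int sysctl_rto,
				   const unsigned int *subflow_rto, int n)
{
	unsigned int max = 0;

	for (int i = 0; i < n; i++)
		if (subflow_rto[i] > max)
			max = subflow_rto[i];
	return (max && max < sysctl_rto) ? max : sysctl_rto;
}

int main(void)
{
	unsigned int rtos[] = { 200, 350 };	/* per-subflow icsk_rto */
	unsigned int base = adjust_timeout(1000, rtos, 2);

	for (unsigned int n = 0; n < 3; n++)	/* retrans_times = 0,1,2 */
		printf("attempt %u fires after %u\n", n, base << n);
	return 0;
}
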
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e6fd97b21e9e..7e9eb0ab21c3 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -12,6 +12,7 @@
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/aligned_data.h>
+#include <net/rps.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -137,7 +138,7 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
__kfree_skb(skb);
}
@@ -1740,6 +1741,20 @@ static u32 mptcp_send_limit(const struct sock *sk)
return limit - not_sent;
}
+static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!rfs_is_needed())
+ return;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ sock_rps_record_flow(ssk);
+ }
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1753,6 +1768,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
+ mptcp_rps_record_subflows(msk);
+
if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
msg->msg_flags & MSG_FASTOPEN)) {
int copied_syn = 0;
@@ -2131,6 +2148,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out_err;
}
+ mptcp_rps_record_subflows(msk);
+
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
len = min_t(size_t, len, INT_MAX);
@@ -2587,7 +2606,8 @@ static void __mptcp_retrans(struct sock *sk)
if (mptcp_data_fin_enabled(msk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_retransmits++;
+ WRITE_ONCE(icsk->icsk_retransmits,
+ icsk->icsk_retransmits + 1);
mptcp_set_datafin_timeout(sk);
mptcp_send_ack(msk);
@@ -3920,6 +3940,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
mptcp_sock_graft(ssk, newsock);
}
+ mptcp_rps_record_subflows(msk);
+
/* Do late cleanup for the first subflow as necessary. Also
* deal with bad peers not doing a complete shutdown.
*/
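
A toy model of the RFS hook added above: before the blocking send/receive/accept paths run, every TCP subflow is recorded with the flow-steering table so later packets for those 5-tuples steer to this CPU, with a cheap gate (rfs_is_needed() in the patch) skipping the walk when RFS is off:

#include <stdio.h>

struct subflow { int id; struct subflow *next; };

static int rfs_enabled = 1;	/* stand-in for rfs_is_needed() */

static void record_flow(const struct subflow *sf)
{
	printf("record flow for subflow %d\n", sf->id);
}

static void rps_record_subflows(const struct subflow *head)
{
	if (!rfs_enabled)	/* cheap static-key-like gate */
		return;
	for (const struct subflow *sf = head; sf; sf = sf->next)
		record_flow(sf);
}

int main(void)
{
	struct subflow b = { 2, NULL }, a = { 1, &b };

	rps_record_subflows(&a);
	return 0;
}
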
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index b15d7fab5c4b..a1787a1344ac 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -788,9 +788,7 @@ static inline bool mptcp_epollin_ready(const struct sock *sk)
* as it can always coalesce them
*/
return (data_avail >= sk->sk_rcvlowat) ||
- (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- mem_cgroup_under_socket_pressure(sk->sk_memcg)) ||
- READ_ONCE(tcp_memory_pressure);
+ tcp_under_memory_pressure(sk);
}
int mptcp_set_rcvlowat(struct sock *sk, int val);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 3f1b62a9fe88..c8a7e4b59db1 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1717,19 +1717,14 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
/* only the additional subflows created by kworkers have to be modified */
if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
cgroup_id(sock_cgroup_ptr(child_skcd))) {
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = parent->sk_memcg;
-
- mem_cgroup_sk_free(child);
- if (memcg && css_tryget(&memcg->css))
- child->sk_memcg = memcg;
-#endif /* CONFIG_MEMCG */
-
cgroup_sk_free(child_skcd);
*child_skcd = *parent_skcd;
cgroup_sk_clone(child_skcd);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
+
+ if (mem_cgroup_sockets_enabled)
+ mem_cgroup_sk_inherit(parent, child);
}
static void mptcp_subflow_ops_override(struct sock *ssk)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 50fd6809380f..3a04665adf99 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -60,7 +60,7 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("List and change connection tracking table");
struct ctnetlink_list_dump_ctx {
- struct nf_conn *last;
+ unsigned long last_id;
unsigned int cpu;
bool done;
};
@@ -1733,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
-static int ctnetlink_done_list(struct netlink_callback *cb)
-{
- struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
-
- if (ctx->last)
- nf_ct_put(ctx->last);
-
- return 0;
-}
-
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int ctnetlink_dump_one_entry(struct sk_buff *skb,
struct netlink_callback *cb,
@@ -1757,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
if (l3proto && nf_ct_l3num(ct) != l3proto)
return 0;
- if (ctx->last) {
- if (ct != ctx->last)
+ if (ctx->last_id) {
+ if (ctnetlink_get_id(ct) != ctx->last_id)
return 0;
- ctx->last = NULL;
+ ctx->last_id = 0;
}
/* We can't dump extension info for the unconfirmed
@@ -1775,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
ct, dying, 0);
- if (res < 0) {
- if (!refcount_inc_not_zero(&ct->ct_general.use))
- return 0;
-
- ctx->last = ct;
- }
+ if (res < 0)
+ ctx->last_id = ctnetlink_get_id(ct);
return res;
}
@@ -1796,10 +1782,10 @@ static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
- struct nf_conn *last = ctx->last;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
const struct net *net = sock_net(skb->sk);
struct nf_conntrack_net_ecache *ecache_net;
+ unsigned long last_id = ctx->last_id;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
#endif
@@ -1807,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
if (ctx->done)
return 0;
- ctx->last = NULL;
+ ctx->last_id = 0;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache_net = nf_conn_pernet_ecache(net);
@@ -1818,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
int res;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (last && last != ct)
+ if (last_id && last_id != ctnetlink_get_id(ct))
continue;
res = ctnetlink_dump_one_entry(skb, cb, ct, true);
if (res < 0) {
spin_unlock_bh(&ecache_net->dying_lock);
- nf_ct_put(last);
return skb->len;
}
- nf_ct_put(last);
- last = NULL;
+ last_id = 0;
}
spin_unlock_bh(&ecache_net->dying_lock);
#endif
ctx->done = true;
- nf_ct_put(last);
return skb->len;
}
@@ -1847,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb,
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_dying,
- .done = ctnetlink_done_list,
};
return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
@@ -1862,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_unconfirmed,
- .done = ctnetlink_done_list,
};
return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
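
A toy model of the dump-resume change above: instead of pinning the last struct nf_conn with a refcount across netlink dump chunks, the dump context records an opaque id and the next chunk skips entries until that id matches again, so the entry that failed to fit is retried first:

#include <stdio.h>

struct dump_ctx { unsigned long last_id; };

/* emit at most 'budget' ids per chunk, remembering where we stopped */
static int dump_chunk(struct dump_ctx *ctx, const unsigned long *ids,
		      int n, int budget)
{
	int emitted = 0;

	for (int i = 0; i < n; i++) {
		if (ctx->last_id) {
			if (ids[i] != ctx->last_id)
				continue;	/* skip until resume point */
			ctx->last_id = 0;
		}
		if (emitted == budget) {
			ctx->last_id = ids[i];	/* resume here next time */
			return emitted;
		}
		printf("ct id %lu\n", ids[i]);
		emitted++;
	}
	return emitted;
}

int main(void)
{
	unsigned long ids[] = { 11, 22, 33, 44, 55 };
	struct dump_ctx ctx = { 0 };

	while (dump_chunk(&ctx, ids, 5, 2) == 2 && ctx.last_id)
		;
	return 0;
}
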
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c3c73411c40c..eed434e0a970 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -151,12 +151,12 @@ static void nft_ctx_init(struct nft_ctx *ctx,
bitmap_zero(ctx->reg_inited, NFT_REG32_NUM);
}
-static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
- int msg_type, u32 size, gfp_t gfp)
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+ int msg_type, u32 size)
{
struct nft_trans *trans;
- trans = kzalloc(size, gfp);
+ trans = kzalloc(size, GFP_KERNEL);
if (trans == NULL)
return NULL;
@@ -172,12 +172,6 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
return trans;
}
-static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
- int msg_type, u32 size)
-{
- return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
-}
-
static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans)
{
switch (trans->msg_type) {
@@ -442,8 +436,7 @@ static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a,
static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
struct nft_trans_elem *tail,
- struct nft_trans_elem *trans,
- gfp_t gfp)
+ struct nft_trans_elem *trans)
{
unsigned int nelems, old_nelems = tail->nelems;
struct nft_trans_elem *new_trans;
@@ -466,9 +459,11 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
/* krealloc might free tail which invalidates list pointers */
list_del_init(&tail->nft_trans.list);
- new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp);
+ new_trans = krealloc(tail, struct_size(tail, elems, nelems),
+ GFP_KERNEL);
if (!new_trans) {
- list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
+ list_add_tail(&tail->nft_trans.list,
+ &nft_net->commit_list);
return false;
}
@@ -484,7 +479,7 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
}
static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
- struct nft_trans *trans, gfp_t gfp)
+ struct nft_trans *trans)
{
struct nft_trans *tail;
@@ -501,7 +496,7 @@ static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
case NFT_MSG_DELSETELEM:
return nft_trans_collapse_set_elem(nft_net,
nft_trans_container_elem(tail),
- nft_trans_container_elem(trans), gfp);
+ nft_trans_container_elem(trans));
}
return false;
@@ -537,17 +532,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr
}
}
-static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans,
- gfp_t gfp)
+static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans)
{
struct nftables_pernet *nft_net = nft_pernet(net);
WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
trans->msg_type != NFT_MSG_DELSETELEM);
- might_alloc(gfp);
-
- if (nft_trans_try_collapse(nft_net, trans, gfp)) {
+ if (nft_trans_try_collapse(nft_net, trans)) {
kfree(trans);
return;
}
@@ -7573,7 +7565,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
}
ue->priv = elem_priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
goto err_elem_free;
}
}
@@ -7597,7 +7589,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
}
nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
err_set_full:
@@ -7863,7 +7855,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
nft_setelem_data_deactivate(ctx->net, set, elem.priv);
nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
fail_ops:
@@ -7888,9 +7880,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
if (!nft_set_elem_active(ext, iter->genmask))
return 0;
- trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
- struct_size_t(struct nft_trans_elem, elems, 1),
- GFP_ATOMIC);
+ trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM,
+ struct_size_t(struct nft_trans_elem, elems, 1));
if (!trans)
return -ENOMEM;
@@ -7901,7 +7892,7 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
nft_trans_elem_set(trans) = set;
nft_trans_container_elem(trans)->nelems = 1;
nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
}
@@ -7918,7 +7909,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
nft_setelem_data_deactivate(ctx->net, set, elem_priv);
nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
}
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 225ff293cd50..14dd1c0698c3 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -9,7 +9,7 @@
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
#include <net/ip.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -236,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
- fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb)));
+ fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
fl.u.ip4.flowi4_mark = pkt->skb->mark;
fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
break;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 7dfc5343dae4..b0214418f75a 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -40,7 +40,7 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
/* add vlan header into the user buffer in case the tag was removed by offloads */
static bool
-nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
+nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len)
{
int mac_off = skb_mac_header(skb) - skb->data;
u8 *vlanh, *dst_u8 = (u8 *) d;
@@ -212,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 },
[NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
[NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
@@ -684,7 +684,7 @@ static const struct nft_expr_ops nft_payload_inner_ops = {
static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
{
- *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+ csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum);
if (*sum == 0)
*sum = CSUM_MANGLED_0;
}
@@ -797,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
struct nft_payload_set {
enum nft_payload_bases base:8;
- u8 offset;
+ u16 offset;
u8 len;
u8 sreg;
u8 csum_type;
@@ -812,7 +812,7 @@ struct nft_payload_vlan_hdr {
};
static bool
-nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len,
+nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len,
int *vlan_hlen)
{
struct nft_payload_vlan_hdr *vlanh;
@@ -940,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
struct nft_payload_set *priv = nft_expr_priv(expr);
- u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
int err;
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
- priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
+ if (err < 0)
+ return err;
+ priv->offset = offset;
+
if (tb[NFTA_PAYLOAD_CSUM_TYPE])
csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) {
@@ -1069,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_DREG] == NULL)
return ERR_PTR(-EINVAL);
- err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset);
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
if (err < 0)
return ERR_PTR(err);
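Both the set and get paths now funnel the offset through nft_parse_u32_check() with a U16_MAX bound before narrowing it into the widened u16 field. A minimal sketch of that check-then-narrow pattern; the helper name is hypothetical, not a kernel API:

/* Sketch: reject out-of-range values before the narrowing store,
 * mirroring the nft_parse_u32_check(..., U16_MAX, ...) calls above. */
static int parse_offset_u16(const struct nlattr *attr, u16 *dest)
{
	u32 val = ntohl(nla_get_be32(attr));

	if (val > U16_MAX)
		return -ERANGE;
	*dest = val;	/* safe: checked against the destination width */
	return 0;
}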
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 266d0c637225..ba01ce75d6de 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -30,6 +30,7 @@ struct nft_rhash {
struct nft_rhash_elem {
struct nft_elem_priv priv;
struct rhash_head node;
+ struct llist_node walk_node;
u32 wq_gc_seq;
struct nft_set_ext ext;
};
@@ -144,6 +145,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key,
goto err1;
he = nft_elem_priv_cast(elem_priv);
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
@@ -180,6 +182,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
};
struct nft_rhash_elem *prev;
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
@@ -261,12 +264,12 @@ static bool nft_rhash_delete(const struct nft_set *set,
return true;
}
-static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he;
struct rhashtable_iter hti;
+ struct nft_rhash_elem *he;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
@@ -295,6 +298,97 @@ cont:
rhashtable_walk_exit(&hti);
}
+static void nft_rhash_walk_update(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_elem *he, *tmp;
+ struct llist_node *first_node;
+ struct rhashtable_iter hti;
+ LLIST_HEAD(walk_list);
+
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
+ if (set->in_update_walk) {
+ /* This can happen with bogus rulesets during ruleset validation
+ * when a verdict map causes a jump back to the same map.
+ *
+ * Without this extra check the walk_next loop below will see
+ * elems on the callers walk_list and skip (not validate) them.
+ */
+ iter->err = -EMLINK;
+ return;
+ }
+
+ /* The walk happens under RCU.
+ *
+ * We create a snapshot list so ->iter callback can sleep.
+ * commit_mutex is held, elements can ...
+ * .. be added in parallel from dataplane (dynset)
+ * .. be marked as dead in parallel from dataplane (dynset).
+ * .. be queued for removal in parallel (gc timeout).
+ * .. not be freed: transaction mutex is held.
+ */
+ rhashtable_walk_enter(&priv->ht, &hti);
+ rhashtable_walk_start(&hti);
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ if (PTR_ERR(he) != -EAGAIN) {
+ iter->err = PTR_ERR(he);
+ break;
+ }
+
+ continue;
+ }
+
+ /* rhashtable resized during walk, skip */
+ if (llist_on_list(&he->walk_node))
+ continue;
+
+ llist_add(&he->walk_node, &walk_list);
+ }
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ first_node = __llist_del_all(&walk_list);
+ set->in_update_walk = true;
+ llist_for_each_entry_safe(he, tmp, first_node, walk_node) {
+ if (iter->err == 0) {
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
+ if (iter->err == 0)
+ iter->count++;
+ }
+
+ /* all entries must be cleared again, else next ->walk iteration
+ * will skip entries.
+ */
+ init_llist_node(&he->walk_node);
+ }
+ set->in_update_walk = false;
+}
+
+static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ /* iter->skip is only relevant for netlink dumps, which use the READ type */
+ WARN_ON_ONCE(iter->skip != 0);
+
+ nft_rhash_walk_update(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ nft_rhash_walk_ro(ctx, set, iter);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
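The comments above describe the snapshot walk: entries are collected on an llist under the RCU walk, then visited outside it so the ->iter callback may sleep. A condensed sketch of the pattern, assuming the <linux/llist.h> API and illustrative element/callback names:

/* Sketch: snapshot under RCU, iterate outside it. llist_on_list()
 * filters duplicates seen when the rhashtable resizes mid-walk, and
 * init_llist_node() re-arms each entry for the next walk. */
LLIST_HEAD(snapshot);
struct elem *e, *tmp;

rhashtable_walk_start(&hti);
while ((e = rhashtable_walk_next(&hti)) != NULL) {
	if (IS_ERR(e))
		continue;	/* -EAGAIN on resize: keep walking */
	if (!llist_on_list(&e->walk_node))
		llist_add(&e->walk_node, &snapshot);
}
rhashtable_walk_stop(&hti);

llist_for_each_entry_safe(e, tmp, __llist_del_all(&snapshot), walk_node) {
	visit(e);			/* may sleep: not under RCU here */
	init_llist_node(&e->walk_node);
}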
+
static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
struct nft_set_ext *ext)
{
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 793790d79d13..a7b8fa8cab7c 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -397,7 +397,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
}
/**
- * pipapo_get() - Get matching element reference given key data
+ * pipapo_get_slow() - Get matching element reference given key data
* @m: storage containing the set elements
* @data: Key data to be matched against existing elements
* @genmask: If set, check that element is active in given genmask
@@ -414,12 +414,12 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
*
* Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
- const u8 *data, u8 genmask,
- u64 tstamp)
+static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
+ unsigned long *res_map, *fill_map, *map;
struct nft_pipapo_scratch *scratch;
- unsigned long *res_map, *fill_map;
const struct nft_pipapo_field *f;
bool map_index;
int i;
@@ -429,11 +429,13 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
scratch = *raw_cpu_ptr(m->scratch);
if (unlikely(!scratch))
goto out;
+ __local_lock_nested_bh(&scratch->bh_lock);
map_index = scratch->map_index;
- res_map = scratch->map + (map_index ? m->bsize_max : 0);
- fill_map = scratch->map + (map_index ? 0 : m->bsize_max);
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res_map = map + (map_index ? m->bsize_max : 0);
+ fill_map = map + (map_index ? 0 : m->bsize_max);
pipapo_resmap_init(m, res_map);
@@ -464,6 +466,7 @@ next_match:
last);
if (b < 0) {
scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
local_bh_enable();
return NULL;
@@ -483,6 +486,7 @@ next_match:
* *next* bitmap (not initial) for the next packet.
*/
scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
local_bh_enable();
return e;
}
@@ -497,12 +501,47 @@ next_match:
data += NFT_PIPAPO_GROUPS_PADDING(f);
}
+ __local_unlock_nested_bh(&scratch->bh_lock);
out:
local_bh_enable();
return NULL;
}
/**
+ * pipapo_get() - Get matching element reference given key data
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
+ *
+ * This is a dispatcher function, calling the AVX2 implementation when
+ * available and the generic C one otherwise.
+ * This helper is only called from the control plane, with either RCU
+ * read lock or transaction mutex held.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
+ */
+static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
+{
+ struct nft_pipapo_elem *e;
+
+ local_bh_disable();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) {
+ e = pipapo_get_avx2(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
+ }
+#endif
+ e = pipapo_get_slow(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
+}
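The dispatch rests on the standard x86 kernel-FPU rules: vector code may only run when irq_fpu_usable() is true and must sit between kernel_fpu_begin()/kernel_fpu_end(). A bare sketch of that guard, as assumed by pipapo_get() above:

/* Sketch of the kernel-FPU guard around SIMD code (x86). BH is
 * already disabled by the caller, matching pipapo_get(). */
if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) {
	kernel_fpu_begin_mask(0);	/* no MXCSR state needed */
	/* ... AVX2 lookup ... */
	kernel_fpu_end();
} else {
	/* ... scalar fallback ... */
}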
+
+/**
* nft_pipapo_lookup() - Dataplane frontend for main lookup function
* @net: Network namespace
* @set: nftables API set representation
@@ -539,7 +578,7 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
const struct nft_pipapo_elem *e;
m = rcu_dereference(priv->match);
- e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64());
+ e = pipapo_get_slow(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64());
return e ? &e->ext : NULL;
}
@@ -1152,22 +1191,17 @@ static void pipapo_map(struct nft_pipapo_match *m,
}
/**
- * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address
+ * pipapo_free_scratch() - Free per-CPU scratch map
* @m: Matching data
* @cpu: CPU number
*/
static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu)
{
struct nft_pipapo_scratch *s;
- void *mem;
s = *per_cpu_ptr(m->scratch, cpu);
- if (!s)
- return;
- mem = s;
- mem -= s->align_off;
- kvfree(mem);
+ kvfree(s);
}
/**
@@ -1184,11 +1218,8 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
for_each_possible_cpu(i) {
struct nft_pipapo_scratch *scratch;
-#ifdef NFT_PIPAPO_ALIGN
- void *scratch_aligned;
- u32 align_off;
-#endif
- scratch = kvzalloc_node(struct_size(scratch, map, bsize_max * 2) +
+
+ scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) +
NFT_PIPAPO_ALIGN_HEADROOM,
GFP_KERNEL_ACCOUNT, cpu_to_node(i));
if (!scratch) {
@@ -1203,23 +1234,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
}
pipapo_free_scratch(clone, i);
-
-#ifdef NFT_PIPAPO_ALIGN
- /* Align &scratch->map (not the struct itself): the extra
- * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node()
- * above guarantee we can waste up to those bytes in order
- * to align the map field regardless of its offset within
- * the struct.
- */
- BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM);
-
- scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map);
- scratch_aligned -= offsetof(struct nft_pipapo_scratch, map);
- align_off = scratch_aligned - (void *)scratch;
-
- scratch = scratch_aligned;
- scratch->align_off = align_off;
-#endif
+ local_lock_init(&scratch->bh_lock);
*per_cpu_ptr(clone->scratch, i) = scratch;
}
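With the per-object realignment gone, the allocation still reserves NFT_PIPAPO_ALIGN_HEADROOM, and the map pointer is now aligned at the point of use instead (see NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]) in the lookup paths). A sketch of that align-at-use idiom; SIMD_HEADROOM and SIMD_ALIGN stand in for the NFT_PIPAPO_* macros:

/* Sketch: allocate with headroom, align the array pointer when used.
 * The headroom guarantees PTR_ALIGN() cannot run past the allocation. */
struct scratch *s = kvzalloc(struct_size(s, __map, n) + SIMD_HEADROOM,
			     GFP_KERNEL);
unsigned long *map;

if (s)
	map = PTR_ALIGN(&s->__map[0], SIMD_ALIGN);	/* aligned view */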
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 4a2ff85ce1c4..eaab422aa56a 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -124,14 +124,14 @@ struct nft_pipapo_field {
/**
* struct nft_pipapo_scratch - percpu data used for lookup and matching
+ * @bh_lock: PREEMPT_RT local spinlock
* @map_index: Current working bitmap index, toggled between field matches
- * @align_off: Offset to get the originally allocated address
- * @map: store partial matching results during lookup
+ * @__map: store partial matching results during lookup
*/
struct nft_pipapo_scratch {
+ local_lock_t bh_lock;
u8 map_index;
- u32 align_off;
- unsigned long map[];
+ unsigned long __map[];
};
/**
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index c0884fa68c79..27dab3667548 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -1099,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
desc->field_count < NFT_PIPAPO_MIN_FIELDS)
return false;
- if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
+ if (!boot_cpu_has(X86_FEATURE_AVX2))
return false;
est->size = pipapo_estimate_size(desc);
@@ -1133,65 +1133,50 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns
}
/**
- * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
- * @net: Network namespace
- * @set: nftables API set representation
- * @key: nftables API element representation containing key data
+ * pipapo_get_avx2() - Lookup function for AVX2 implementation
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
*
* For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
*
* This implementation exploits the repetitive characteristic of the algorithm
* to provide a fast, vectorised version using the AVX2 SIMD instruction set.
*
- * Return: true on match, false otherwise.
+ * The caller must check that the FPU is usable.
+ * This function must be called with BH disabled.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-const struct nft_set_ext *
-nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key)
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- const struct nft_set_ext *ext = NULL;
struct nft_pipapo_scratch *scratch;
- const struct nft_pipapo_match *m;
const struct nft_pipapo_field *f;
- const u8 *rp = (const u8 *)key;
- unsigned long *res, *fill;
+ unsigned long *res, *fill, *map;
bool map_index;
int i;
- local_bh_disable();
-
- if (unlikely(!irq_fpu_usable())) {
- ext = nft_pipapo_lookup(net, set, key);
+ scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
+ return NULL;
- local_bh_enable();
- return ext;
- }
+ __local_lock_nested_bh(&scratch->bh_lock);
+ map_index = scratch->map_index;
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res = map + (map_index ? m->bsize_max : 0);
+ fill = map + (map_index ? 0 : m->bsize_max);
- m = rcu_dereference(priv->match);
+ pipapo_resmap_init_avx2(m, res);
- /* This also protects access to all data related to scratch maps.
- *
- * Note that we don't need a valid MXCSR state for any of the
+ /* Note that we don't need a valid MXCSR state for any of the
* operations we use here, so pass 0 as mask and spare a LDMXCSR
* instruction.
*/
kernel_fpu_begin_mask(0);
- scratch = *raw_cpu_ptr(m->scratch);
- if (unlikely(!scratch)) {
- kernel_fpu_end();
- local_bh_enable();
- return NULL;
- }
-
- map_index = scratch->map_index;
-
- res = scratch->map + (map_index ? m->bsize_max : 0);
- fill = scratch->map + (map_index ? 0 : m->bsize_max);
-
- pipapo_resmap_init_avx2(m, res);
-
nft_pipapo_avx2_prepare();
next_match:
@@ -1201,7 +1186,7 @@ next_match:
#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
- ret, rp, \
+ ret, data, \
first, last))
if (likely(f->bb == 8)) {
@@ -1217,7 +1202,7 @@ next_match:
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
} else {
ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
- ret, rp,
+ ret, data,
first, last);
}
} else {
@@ -1233,7 +1218,7 @@ next_match:
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
} else {
ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
- ret, rp,
+ ret, data,
first, last);
}
}
@@ -1241,28 +1226,74 @@ next_match:
#undef NFT_SET_PIPAPO_AVX2_LOOKUP
- if (ret < 0)
- goto out;
+ if (ret < 0) {
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+ }
if (last) {
- const struct nft_set_ext *e = &f->mt[ret].e->ext;
+ struct nft_pipapo_elem *e;
- if (unlikely(nft_set_elem_expired(e)))
+ e = f->mt[ret].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask)))
goto next_match;
- ext = e;
- goto out;
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return e;
}
+ map_index = !map_index;
swap(res, fill);
- rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
-out:
- if (i % 2)
- scratch->map_index = !map_index;
kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: nftables API element representation containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy using
+ * the AVX2 routines if the FPU is usable, falling back to the generic
+ * implementation of the algorithm otherwise.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const u8 *rp = (const u8 *)key;
+ const struct nft_pipapo_elem *e;
+
+ local_bh_disable();
+
+ if (unlikely(!irq_fpu_usable())) {
+ const struct nft_set_ext *ext;
+
+ ext = nft_pipapo_lookup(net, set, key);
+
+ local_bh_enable();
+ return ext;
+ }
+
+ m = rcu_dereference(priv->match);
+
+ e = pipapo_get_avx2(m, rp, NFT_GENMASK_ANY, get_jiffies_64());
local_bh_enable();
- return ext;
+ return e ? &e->ext : NULL;
}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
index dbb6aaca8a7a..c2999b63da3f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.h
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -5,8 +5,12 @@
#include <asm/fpu/xstate.h>
#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
+struct nft_pipapo_match;
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est);
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp);
#endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index b1f04168ec93..ca594161b840 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -584,15 +584,14 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
return NULL;
}
-static void nft_rbtree_walk(const struct nft_ctx *ctx,
- struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rbtree_do_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
struct rb_node *node;
- read_lock_bh(&priv->lock);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -600,14 +599,34 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
goto cont;
iter->err = iter->fn(ctx, set, iter, &rbe->priv);
- if (iter->err < 0) {
- read_unlock_bh(&priv->lock);
+ if (iter->err < 0)
return;
- }
cont:
iter->count++;
}
- read_unlock_bh(&priv->lock);
+}
+
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+ nft_rbtree_do_walk(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ read_lock_bh(&priv->lock);
+ nft_rbtree_do_walk(ctx, set, iter);
+ read_unlock_bh(&priv->lock);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e2f7080dd5d7..2b46c0cd752a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -356,7 +356,7 @@ static void netlink_overrun(struct sock *sk)
sk_error_report(sk);
}
}
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
}
static void netlink_rcv_wake(struct sock *sk)
@@ -2711,7 +2711,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v)
sk_wmem_alloc_get(s),
READ_ONCE(nlk->cb_running),
refcount_read(&s->sk_refcnt),
- atomic_read(&s->sk_drops),
+ sk_drops_read(s),
sock_i_ino(s)
);
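These conversions replace open-coded atomics on sk->sk_drops with accessor helpers, so the counter's representation can later change without touching every call site. A sketch of a plausible shape for the accessors, under the assumption that they currently wrap the same atomic the old code used; the real helpers may differ (e.g. per-CPU counters):

/* Sketch only: assumed shape of the sk_drops accessors. */
static inline void sk_drops_inc(struct sock *sk)
{
	atomic_inc(&sk->sk_drops);
}

static inline int sk_drops_read(const struct sock *sk)
{
	return atomic_read(&sk->sk_drops);
}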
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index b80bd3a90773..66366982f604 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -129,15 +129,13 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
struct ovs_flow_stats *ovs_stats,
unsigned long *used, __be16 *tcp_flags)
{
- int cpu;
+ unsigned int cpu;
*used = 0;
*tcp_flags = 0;
memset(ovs_stats, 0, sizeof(*ovs_stats));
- /* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids;
- cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
+ for_each_cpu(cpu, flow->cpu_used_mask) {
struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
if (stats) {
@@ -158,11 +156,9 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
/* Called with ovs_mutex. */
void ovs_flow_stats_clear(struct sw_flow *flow)
{
- int cpu;
+ unsigned int cpu;
- /* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids;
- cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
+ for_each_cpu(cpu, flow->cpu_used_mask) {
struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
if (stats) {
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d108ae0bd0ee..ffc72a741a50 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -107,16 +107,15 @@ int ovs_flow_tbl_count(const struct flow_table *table)
static void flow_free(struct sw_flow *flow)
{
- int cpu;
+ unsigned int cpu;
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
flow->sf_acts);
- /* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids;
- cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
+
+ for_each_cpu(cpu, flow->cpu_used_mask) {
if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
(struct sw_flow_stats __force *)flow->stats[cpu]);
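for_each_cpu() iterates only over bits set in the mask, starting at the first set bit, which is why the open-coded "always consider cpu 0" loop could go: the flow's cpu_used_mask has CPU 0's bit set from allocation (an assumption taken from the surrounding code), so both shapes visit the same CPUs. Sketch of the equivalence, with visit() as an illustrative placeholder:

/* Old shape: forced CPU 0 into the walk even if its bit was clear. */
for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, mask))
	visit(cpu);

/* New shape: relies on CPU 0's bit being set in the mask. */
for_each_cpu(cpu, mask)
	visit(cpu);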
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a7017d7f0927..9d42c4bd6e39 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2265,7 +2265,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
atomic_inc(&po->tp_drops);
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
drop_n_restore:
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index a27efa4faa4e..238a9638d2b0 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -22,7 +22,7 @@
#include <net/phonet/pn_dev.h>
/* Transport protocol registration */
-static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+static const struct phonet_protocol __rcu *proto_tab[PHONET_NPROTO] __read_mostly;
static const struct phonet_protocol *phonet_proto_get(unsigned int protocol)
{
@@ -482,7 +482,7 @@ void phonet_proto_unregister(unsigned int protocol,
const struct phonet_protocol *pp)
{
mutex_lock(&proto_tab_lock);
- BUG_ON(proto_tab[protocol] != pp);
+ BUG_ON(rcu_access_pointer(proto_tab[protocol]) != pp);
RCU_INIT_POINTER(proto_tab[protocol], NULL);
mutex_unlock(&proto_tab_lock);
synchronize_rcu();
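The __rcu annotations let sparse enforce the accessor contract these hunks switch to. A minimal sketch of that contract, with illustrative names:

/* Sketch of the RCU pointer rules applied above:
 *  - rcu_access_pointer(): fetch for comparison only, no dereference,
 *    legal without rcu_read_lock().
 *  - rcu_dereference_protected(): dereference while the update-side
 *    lock (proto_tab_lock / resource_mutex here) is held.
 *  - RCU_INIT_POINTER(): publish NULL or a fully initialised object. */
static struct foo __rcu *slot;
static DEFINE_MUTEX(slot_lock);

static void slot_clear(struct foo *old)
{
	mutex_lock(&slot_lock);
	if (rcu_access_pointer(slot) == old)	/* compare, don't deref */
		RCU_INIT_POINTER(slot, NULL);
	mutex_unlock(&slot_lock);
	synchronize_rcu();			/* wait out readers */
}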
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 62527e1ebb88..4db564d9d522 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -376,7 +376,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
case PNS_PEP_CTRL_REQ:
if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
break;
}
__skb_pull(skb, 4);
@@ -397,7 +397,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
}
if (pn->rx_credits == 0) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
err = -ENOBUFS;
break;
}
@@ -567,7 +567,7 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
}
if (pn->rx_credits == 0) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
err = NET_RX_DROP;
break;
}
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index ea4d5e6533db..db2d552e9b32 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -587,7 +587,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v)
from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
- atomic_read(&sk->sk_drops));
+ sk_drops_read(sk));
}
seq_pad(seq, '\n');
return 0;
@@ -602,7 +602,7 @@ const struct seq_operations pn_sock_seq_ops = {
#endif
static struct {
- struct sock *sk[256];
+ struct sock __rcu *sk[256];
} pnres;
/*
@@ -654,7 +654,7 @@ int pn_sock_unbind_res(struct sock *sk, u8 res)
return -EPERM;
mutex_lock(&resource_mutex);
- if (pnres.sk[res] == sk) {
+ if (rcu_access_pointer(pnres.sk[res]) == sk) {
RCU_INIT_POINTER(pnres.sk[res], NULL);
ret = 0;
}
@@ -673,7 +673,7 @@ void pn_sock_unbind_all_res(struct sock *sk)
mutex_lock(&resource_mutex);
for (res = 0; res < 256; res++) {
- if (pnres.sk[res] == sk) {
+ if (rcu_access_pointer(pnres.sk[res]) == sk) {
RCU_INIT_POINTER(pnres.sk[res], NULL);
match++;
}
@@ -688,7 +688,7 @@ void pn_sock_unbind_all_res(struct sock *sk)
}
#ifdef CONFIG_PROC_FS
-static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
+static struct sock __rcu **pn_res_get_idx(struct seq_file *seq, loff_t pos)
{
struct net *net = seq_file_net(seq);
unsigned int i;
@@ -697,7 +697,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
return NULL;
for (i = 0; i < 256; i++) {
- if (pnres.sk[i] == NULL)
+ if (rcu_access_pointer(pnres.sk[i]) == NULL)
continue;
if (!pos)
return pnres.sk + i;
@@ -706,7 +706,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
return NULL;
}
-static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk)
+static struct sock __rcu **pn_res_get_next(struct seq_file *seq, struct sock __rcu **sk)
{
struct net *net = seq_file_net(seq);
unsigned int i;
@@ -728,7 +728,7 @@ static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos)
static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct sock **sk;
+ struct sock __rcu **sk;
if (v == SEQ_START_TOKEN)
sk = pn_res_get_idx(seq, 0);
@@ -747,11 +747,12 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v)
static int pn_res_seq_show(struct seq_file *seq, void *v)
{
seq_setwidth(seq, 63);
- if (v == SEQ_START_TOKEN)
+ if (v == SEQ_START_TOKEN) {
seq_puts(seq, "rs uid inode");
- else {
- struct sock **psk = v;
- struct sock *sk = *psk;
+ } else {
+ struct sock __rcu **psk = v;
+ struct sock *sk = rcu_dereference_protected(*psk,
+ lockdep_is_held(&resource_mutex));
seq_printf(seq, "%02X %5u %lu",
(int) (psk - pnres.sk),
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 086a13170e09..4a7217fbeab6 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -242,7 +242,7 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
mask |= (EPOLLOUT | EPOLLWRNORM);
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= EPOLLERR;
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
/* clear state any time we wake a seen-congested socket */
diff --git a/net/rds/connection.c b/net/rds/connection.c
index d62f486ab29f..68bc88cce84e 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -57,16 +57,17 @@ static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
static u32 rds6_hash_secret __read_mostly;
static u32 rds_hash_secret __read_mostly;
- u32 lhash, fhash, hash;
+ __be32 lhash, fhash;
+ u32 hash;
net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
- lhash = (__force u32)laddr->s6_addr32[3];
+ lhash = laddr->s6_addr32[3];
#if IS_ENABLED(CONFIG_IPV6)
- fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
+ fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret);
#else
- fhash = (__force u32)faddr->s6_addr32[3];
+ fhash = faddr->s6_addr32[3];
#endif
hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index ea5e9aee4959..5884de8c6f45 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -108,7 +108,6 @@ struct rds_ib_mr_pool {
};
extern struct workqueue_struct *rds_ib_mr_wq;
-extern bool prefer_frmr;
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
int npages);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index e53b7f266bd7..4248dfa816eb 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1034,7 +1034,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
rds_ib_stats_inc(s_ib_rx_ring_empty);
if (rds_ib_ring_low(&ic->i_recv_ring)) {
- rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN);
+ rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
rds_ib_stats_inc(s_ib_rx_refill_from_cq);
}
}
diff --git a/net/rds/message.c b/net/rds/message.c
index 7af59d2443e5..199a899a43e9 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -44,8 +44,8 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
-[RDS_EXTHDR_NPATHS] = sizeof(u16),
-[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
+[RDS_EXTHDR_NPATHS] = sizeof(__be16),
+[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
};
void rds_message_addref(struct rds_message *rm)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index dc360252c515..5b1c072e2e7f 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -93,7 +93,7 @@ enum {
/* Max number of multipaths per RDS connection. Must be a power of 2 */
#define RDS_MPATH_WORKERS 8
-#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \
+#define RDS_MPATH_HASH(rs, n) (jhash_1word(ntohs((rs)->rs_bound_port), \
(rs)->rs_hash_initval) & ((n) - 1))
#define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr))
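Previously the __be16 bound port was fed to jhash_1word() raw, so the same port hashed to different values depending on host endianness; converting with ntohs() first keeps the multipath worker choice consistent and satisfies sparse. Sketch of the corrected computation:

/* Sketch: hash the host-order port so worker selection does not
 * depend on host endianness. nworkers must be a power of 2. */
static u32 mpath_worker(__be16 bound_port, u32 initval, u32 nworkers)
{
	return jhash_1word(ntohs(bound_port), initval) & (nworkers - 1);
}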
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 5627f80013f8..66205d6924bf 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -202,8 +202,8 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
unsigned int pos = 0, type, len;
union {
struct rds_ext_header_version version;
- u16 rds_npaths;
- u32 rds_gen_num;
+ __be16 rds_npaths;
+ __be32 rds_gen_num;
} buffer;
u32 new_peer_gen_num = 0;
diff --git a/net/rds/send.c b/net/rds/send.c
index 42d991bc8543..0b3d0ef2f008 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1454,8 +1454,8 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) &&
cp->cp_conn->c_trans->t_mp_capable) {
- u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
- u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
+ __be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
+ __be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9e468e463467..ff6be5cfe2b0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1585,7 +1585,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets,
}
_bstats_update(&a->tcfa_bstats, bytes, packets);
- a->tcfa_qstats.drops += drops;
+ atomic_add(drops, &a->tcfa_drops);
if (hw)
_bstats_update(&a->tcfa_bstats_hw, bytes, packets);
}
@@ -1594,8 +1594,9 @@ EXPORT_SYMBOL(tcf_action_update_stats);
int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
int compat_mode)
{
- int err = 0;
+ struct gnet_stats_queue qstats = {0};
struct gnet_dump d;
+ int err = 0;
if (p == NULL)
goto errout;
@@ -1619,14 +1620,17 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
if (err < 0)
goto errout;
+ qstats.drops = atomic_read(&p->tcfa_drops);
+ qstats.overlimits = atomic_read(&p->tcfa_overlimits);
+
if (gnet_stats_copy_basic(&d, p->cpu_bstats,
&p->tcfa_bstats, false) < 0 ||
gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw,
&p->tcfa_bstats_hw, false) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
- &p->tcfa_qstats,
- p->tcfa_qstats.qlen) < 0)
+ &qstats,
+ qstats.qlen) < 0)
goto errout;
if (gnet_stats_finish_copy(&d) < 0)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index f3abe0545989..8e69a919b4fe 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -72,7 +72,6 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata,
d = to_defact(a);
spin_lock_bh(&d->tcf_lock);
goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch);
- memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
spin_unlock_bh(&d->tcf_lock);
if (goto_ch)
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index dc0229693461..a9e0c1326e2a 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -27,19 +27,18 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb,
struct tcf_result *res)
{
struct tcf_skbmod *d = to_skbmod(a);
- int action, max_edit_len, err;
struct tcf_skbmod_params *p;
+ int max_edit_len, err;
u64 flags;
tcf_lastuse_update(&d->tcf_tm);
bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb);
- action = READ_ONCE(d->tcf_action);
- if (unlikely(action == TC_ACT_SHOT))
+ p = rcu_dereference_bh(d->skbmod_p);
+ if (unlikely(p->action == TC_ACT_SHOT))
goto drop;
max_edit_len = skb_mac_header_len(skb);
- p = rcu_dereference_bh(d->skbmod_p);
flags = p->flags;
/* tcf_skbmod_init() guarantees "flags" to be one of the following:
@@ -85,7 +84,7 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb,
INET_ECN_set_ce(skb);
out:
- return action;
+ return p->action;
drop:
qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
@@ -193,7 +192,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
}
p->flags = lflags;
-
+ p->action = parm->action;
if (ovr)
spin_lock_bh(&d->tcf_lock);
/* Protected by tcf_lock if overwriting existing action. */
@@ -248,10 +247,9 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
opt.index = d->tcf_index;
opt.refcnt = refcount_read(&d->tcf_refcnt) - ref;
opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind;
- spin_lock_bh(&d->tcf_lock);
- opt.action = d->tcf_action;
- p = rcu_dereference_protected(d->skbmod_p,
- lockdep_is_held(&d->tcf_lock));
+ rcu_read_lock();
+ p = rcu_dereference(d->skbmod_p);
+ opt.action = p->action;
opt.flags = p->flags;
if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -269,10 +267,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
goto nla_put_failure;
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 2cef4b08befb..876b30c5709e 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -29,13 +29,11 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb,
{
struct tcf_tunnel_key *t = to_tunnel_key(a);
struct tcf_tunnel_key_params *params;
- int action;
params = rcu_dereference_bh(t->params);
tcf_lastuse_update(&t->tcf_tm);
tcf_action_update_bstats(&t->common, skb);
- action = READ_ONCE(t->tcf_action);
switch (params->tcft_action) {
case TCA_TUNNEL_KEY_ACT_RELEASE:
@@ -51,7 +49,7 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb,
break;
}
- return action;
+ return params->action;
}
static const struct nla_policy
@@ -532,6 +530,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
params_new->tcft_action = parm->t_action;
params_new->tcft_enc_metadata = metadata;
+ params_new->action = parm->action;
spin_lock_bh(&t->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
params_new = rcu_replace_pointer(t->params, params_new,
@@ -726,10 +725,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t tm;
- spin_lock_bh(&t->tcf_lock);
- params = rcu_dereference_protected(t->params,
- lockdep_is_held(&t->tcf_lock));
- opt.action = t->tcf_action;
+ rcu_read_lock();
+ params = rcu_dereference(t->params);
+ opt.action = params->action;
opt.t_action = params->tcft_action;
if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
@@ -766,12 +764,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
&tm, TCA_TUNNEL_KEY_PAD))
goto nla_put_failure;
- spin_unlock_bh(&t->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&t->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 383bf18b6862..a74621797d69 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -25,7 +25,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb,
{
struct tcf_vlan *v = to_vlan(a);
struct tcf_vlan_params *p;
- int action;
int err;
u16 tci;
@@ -38,8 +37,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb,
if (skb_at_tc_ingress(skb))
skb_push_rcsum(skb, skb->mac_len);
- action = READ_ONCE(v->tcf_action);
-
p = rcu_dereference_bh(v->vlan_p);
switch (p->tcfv_action) {
@@ -97,7 +94,7 @@ out:
skb_pull_rcsum(skb, skb->mac_len);
skb_reset_mac_len(skb);
- return action;
+ return p->action;
drop:
tcf_action_inc_drop_qstats(&v->common);
@@ -255,6 +252,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
ETH_ALEN);
}
+ p->action = parm->action;
spin_lock_bh(&v->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
@@ -297,9 +295,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- spin_lock_bh(&v->tcf_lock);
- opt.action = v->tcf_action;
- p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
+ rcu_read_lock();
+ p = rcu_dereference(v->vlan_p);
+ opt.action = p->action;
opt.v_action = p->tcfv_action;
if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -325,12 +323,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
- spin_unlock_bh(&v->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&v->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
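The act_skbmod, act_tunnel_key and act_vlan changes share one pattern: the action verdict moves into the RCU-managed parameter struct, so the datapath derives everything from a single rcu_dereference_bh() snapshot, and the dump paths swap the per-action spinlock for rcu_read_lock(). A condensed sketch of the pattern, with illustrative struct and function names:

/* Sketch: one RCU-swapped object carries all datapath state, so a
 * single dereference yields a consistent {action, params} pair.
 * Updates publish a replacement with rcu_replace_pointer() and free
 * the old object after a grace period. */
struct act_params {
	int action;
	/* ... other datapath fields ... */
	struct rcu_head rcu;
};

static int act_fast_path(struct tc_act *a, struct sk_buff *skb)
{
	struct act_params *p = rcu_dereference_bh(a->params);

	/* ... transform skb according to p ... */
	return p->action;	/* same snapshot as the fields used above */
}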
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d7c767b861a4..1e058b46d3e1 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -431,7 +431,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
- !memcmp(&rtab->data, nla_data(tab), 1024)) {
+ !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) {
rtab->refcnt++;
return rtab;
}
@@ -441,7 +441,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
if (rtab) {
rtab->rate = *r;
rtab->refcnt = 1;
- memcpy(rtab->data, nla_data(tab), 1024);
+ memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
if (r->linklayer == TC_LINKLAYER_UNAWARE)
r->linklayer = __detect_linklayer(r, rtab->data);
rtab->next = qdisc_rtab_list;
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index 24d5a35ce894..e947646a380c 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -7,9 +7,9 @@ menuconfig IP_SCTP
tristate "The SCTP Protocol"
depends on INET
depends on IPV6 || IPV6=n
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_SHA1
+ select CRYPTO_LIB_SHA1
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_UTILS
select NET_CRC32C
select NET_UDP_TUNNEL
help
@@ -49,46 +49,25 @@ config SCTP_DBG_OBJCNT
'cat /proc/net/sctp/sctp_dbg_objcnt'
If unsure, say N
+
choice
- prompt "Default SCTP cookie HMAC encoding"
- default SCTP_DEFAULT_COOKIE_HMAC_MD5
+ prompt "Default SCTP cookie authentication method"
+ default SCTP_DEFAULT_COOKIE_HMAC_SHA256
help
- This option sets the default sctp cookie hmac algorithm
- when in doubt select 'md5'
+ This option sets the default SCTP cookie authentication method, used
+ when no method has been explicitly selected via the
+ net.sctp.cookie_hmac_alg sysctl.
-config SCTP_DEFAULT_COOKIE_HMAC_MD5
- bool "Enable optional MD5 hmac cookie generation"
- help
- Enable optional MD5 hmac based SCTP cookie generation
- select SCTP_COOKIE_HMAC_MD5
+ If unsure, choose the default (HMAC-SHA256).
-config SCTP_DEFAULT_COOKIE_HMAC_SHA1
- bool "Enable optional SHA1 hmac cookie generation"
- help
- Enable optional SHA1 hmac based SCTP cookie generation
- select SCTP_COOKIE_HMAC_SHA1
+config SCTP_DEFAULT_COOKIE_HMAC_SHA256
+ bool "HMAC-SHA256"
config SCTP_DEFAULT_COOKIE_HMAC_NONE
- bool "Use no hmac alg in SCTP cookie generation"
- help
- Use no hmac algorithm in SCTP cookie generation
+ bool "None"
endchoice
-config SCTP_COOKIE_HMAC_MD5
- bool "Enable optional MD5 hmac cookie generation"
- help
- Enable optional MD5 hmac based SCTP cookie generation
- select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5
- select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5
-
-config SCTP_COOKIE_HMAC_SHA1
- bool "Enable optional SHA1 hmac cookie generation"
- help
- Enable optional SHA1 hmac based SCTP cookie generation
- select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1
- select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1
-
config INET_SCTP_DIAG
depends on INET_DIAG
def_tristate INET_DIAG
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index c58fffc86a0c..82aad477590e 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -12,36 +12,37 @@
* Vlad Yasevich <vladislav.yasevich@hp.com>
*/
-#include <crypto/hash.h>
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
#include <linux/slab.h>
#include <linux/types.h>
-#include <linux/scatterlist.h>
#include <net/sctp/sctp.h>
#include <net/sctp/auth.h>
-static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
+static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
{
/* id 0 is reserved; entry stays all-zero */
.hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0,
},
{
.hmac_id = SCTP_AUTH_HMAC_ID_SHA1,
- .hmac_name = "hmac(sha1)",
- .hmac_len = SCTP_SHA1_SIG_SIZE,
+ .hmac_len = SHA1_DIGEST_SIZE,
},
{
/* id 2 is reserved as well */
.hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2,
},
-#if IS_ENABLED(CONFIG_CRYPTO_SHA256)
{
.hmac_id = SCTP_AUTH_HMAC_ID_SHA256,
- .hmac_name = "hmac(sha256)",
- .hmac_len = SCTP_SHA256_SIG_SIZE,
+ .hmac_len = SHA256_DIGEST_SIZE,
}
-#endif
};
+static bool sctp_hmac_supported(__u16 hmac_id)
+{
+ return hmac_id < ARRAY_SIZE(sctp_hmac_list) &&
+ sctp_hmac_list[hmac_id].hmac_len != 0;
+}
void sctp_auth_key_put(struct sctp_auth_bytes *key)
{
@@ -444,76 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey(
return NULL;
}
-/*
- * Initialize all the possible digest transforms that we can use. Right
- * now, the supported digests are SHA1 and SHA256. We do this here once
- * because of the restrictiong that transforms may only be allocated in
- * user context. This forces us to pre-allocated all possible transforms
- * at the endpoint init time.
- */
-int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
-{
- struct crypto_shash *tfm = NULL;
- __u16 id;
-
- /* If the transforms are already allocated, we are done */
- if (ep->auth_hmacs)
- return 0;
-
- /* Allocated the array of pointers to transorms */
- ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS,
- sizeof(struct crypto_shash *),
- gfp);
- if (!ep->auth_hmacs)
- return -ENOMEM;
-
- for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) {
-
- /* See is we support the id. Supported IDs have name and
- * length fields set, so that we can allocated and use
- * them. We can safely just check for name, for without the
- * name, we can't allocate the TFM.
- */
- if (!sctp_hmac_list[id].hmac_name)
- continue;
-
- /* If this TFM has been allocated, we are all set */
- if (ep->auth_hmacs[id])
- continue;
-
- /* Allocate the ID */
- tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0);
- if (IS_ERR(tfm))
- goto out_err;
-
- ep->auth_hmacs[id] = tfm;
- }
-
- return 0;
-
-out_err:
- /* Clean up any successful allocations */
- sctp_auth_destroy_hmacs(ep->auth_hmacs);
- ep->auth_hmacs = NULL;
- return -ENOMEM;
-}
-
-/* Destroy the hmac tfm array */
-void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[])
-{
- int i;
-
- if (!auth_hmacs)
- return;
-
- for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) {
- crypto_free_shash(auth_hmacs[i]);
- }
- kfree(auth_hmacs);
-}
-
-
-struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
+const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
{
return &sctp_hmac_list[hmac_id];
}
@@ -521,7 +453,8 @@ struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
/* Get an hmac description that we can use to build
* the AUTH chunk
*/
-struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
+const struct sctp_hmac *
+sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
{
struct sctp_hmac_algo_param *hmacs;
__u16 n_elt;
@@ -543,26 +476,10 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
sizeof(struct sctp_paramhdr)) >> 1;
for (i = 0; i < n_elt; i++) {
id = ntohs(hmacs->hmac_ids[i]);
-
- /* Check the id is in the supported range. And
- * see if we support the id. Supported IDs have name and
- * length fields set, so that we can allocate and use
- * them. We can safely just check for name, for without the
- * name, we can't allocate the TFM.
- */
- if (id > SCTP_AUTH_HMAC_ID_MAX ||
- !sctp_hmac_list[id].hmac_name) {
- id = 0;
- continue;
- }
-
- break;
+ if (sctp_hmac_supported(id))
+ return &sctp_hmac_list[id];
}
-
- if (id == 0)
- return NULL;
-
- return &sctp_hmac_list[id];
+ return NULL;
}
static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id)
@@ -606,7 +523,6 @@ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc,
void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
struct sctp_hmac_algo_param *hmacs)
{
- struct sctp_endpoint *ep;
__u16 id;
int i;
int n_params;
@@ -617,16 +533,9 @@ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
n_params = (ntohs(hmacs->param_hdr.length) -
sizeof(struct sctp_paramhdr)) >> 1;
- ep = asoc->ep;
for (i = 0; i < n_params; i++) {
id = ntohs(hmacs->hmac_ids[i]);
-
- /* Check the id is in the supported range */
- if (id > SCTP_AUTH_HMAC_ID_MAX)
- continue;
-
- /* If this TFM has been allocated, use this id */
- if (ep->auth_hmacs[id]) {
+ if (sctp_hmac_supported(id)) {
asoc->default_hmac_id = id;
break;
}
@@ -709,10 +618,9 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
struct sctp_shared_key *ep_key, gfp_t gfp)
{
struct sctp_auth_bytes *asoc_key;
- struct crypto_shash *tfm;
__u16 key_id, hmac_id;
- unsigned char *end;
int free_key = 0;
+ size_t data_len;
__u8 *digest;
/* Extract the info we need:
@@ -733,19 +641,17 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
free_key = 1;
}
- /* set up scatter list */
- end = skb_tail_pointer(skb);
-
- tfm = asoc->ep->auth_hmacs[hmac_id];
-
+ data_len = skb_tail_pointer(skb) - (unsigned char *)auth;
digest = (u8 *)(&auth->auth_hdr + 1);
- if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len))
- goto free;
-
- crypto_shash_tfm_digest(tfm, (u8 *)auth, end - (unsigned char *)auth,
- digest);
+ if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) {
+ hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len,
+ (const u8 *)auth, data_len, digest);
+ } else {
+ WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256);
+ hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len,
+ (const u8 *)auth, data_len, digest);
+ }
-free:
if (free_key)
sctp_auth_key_put(asoc_key);
}
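All signing now goes through the HMAC library's one-shot helpers, which take the raw key directly and cannot fail, removing the crypto_shash tfm plumbing (allocation, setkey, error paths). A sketch of the library call as used above, from <crypto/sha2.h>:

/* Sketch: one-shot keyed digest, no tfm allocation or setkey step. */
u8 mac[SHA256_DIGEST_SIZE];

hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len,	/* raw key */
			(const u8 *)auth, data_len,	/* message */
			mac);				/* output */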
@@ -788,14 +694,11 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
for (i = 0; i < hmacs->shmac_num_idents; i++) {
id = hmacs->shmac_idents[i];
- if (id > SCTP_AUTH_HMAC_ID_MAX)
+ if (!sctp_hmac_supported(id))
return -EOPNOTSUPP;
if (SCTP_AUTH_HMAC_ID_SHA1 == id)
has_sha1 = 1;
-
- if (!sctp_hmac_list[id].hmac_name)
- return -EOPNOTSUPP;
}
if (!has_sha1)
@@ -1021,8 +924,6 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep,
int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp)
{
- int err = -ENOMEM;
-
/* Allocate space for HMACS and CHUNKS authentication
* variables. There are arrays that we encode directly
* into parameters to make the rest of the operations easier.
@@ -1060,13 +961,6 @@ int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp)
ep->auth_chunk_list = auth_chunks;
}
- /* Allocate and initialize transorms arrays for supported
- * HMACs.
- */
- err = sctp_auth_init_hmacs(ep, gfp);
- if (err)
- goto nomem;
-
return 0;
nomem:
@@ -1075,7 +969,7 @@ nomem:
kfree(ep->auth_chunk_list);
ep->auth_hmacs_list = NULL;
ep->auth_chunk_list = NULL;
- return err;
+ return -ENOMEM;
}
void sctp_auth_free(struct sctp_endpoint *ep)
@@ -1084,6 +978,4 @@ void sctp_auth_free(struct sctp_endpoint *ep)
kfree(ep->auth_chunk_list);
ep->auth_hmacs_list = NULL;
ep->auth_chunk_list = NULL;
- sctp_auth_destroy_hmacs(ep->auth_hmacs);
- ep->auth_hmacs = NULL;
}
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index fd4f8243cc35..c655b571ca01 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -184,7 +184,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
* DATA.
*/
if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) {
- struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
+ const struct sctp_hmac *hmac_desc =
+ sctp_auth_asoc_get_hmac(asoc);
if (hmac_desc)
max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) +
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index 23359e522273..996c2018f0e6 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -173,7 +173,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
- mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+ mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0)
goto errout;
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 7e77b450697c..31e989dfe846 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -35,6 +35,15 @@
/* Forward declarations for internal helpers. */
static void sctp_endpoint_bh_rcv(struct work_struct *work);
+static void gen_cookie_auth_key(struct hmac_sha256_key *key)
+{
+ u8 raw_key[SCTP_COOKIE_KEY_SIZE];
+
+ get_random_bytes(raw_key, sizeof(raw_key));
+ hmac_sha256_preparekey(key, raw_key, sizeof(raw_key));
+ memzero_explicit(raw_key, sizeof(raw_key));
+}
+
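hmac_sha256_preparekey() precomputes the padded key schedule once, so later hmac_sha256() calls skip key setup and the raw key bytes can be wiped immediately, as gen_cookie_auth_key() does above. A sketch of the prepare-once, sign-many pairing from <crypto/sha2.h>:

/* Sketch: derive a prepared key, destroy the raw key, sign later. */
struct hmac_sha256_key key;
u8 raw[32], mac[SHA256_DIGEST_SIZE];

get_random_bytes(raw, sizeof(raw));
hmac_sha256_preparekey(&key, raw, sizeof(raw));
memzero_explicit(raw, sizeof(raw));	/* raw key no longer needed */

hmac_sha256(&key, data, data_len, mac);	/* one-shot signing */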
/*
* Initialize the base fields of the endpoint structure.
*/
@@ -45,10 +54,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
struct net *net = sock_net(sk);
struct sctp_shared_key *null_key;
- ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp);
- if (!ep->digest)
- return NULL;
-
ep->asconf_enable = net->sctp.addip_enable;
ep->auth_enable = net->sctp.auth_enable;
if (ep->auth_enable) {
@@ -90,8 +95,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
/* Get the receive buffer policy for this endpoint */
ep->rcvbuf_policy = net->sctp.rcvbuf_policy;
- /* Initialize the secret key used with cookie. */
- get_random_bytes(ep->secret_key, sizeof(ep->secret_key));
+ /* Generate the cookie authentication key. */
+ gen_cookie_auth_key(&ep->cookie_auth_key);
/* SCTP-AUTH extensions */
INIT_LIST_HEAD(&ep->endpoint_shared_keys);
@@ -118,7 +123,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
nomem_shkey:
sctp_auth_free(ep);
nomem:
- kfree(ep->digest);
return NULL;
}
@@ -205,9 +209,6 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
return;
}
- /* Free the digest buffer */
- kfree(ep->digest);
-
/* SCTP-AUTH: Free up AUTH related data such as shared keys
* chunks and hmacs arrays that were allocated
*/
@@ -218,7 +219,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
sctp_inq_free(&ep->base.inqueue);
sctp_bind_addr_free(&ep->base.bind_addr);
- memset(ep->secret_key, 0, sizeof(ep->secret_key));
+ memzero_explicit(&ep->cookie_auth_key, sizeof(ep->cookie_auth_key));
sk = ep->base.sk;
/* Remove and free the port */
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 74bff317e205..1ed281f3c355 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -52,21 +52,21 @@ static const struct snmp_mib sctp_snmp_list[] = {
SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG),
SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS),
SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS),
- SNMP_MIB_SENTINEL
};
/* Display sctp snmp mib statistics (/proc/net/sctp/snmp). */
static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
{
- unsigned long buff[SCTP_MIB_MAX];
+ unsigned long buff[ARRAY_SIZE(sctp_snmp_list)];
+ const int cnt = ARRAY_SIZE(sctp_snmp_list);
struct net *net = seq->private;
int i;
- memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX);
+ memset(buff, 0, sizeof(buff));
- snmp_get_cpu_field_batch(buff, sctp_snmp_list,
- net->sctp.sctp_statistics);
- for (i = 0; sctp_snmp_list[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, sctp_snmp_list, cnt,
+ net->sctp.sctp_statistics);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name,
buff[i]);
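With SNMP_MIB_SENTINEL gone, the table length travels explicitly through snmp_get_cpu_field_batch_cnt() rather than being discovered by scanning for a NULL name. A condensed sketch of the pattern, with the helper's signature inferred from the call sites in this patch and the MIB entry borrowed from the list above for illustration:

#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <net/snmp.h>

static const struct snmp_mib example_mib_list[] = {
        SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS),
        /* ... further entries, no sentinel ... */
};

static int example_seq_show(struct seq_file *seq, void __percpu *stats)
{
        unsigned long buff[ARRAY_SIZE(example_mib_list)] = {};
        const int cnt = ARRAY_SIZE(example_mib_list);
        int i;

        snmp_get_cpu_field_batch_cnt(buff, example_mib_list, cnt, stats);
        for (i = 0; i < cnt; i++)
                seq_printf(seq, "%-32s\t%ld\n",
                           example_mib_list[i].name, buff[i]);
        return 0;
}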
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index a5ccada55f2b..9dbc24af749b 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -34,6 +34,7 @@
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
@@ -437,7 +438,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl4->fl4_dport = daddr->v4.sin_port;
fl4->flowi4_proto = IPPROTO_SCTP;
if (asoc) {
- fl4->flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4->flowi4_dscp = dscp;
fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk);
fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
fl4->fl4_sport = htons(asoc->base.bind_addr.port);
@@ -1334,14 +1335,9 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Whether Cookie Preservative is enabled(1) or not(0) */
net->sctp.cookie_preserve_enable = 1;
- /* Default sctp sockets to use md5 as their hmac alg */
-#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5)
- net->sctp.sctp_hmac_alg = "md5";
-#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1)
- net->sctp.sctp_hmac_alg = "sha1";
-#else
- net->sctp.sctp_hmac_alg = NULL;
-#endif
+ /* Whether cookie authentication is enabled(1) or not(0) */
+ net->sctp.cookie_auth_enable =
+ !IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE);
/* Max.Burst - 4 */
net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST;
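The default cookie-authentication setting now comes from one IS_ENABLED() test instead of an #if/#elif chain: IS_ENABLED(x) is 1 when the Kconfig symbol is enabled and 0 otherwise, and it works in ordinary C expressions. Equivalent logic as a standalone sketch:

#include <linux/kconfig.h>
#include <linux/types.h>

/* Any default other than the Kconfig "none" choice turns cookie
 * authentication on; both branches stay visible to the compiler.
 */
static bool default_cookie_auth_enabled(void)
{
        return !IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE);
}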
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 3ead591c72fd..2c0017d058d4 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -30,7 +30,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <crypto/hash.h>
+#include <crypto/utils.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
@@ -1319,7 +1319,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc,
__u16 key_id)
{
struct sctp_authhdr auth_hdr;
- struct sctp_hmac *hmac_desc;
+ const struct sctp_hmac *hmac_desc;
struct sctp_chunk *retval;
/* Get the first hmac that the peer told us to use */
@@ -1674,8 +1674,10 @@ static struct sctp_cookie_param *sctp_pack_cookie(
* out on the network.
*/
retval = kzalloc(*cookie_len, GFP_ATOMIC);
- if (!retval)
- goto nodata;
+ if (!retval) {
+ *cookie_len = 0;
+ return NULL;
+ }
cookie = (struct sctp_signed_cookie *) retval->body;
@@ -1706,26 +1708,14 @@ static struct sctp_cookie_param *sctp_pack_cookie(
memcpy((__u8 *)(cookie + 1) +
ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
- if (sctp_sk(ep->base.sk)->hmac) {
- struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;
- int err;
-
- /* Sign the message. */
- err = crypto_shash_setkey(tfm, ep->secret_key,
- sizeof(ep->secret_key)) ?:
- crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize,
- cookie->signature);
- if (err)
- goto free_cookie;
+ /* Sign the cookie, if cookie authentication is enabled. */
+ if (sctp_sk(ep->base.sk)->cookie_auth_enable) {
+ static_assert(sizeof(cookie->mac) == SHA256_DIGEST_SIZE);
+ hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c,
+ bodysize, cookie->mac);
}
return retval;
-
-free_cookie:
- kfree(retval);
-nodata:
- *cookie_len = 0;
- return NULL;
}
/* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */
@@ -1740,7 +1730,6 @@ struct sctp_association *sctp_unpack_cookie(
struct sctp_signed_cookie *cookie;
struct sk_buff *skb = chunk->skb;
struct sctp_cookie *bear_cookie;
- __u8 *digest = ep->digest;
enum sctp_scope scope;
unsigned int len;
ktime_t kt;
@@ -1770,30 +1759,19 @@ struct sctp_association *sctp_unpack_cookie(
cookie = chunk->subh.cookie_hdr;
bear_cookie = &cookie->c;
- if (!sctp_sk(ep->base.sk)->hmac)
- goto no_hmac;
+ /* Verify the cookie's MAC, if cookie authentication is enabled. */
+ if (sctp_sk(ep->base.sk)->cookie_auth_enable) {
+ u8 mac[SHA256_DIGEST_SIZE];
- /* Check the signature. */
- {
- struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;
- int err;
-
- err = crypto_shash_setkey(tfm, ep->secret_key,
- sizeof(ep->secret_key)) ?:
- crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize,
- digest);
- if (err) {
- *error = -SCTP_IERROR_NOMEM;
+ hmac_sha256(&ep->cookie_auth_key, (const u8 *)bear_cookie,
+ bodysize, mac);
+ static_assert(sizeof(cookie->mac) == sizeof(mac));
+ if (crypto_memneq(mac, cookie->mac, sizeof(mac))) {
+ *error = -SCTP_IERROR_BAD_SIG;
goto fail;
}
}
- if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
- *error = -SCTP_IERROR_BAD_SIG;
- goto fail;
- }
-
-no_hmac:
/* IG Section 2.35.2:
* 3) Compare the port numbers and the verification tag contained
* within the COOKIE ECHO chunk to the actual port numbers and the
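Verification recomputes the MAC over the received cookie body and compares with crypto_memneq(), whose runtime, unlike memcmp(), does not depend on where the first mismatching byte sits. The pattern in isolation, as a minimal sketch:

#include <crypto/sha2.h>
#include <crypto/utils.h>

static bool cookie_mac_valid(const struct hmac_sha256_key *key,
                             const u8 *body, size_t bodysize,
                             const u8 expected[SHA256_DIGEST_SIZE])
{
        u8 mac[SHA256_DIGEST_SIZE];

        hmac_sha256(key, body, bodysize, mac);
        /* constant-time compare: no early exit on the first mismatch */
        return !crypto_memneq(mac, expected, sizeof(mac));
}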
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index a0524ba8d787..4cb8f393434d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -30,6 +30,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/utils.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
@@ -4361,7 +4362,7 @@ static enum sctp_ierror sctp_sf_authenticate(
struct sctp_shared_key *sh_key = NULL;
struct sctp_authhdr *auth_hdr;
__u8 *save_digest, *digest;
- struct sctp_hmac *hmac;
+ const struct sctp_hmac *hmac;
unsigned int sig_len;
__u16 key_id;
@@ -4416,7 +4417,7 @@ static enum sctp_ierror sctp_sf_authenticate(
sh_key, GFP_ATOMIC);
/* Discard the packet if the digests do not match */
- if (memcmp(save_digest, digest, sig_len)) {
+ if (crypto_memneq(save_digest, digest, sig_len)) {
kfree(save_digest);
return SCTP_IERROR_BAD_SIG;
}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 4921416434f9..ed8293a34240 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -37,7 +37,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <crypto/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/wait.h>
@@ -4987,7 +4986,7 @@ static int sctp_init_sock(struct sock *sk)
sp->default_rcv_context = 0;
sp->max_burst = net->sctp.max_burst;
- sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg;
+ sp->cookie_auth_enable = net->sctp.cookie_auth_enable;
/* Initialize default setup parameters. These parameters
* can be modified with the SCTP_INITMSG socket option or
@@ -5079,8 +5078,6 @@ static int sctp_init_sock(struct sock *sk)
if (!sp->ep)
return -ENOMEM;
- sp->hmac = NULL;
-
sk->sk_destruct = sctp_destruct_sock;
SCTP_DBG_OBJCNT_INC(sock);
@@ -5117,18 +5114,8 @@ static void sctp_destroy_sock(struct sock *sk)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
-/* Triggered when there are no references on the socket anymore */
-static void sctp_destruct_common(struct sock *sk)
-{
- struct sctp_sock *sp = sctp_sk(sk);
-
- /* Free up the HMAC transform. */
- crypto_free_shash(sp->hmac);
-}
-
static void sctp_destruct_sock(struct sock *sk)
{
- sctp_destruct_common(sk);
inet_sock_destruct(sk);
}
@@ -8530,22 +8517,8 @@ static int sctp_listen_start(struct sock *sk, int backlog)
{
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
- struct crypto_shash *tfm = NULL;
- char alg[32];
int err;
- /* Allocate HMAC for generating cookie. */
- if (!sp->hmac && sp->sctp_hmac_alg) {
- sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg);
- tfm = crypto_alloc_shash(alg, 0, 0);
- if (IS_ERR(tfm)) {
- net_info_ratelimited("failed to load transform for %s: %ld\n",
- sp->sctp_hmac_alg, PTR_ERR(tfm));
- return -ENOSYS;
- }
- sctp_sk(sk)->hmac = tfm;
- }
-
/*
* If a bind() or sctp_bindx() is not called prior to a listen()
* call that allows new associations to be accepted, the system
@@ -9561,7 +9534,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
* copy.
*/
newsp->ep = newep;
- newsp->hmac = NULL;
/* Hook this new socket in to the bind_hash list. */
head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk),
@@ -9581,16 +9553,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
if (err)
return err;
- /* New ep's auth_hmacs should be set if old ep's is set, in case
- * that net->sctp.auth_enable has been changed to 0 by users and
- * new ep's auth_hmacs couldn't be set in sctp_endpoint_init().
- */
- if (oldsp->ep->auth_hmacs) {
- err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL);
- if (err)
- return err;
- }
-
sctp_auto_asconf_init(newsp);
/* Move any messages in the old socket's receive queue that are for the
@@ -9723,7 +9685,6 @@ struct proto sctp_prot = {
static void sctp_v6_destruct_sock(struct sock *sk)
{
- sctp_destruct_common(sk);
inet6_sock_destruct(sk);
}
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index ee3eac338a9d..15e7db9a3ab2 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -174,7 +174,7 @@ static struct ctl_table sctp_net_table[] = {
},
{
.procname = "cookie_hmac_alg",
- .data = &init_net.sctp.sctp_hmac_alg,
+ .data = &init_net.sctp.cookie_auth_enable,
.maxlen = 8,
.mode = 0644,
.proc_handler = proc_sctp_do_hmac_alg,
@@ -388,10 +388,8 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(ctl->data, struct net,
- sctp.sctp_hmac_alg);
+ sctp.cookie_auth_enable);
struct ctl_table tbl;
- bool changed = false;
- char *none = "none";
char tmp[8] = {0};
int ret;
@@ -399,35 +397,26 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write,
if (write) {
tbl.data = tmp;
- tbl.maxlen = sizeof(tmp);
- } else {
- tbl.data = net->sctp.sctp_hmac_alg ? : none;
- tbl.maxlen = strlen(tbl.data);
- }
-
- ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
- if (write && ret == 0) {
-#ifdef CONFIG_CRYPTO_MD5
- if (!strncmp(tmp, "md5", 3)) {
- net->sctp.sctp_hmac_alg = "md5";
- changed = true;
+ tbl.maxlen = sizeof(tmp) - 1;
+ ret = proc_dostring(&tbl, 1, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+ if (!strcmp(tmp, "sha256")) {
+ net->sctp.cookie_auth_enable = 1;
+ return 0;
}
-#endif
-#ifdef CONFIG_CRYPTO_SHA1
- if (!strncmp(tmp, "sha1", 4)) {
- net->sctp.sctp_hmac_alg = "sha1";
- changed = true;
+ if (!strcmp(tmp, "none")) {
+ net->sctp.cookie_auth_enable = 0;
+ return 0;
}
-#endif
- if (!strncmp(tmp, "none", 4)) {
- net->sctp.sctp_hmac_alg = NULL;
- changed = true;
- }
- if (!changed)
- ret = -EINVAL;
+ return -EINVAL;
}
-
- return ret;
+ if (net->sctp.cookie_auth_enable)
+ tbl.data = (char *)"sha256";
+ else
+ tbl.data = (char *)"none";
+ tbl.maxlen = strlen(tbl.data);
+ return proc_dostring(&tbl, 0, buffer, lenp, ppos);
}
static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write,
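The handler keeps the historical string interface on top of a boolean: writes accept only "sha256" or "none", and reads report whichever is active. A small userspace sketch of the resulting behaviour (assuming the usual /proc/sys mount):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[16] = {0};
        int fd = open("/proc/sys/net/sctp/cookie_hmac_alg", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "sha256", 6) < 0)         /* "md5"/"sha1" now yield EINVAL */
                perror("write");
        lseek(fd, 0, SEEK_SET);
        if (read(fd, buf, sizeof(buf) - 1) > 0)
                printf("cookie_hmac_alg = %s", buf);    /* "sha256" or "none" */
        close(fd);
        return 0;
}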
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index a42ef3f77b96..0052f02756eb 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -974,13 +974,17 @@ static int smc_ib_add_dev(struct ib_device *ibdev)
smcibdev->pnetid[i]))
smc_pnetid_by_table_ib(smcibdev, i + 1);
smc_copy_netdev_ifindex(smcibdev, i);
- pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
- "%.16s%s\n",
- smcibdev->ibdev->name, i + 1,
- smcibdev->pnetid[i],
- smcibdev->pnetid_by_user[i] ?
- " (user defined)" :
- "");
+ if (smc_pnet_is_pnetid_set(smcibdev->pnetid[i]))
+ pr_warn_ratelimited("smc: ib device %s port %d has pnetid %.16s%s\n",
+ smcibdev->ibdev->name, i + 1,
+ smcibdev->pnetid[i],
+ smcibdev->pnetid_by_user[i] ?
+ " (user defined)" :
+ "");
+ else
+ pr_warn_ratelimited("smc: ib device %s port %d has no pnetid\n",
+ smcibdev->ibdev->name, i + 1);
+
}
schedule_work(&smcibdev->port_event_work);
return 0;
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 84f98e18c7db..a58ffb7a0610 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -518,10 +518,15 @@ static void smcd_register_dev(struct ism_dev *ism)
}
mutex_unlock(&smcd_dev_list.mutex);
- pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
- dev_name(&ism->dev), smcd->pnetid,
- smcd->pnetid_by_user ? " (user defined)" : "");
-
+ if (smc_pnet_is_pnetid_set(smcd->pnetid))
+ pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
+ dev_name(&ism->dev), smcd->pnetid,
+ smcd->pnetid_by_user ?
+ " (user defined)" :
+ "");
+ else
+ pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n",
+ dev_name(&ism->dev));
return;
}
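Both SMC call sites now branch on smc_pnet_is_pnetid_set() so devices without a pnetid log a distinct message instead of a blank field. A hypothetical sketch of what such a predicate checks (the real helper lives in the smc_pnet code and may differ):

#include <linux/types.h>

/* Hypothetical: treat a pnetid as unset when the fixed-width buffer
 * is zeroed or blank-padded from its first byte.
 */
static bool example_pnetid_set(const u8 *pnetid)
{
        return pnetid[0] != '\0' && pnetid[0] != ' ';
}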
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 76ad29e31d60..b90337f86e83 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -450,7 +450,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
return -ENOMEM;
new_pe->type = SMC_PNET_IB;
memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
- strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX);
+ strscpy(new_pe->ib_name, ib_name);
new_pe->ib_port = ib_port;
new_ibdev = true;
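strncpy() can leave ib_name without a terminating NUL when the source fills the buffer; strscpy() always terminates and, in the two-argument form used above, infers the bound from the destination array type. A minimal sketch of the idiom (struct and size are illustrative):

#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/string.h>
#include <linux/types.h>

struct example_entry {
        char ib_name[64];       /* stand-in for IB_DEVICE_NAME_MAX */
};

static void set_ib_name(struct example_entry *pe, const char *src)
{
        ssize_t ret = strscpy(pe->ib_name, src);        /* size inferred */

        if (ret == -E2BIG)
                pr_debug("ib_name truncated\n");        /* still NUL-terminated */
}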
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index e028bf658499..1574a83384f8 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2366,7 +2366,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) {
trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL,
"err_overload2!");
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
err = TIPC_ERR_OVERLOAD;
}
@@ -2458,7 +2458,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!");
/* Overload => reject message back to sender */
onode = tipc_own_addr(sock_net(sk));
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) {
trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL,
"@sk_enqueue!");
@@ -3657,7 +3657,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ,
skb_queue_len(&sk->sk_write_queue)) ||
nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP,
- atomic_read(&sk->sk_drops)))
+ sk_drops_read(sk)))
goto stat_msg_cancel;
if (tsk->cong_link_cnt &&
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 367666aa07b8..4012c4372d4c 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -27,17 +27,19 @@ static const struct snmp_mib tls_mib_list[] = {
SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK),
SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR),
SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED),
- SNMP_MIB_SENTINEL
};
static int tls_statistics_seq_show(struct seq_file *seq, void *v)
{
- unsigned long buf[LINUX_MIB_TLSMAX] = {};
+ unsigned long buf[ARRAY_SIZE(tls_mib_list)];
+ const int cnt = ARRAY_SIZE(tls_mib_list);
struct net *net = seq->private;
int i;
- snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics);
- for (i = 0; tls_mib_list[i].name; i++)
+ memset(buf, 0, sizeof(buf));
+ snmp_get_cpu_field_batch_cnt(buf, tls_mib_list, cnt,
+ net->mib.tls_statistics);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]);
return 0;
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index bebb355f3ffe..0538948d5fd9 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1029,12 +1029,7 @@ static int vsock_getname(struct socket *sock,
vm_addr = &vsk->local_addr;
}
- /* sys_getsockname() and sys_getpeername() pass us a
- * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately
- * that macro is defined in socket.c instead of .h, so we hardcode its
- * value here.
- */
- BUILD_BUG_ON(sizeof(*vm_addr) > 128);
+ BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage));
memcpy(addr, vm_addr, sizeof(*vm_addr));
err = sizeof(*vm_addr);
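The literal 128 encoded the size of the buffer that sys_getsockname()/sys_getpeername() pass in; naming sockaddr_storage states the actual contract and tracks it automatically. The same guarantee as a standalone compile-time assertion:

#include <linux/build_bug.h>
#include <linux/socket.h>       /* struct sockaddr_storage */
#include <linux/vm_sockets.h>   /* struct sockaddr_vm */

static_assert(sizeof(struct sockaddr_vm) <= sizeof(struct sockaddr_storage),
              "vsock addresses must fit the generic getname() buffer");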
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c5035a9bc3bb..62486f866975 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2594,7 +2594,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
static dscp_t xfrm_get_dscp(const struct flowi *fl, int family)
{
if (family == AF_INET)
- return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos);
+ return fl->u.ip4.flowi4_dscp;
return 0;
}
@@ -3462,7 +3462,7 @@ decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reve
}
fl4->flowi4_proto = flkeys->basic.ip_proto;
- fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK;
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos);
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -3594,7 +3594,7 @@ static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family,
fl1->flowi_oif = fl->flowi_oif;
fl1->flowi_mark = fl->flowi_mark;
- fl1->flowi_tos = fl->flowi_tos;
+ fl1->flowi_dscp = fl->flowi_dscp;
nf_nat_decode_session(newskb, fl1, family);
ret = false;
@@ -3881,12 +3881,18 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
}
skb_dst_force(skb);
- if (!skb_dst(skb)) {
+ dst = skb_dst(skb);
+ if (!dst) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
return 0;
}
- dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
+ /* Ignore the return value of skb_dstref_steal(); xfrm_lookup()
+ * takes care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
+
+ dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE);
if (IS_ERR(dst)) {
res = 0;
dst = NULL;
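The flowi4 key now carries a typed dscp_t, which is why the explicit "& ~INET_ECN_MASK" disappears from decode_session4(): inet_dsfield_to_dscp() masks the two ECN bits during conversion. A minimal round-trip sketch:

#include <net/inet_dscp.h>

static u8 normalized_tos(u8 tos)
{
        dscp_t dscp = inet_dsfield_to_dscp(tos);  /* ECN bits masked off */

        return inet_dscp_to_dsfield(dscp);        /* == tos & ~INET_ECN_MASK */
}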
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 8e07dd614b0b..5e1fd6b1d503 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -45,21 +45,21 @@ static const struct snmp_mib xfrm_mib_list[] = {
SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR),
SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR),
SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE),
- SNMP_MIB_SENTINEL
};
static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
{
- unsigned long buff[LINUX_MIB_XFRMMAX];
+ unsigned long buff[ARRAY_SIZE(xfrm_mib_list)];
+ const int cnt = ARRAY_SIZE(xfrm_mib_list);
struct net *net = seq->private;
int i;
- memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX);
+ memset(buff, 0, sizeof(buff));
xfrm_state_update_stats(net);
- snmp_get_cpu_field_batch(buff, xfrm_mib_list,
- net->mib.xfrm_statistics);
- for (i = 0; xfrm_mib_list[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, xfrm_mib_list, cnt,
+ net->mib.xfrm_statistics);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name,
buff[i]);