Diffstat (limited to 'net')
600 files changed, 22981 insertions, 23771 deletions
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index c40b98f7743cd..868d28583c0ae 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -20,9 +20,8 @@ static int lowpan_ndisc_parse_802154_options(const struct net_device *dev, switch (nd_opt->nd_opt_len) { case NDISC_802154_SHORT_ADDR_LENGTH: if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type]) - ND_PRINTK(2, warn, - "%s: duplicated short addr ND6 option found: type=%d\n", - __func__, nd_opt->nd_opt_type); + net_dbg_ratelimited("%s: duplicated short addr ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); else ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt; return 1; @@ -63,8 +62,7 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr, IEEE802154_SHORT_ADDR_LEN, 0); if (!lladdr_short) { - ND_PRINTK(2, warn, - "NA: invalid short link-layer address length\n"); + net_dbg_ratelimited("NA: invalid short link-layer address length\n"); return; } } @@ -75,8 +73,7 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr, IEEE802154_SHORT_ADDR_LEN, 0); if (!lladdr_short) { - ND_PRINTK(2, warn, - "NA: invalid short link-layer address length\n"); + net_dbg_ratelimited("NA: invalid short link-layer address length\n"); return; } } @@ -209,9 +206,8 @@ static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net, sllao, tokenized, valid_lft, prefered_lft); if (err) - ND_PRINTK(2, warn, - "RA: could not add a short address based address for prefix: %pI6c\n", - &pinfo->prefix); + net_dbg_ratelimited("RA: could not add a short address based address for prefix: %pI6c\n", + &pinfo->prefix); } } #endif diff --git a/net/802/Makefile b/net/802/Makefile index bfed80221b8bc..99abc29d537c0 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -3,12 +3,11 @@ # Makefile for the Linux 802.x protocol layers. # -# Check the p8022 selections against net/core/Makefile. 
-obj-$(CONFIG_LLC) += p8022.o psnap.o +obj-$(CONFIG_LLC) += psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_ATALK) += p8022.o psnap.o +obj-$(CONFIG_ATALK) += psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o obj-$(CONFIG_MRP) += mrp.o diff --git a/net/802/garp.c b/net/802/garp.c index 27f0ab146026b..2d1ffc4d94626 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -414,7 +414,7 @@ static void garp_join_timer_arm(struct garp_applicant *app) static void garp_join_timer(struct timer_list *t) { - struct garp_applicant *app = from_timer(app, t, join_timer); + struct garp_applicant *app = timer_container_of(app, t, join_timer); spin_lock(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); diff --git a/net/802/mrp.c b/net/802/mrp.c index e0c96d0da8d59..23a88305f900c 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -599,7 +599,7 @@ static void mrp_join_timer_arm(struct mrp_applicant *app) static void mrp_join_timer(struct timer_list *t) { - struct mrp_applicant *app = from_timer(app, t, join_timer); + struct mrp_applicant *app = timer_container_of(app, t, join_timer); spin_lock(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); @@ -621,7 +621,7 @@ static void mrp_periodic_timer_arm(struct mrp_applicant *app) static void mrp_periodic_timer(struct timer_list *t) { - struct mrp_applicant *app = from_timer(app, t, periodic_timer); + struct mrp_applicant *app = timer_container_of(app, t, periodic_timer); spin_lock(&app->lock); if (likely(app->active)) { diff --git a/net/802/p8022.c b/net/802/p8022.c deleted file mode 100644 index 78c25168d7c91..0000000000000 --- a/net/802/p8022.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET3: Support for 802.2 demultiplexing off Ethernet - * - * Demultiplex 802.2 encoded protocols. We match the entry by the - * SSAP/DSAP pair and then deliver to the registered datalink that - * matches. The control byte is ignored and handling of such items - * is up to the routine passed the frame. - * - * Unlike the 802.3 datalink we have a list of 802.2 entries as - * there are multiple protocols to demux. The list is currently - * short (3 or 4 entries at most). The current demux assumes this. 
- */ -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <net/datalink.h> -#include <linux/mm.h> -#include <linux/in.h> -#include <linux/init.h> -#include <net/llc.h> -#include <net/p8022.h> - -static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, - const unsigned char *dest) -{ - llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); - return 0; -} - -struct datalink_proto *register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)) -{ - struct datalink_proto *proto; - - proto = kmalloc(sizeof(*proto), GFP_ATOMIC); - if (proto) { - proto->type[0] = type; - proto->header_length = 3; - proto->request = p8022_request; - proto->sap = llc_sap_open(type, func); - if (!proto->sap) { - kfree(proto); - proto = NULL; - } - } - return proto; -} - -void unregister_8022_client(struct datalink_proto *proto) -{ - llc_sap_put(proto->sap); - kfree(proto); -} - -EXPORT_SYMBOL(register_8022_client); -EXPORT_SYMBOL(unregister_8022_client); - -MODULE_DESCRIPTION("Support for 802.2 demultiplexing off Ethernet"); -MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 41be38264493d..fda3a80e9340c 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -23,7 +23,6 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> -#include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> @@ -358,6 +357,35 @@ static int __vlan_device_event(struct net_device *dev, unsigned long event) return err; } +static void vlan_vid0_add(struct net_device *dev) +{ + struct vlan_info *vlan_info; + int err; + + if (!(dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) + return; + + pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); + + err = vlan_vid_add(dev, htons(ETH_P_8021Q), 0); + if (err) + return; + + vlan_info = rtnl_dereference(dev->vlan_info); + vlan_info->auto_vid0 = true; +} + +static void vlan_vid0_del(struct net_device *dev) +{ + struct vlan_info *vlan_info = rtnl_dereference(dev->vlan_info); + + if (!vlan_info || !vlan_info->auto_vid0) + return; + + vlan_info->auto_vid0 = false; + vlan_vid_del(dev, htons(ETH_P_8021Q), 0); +} + static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -379,15 +407,10 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, return notifier_from_errno(err); } - if ((event == NETDEV_UP) && - (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { - pr_info("adding VLAN 0 to HW filter on device %s\n", - dev->name); - vlan_vid_add(dev, htons(ETH_P_8021Q), 0); - } - if (event == NETDEV_DOWN && - (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) - vlan_vid_del(dev, htons(ETH_P_8021Q), 0); + if (event == NETDEV_UP) + vlan_vid0_add(dev); + else if (event == NETDEV_DOWN) + vlan_vid0_del(dev); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) @@ -447,7 +470,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, list_add(&vlandev->close_list, &close_list); } - dev_close_many(&close_list, false); + netif_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, @@ -460,7 +483,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_UP: /* Put all VLANs for this dev in the up state too. 
*/ vlan_group_for_each_dev(grp, i, vlandev) { - flgs = dev_get_flags(vlandev); + flgs = netif_get_flags(vlandev); if (flgs & IFF_UP) continue; @@ -742,3 +765,4 @@ module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); +MODULE_IMPORT_NS("NETDEV_INTERNAL"); diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 5eaf38875554b..c7ffe591d5936 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -33,6 +33,7 @@ struct vlan_info { struct vlan_group grp; struct list_head vid_list; unsigned int nr_vids; + bool auto_vid0; struct rcu_head rcu; }; diff --git a/net/9p/client.c b/net/9p/client.c index 61461b9fa1343..5c1ca57ccd285 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1704,7 +1704,7 @@ p9_client_write_subreq(struct netfs_io_subrequest *subreq) start, len, &subreq->io_iter); } if (IS_ERR(req)) { - netfs_write_subrequest_terminated(subreq, PTR_ERR(req), false); + netfs_write_subrequest_terminated(subreq, PTR_ERR(req)); return; } @@ -1712,7 +1712,7 @@ p9_client_write_subreq(struct netfs_io_subrequest *subreq) if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); - netfs_write_subrequest_terminated(subreq, err, false); + netfs_write_subrequest_terminated(subreq, err); return; } @@ -1724,7 +1724,7 @@ p9_client_write_subreq(struct netfs_io_subrequest *subreq) p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", len); p9_req_put(clnt, req); - netfs_write_subrequest_terminated(subreq, written, false); + netfs_write_subrequest_terminated(subreq, written); } EXPORT_SYMBOL(p9_client_write_subreq); diff --git a/net/Kconfig b/net/Kconfig index c3fca69a7c834..d5865cf197995 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -68,13 +68,17 @@ config SKB_EXTENSIONS config NET_DEVMEM def_bool y + select GENERIC_ALLOCATOR depends on DMA_SHARED_BUFFER - depends on GENERIC_ALLOCATOR depends on PAGE_POOL config NET_SHAPER bool +config NET_CRC32C + bool + select CRC32 + menu "Networking options" source "net/packet/Kconfig" @@ -243,9 +247,8 @@ source "net/ipv4/netfilter/Kconfig" source "net/ipv6/netfilter/Kconfig" source "net/bridge/netfilter/Kconfig" -endif +endif # if NETFILTER -source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" source "net/tipc/Kconfig" @@ -405,9 +408,9 @@ config NET_DROP_MONITOR just checking the various proc files and other utilities for drop statistics, say N here. 
-endmenu +endmenu # Network testing -endmenu +endmenu # Networking options source "net/ax25/Kconfig" source "net/can/Kconfig" diff --git a/net/Makefile b/net/Makefile index 60ed5190eda8d..aac960c41db6d 100644 --- a/net/Makefile +++ b/net/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_PHONET) += phonet/ ifneq ($(CONFIG_VLAN_8021Q),) obj-y += 8021q/ endif -obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ obj-$(CONFIG_RDS) += rds/ obj-$(CONFIG_WIRELESS) += wireless/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 9c787e2e4b173..4744e3fd45447 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -35,6 +35,7 @@ #include <linux/seq_file.h> #include <linux/export.h> #include <linux/etherdevice.h> +#include <linux/refcount.h> int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME; int sysctl_aarp_tick_time = AARP_TICK_TIME; @@ -44,6 +45,7 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME; /* Lists of aarp entries */ /** * struct aarp_entry - AARP entry + * @refcnt: Reference count * @last_sent: Last time we xmitted the aarp request * @packet_queue: Queue of frames wait for resolution * @status: Used for proxy AARP @@ -55,6 +57,7 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME; * @next: Next entry in chain */ struct aarp_entry { + refcount_t refcnt; /* These first two are only used for unresolved entries */ unsigned long last_sent; struct sk_buff_head packet_queue; @@ -79,6 +82,17 @@ static DEFINE_RWLOCK(aarp_lock); /* Used to walk the list and purge/kick entries. */ static struct timer_list aarp_timer; +static inline void aarp_entry_get(struct aarp_entry *a) +{ + refcount_inc(&a->refcnt); +} + +static inline void aarp_entry_put(struct aarp_entry *a) +{ + if (refcount_dec_and_test(&a->refcnt)) + kfree(a); +} + /* * Delete an aarp queue * @@ -87,7 +101,7 @@ static struct timer_list aarp_timer; static void __aarp_expire(struct aarp_entry *a) { skb_queue_purge(&a->packet_queue); - kfree(a); + aarp_entry_put(a); } /* @@ -380,9 +394,11 @@ static void aarp_purge(void) static struct aarp_entry *aarp_alloc(void) { struct aarp_entry *a = kmalloc(sizeof(*a), GFP_ATOMIC); + if (!a) + return NULL; - if (a) - skb_queue_head_init(&a->packet_queue); + refcount_set(&a->refcnt, 1); + skb_queue_head_init(&a->packet_queue); return a; } @@ -477,6 +493,7 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa) entry->dev = atif->dev; write_lock_bh(&aarp_lock); + aarp_entry_get(entry); hash = sa->s_node % (AARP_HASH_SIZE - 1); entry->next = proxies[hash]; @@ -502,6 +519,7 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa) retval = 1; } + aarp_entry_put(entry); write_unlock_bh(&aarp_lock); out: return retval; diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index 9c1241292d1d2..01787fb6a7bce 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -181,7 +181,7 @@ static int atalk_seq_socket_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), s->sk_state, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(s))); + from_kuid_munged(seq_user_ns(seq), sk_uid(s))); out: return 0; } diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index b068651984fe3..30242fe103419 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -169,7 +169,7 @@ found: static void atalk_destroy_timer(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); if (sk_has_allocations(sk)) { sk->sk_timer.expires = jiffies + 
SOCK_DESTROY_TIME; @@ -576,6 +576,7 @@ static int atrtr_create(struct rtentry *r, struct net_device *devhint) /* Fill in the routing entry */ rt->target = ta->sat_addr; + dev_put(rt->dev); /* Release old device */ dev_hold(devhint); rt->dev = devhint; rt->flags = r->rt_flags; diff --git a/net/atm/clip.c b/net/atm/clip.c index 61b5b700817de..f7a5565e794ef 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -45,7 +45,8 @@ #include <net/atmclip.h> static struct net_device *clip_devs; -static struct atm_vcc *atmarpd; +static struct atm_vcc __rcu *atmarpd; +static DEFINE_MUTEX(atmarpd_lock); static struct timer_list idle_timer; static const struct neigh_ops clip_neigh_ops; @@ -53,24 +54,35 @@ static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) { struct sock *sk; struct atmarp_ctrl *ctrl; + struct atm_vcc *vcc; struct sk_buff *skb; + int err = 0; pr_debug("(%d)\n", type); - if (!atmarpd) - return -EUNATCH; + + rcu_read_lock(); + vcc = rcu_dereference(atmarpd); + if (!vcc) { + err = -EUNATCH; + goto unlock; + } skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC); - if (!skb) - return -ENOMEM; + if (!skb) { + err = -ENOMEM; + goto unlock; + } ctrl = skb_put(skb, sizeof(struct atmarp_ctrl)); ctrl->type = type; ctrl->itf_num = itf; ctrl->ip = ip; - atm_force_charge(atmarpd, skb->truesize); + atm_force_charge(vcc, skb->truesize); - sk = sk_atm(atmarpd); + sk = sk_atm(vcc); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); - return 0; +unlock: + rcu_read_unlock(); + return err; } static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry) @@ -193,12 +205,6 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("\n"); - if (!clip_devs) { - atm_return(vcc, skb->truesize); - kfree_skb(skb); - return; - } - if (!skb) { pr_debug("removing VCC %p\n", clip_vcc); if (clip_vcc->entry) @@ -208,6 +214,11 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) return; } atm_return(vcc, skb->truesize); + if (!clip_devs) { + kfree_skb(skb); + return; + } + skb->dev = clip_vcc->entry ? 
clip_vcc->entry->neigh->dev : clip_devs; /* clip_vcc->entry == NULL if we don't have an IP address yet */ if (!skb->dev) { @@ -418,6 +429,8 @@ static int clip_mkip(struct atm_vcc *vcc, int timeout) if (!vcc->push) return -EBADFD; + if (vcc->user_back) + return -EINVAL; clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL); if (!clip_vcc) return -ENOMEM; @@ -608,17 +621,27 @@ static void atmarpd_close(struct atm_vcc *vcc) { pr_debug("\n"); - rtnl_lock(); - atmarpd = NULL; + mutex_lock(&atmarpd_lock); + RCU_INIT_POINTER(atmarpd, NULL); + mutex_unlock(&atmarpd_lock); + + synchronize_rcu(); skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); - rtnl_unlock(); pr_debug("(done)\n"); module_put(THIS_MODULE); } +static int atmarpd_send(struct atm_vcc *vcc, struct sk_buff *skb) +{ + atm_return_tx(vcc, skb); + dev_kfree_skb_any(skb); + return 0; +} + static const struct atmdev_ops atmarpd_dev_ops = { - .close = atmarpd_close + .close = atmarpd_close, + .send = atmarpd_send }; @@ -632,15 +655,18 @@ static struct atm_dev atmarpd_dev = { static int atm_init_atmarp(struct atm_vcc *vcc) { - rtnl_lock(); + if (vcc->push == clip_push) + return -EINVAL; + + mutex_lock(&atmarpd_lock); if (atmarpd) { - rtnl_unlock(); + mutex_unlock(&atmarpd_lock); return -EADDRINUSE; } mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); - atmarpd = vcc; + rcu_assign_pointer(atmarpd, vcc); set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); /* allow replies and avoid getting closed if signaling dies */ @@ -649,13 +675,14 @@ static int atm_init_atmarp(struct atm_vcc *vcc) vcc->push = NULL; vcc->pop = NULL; /* crash */ vcc->push_oam = NULL; /* crash */ - rtnl_unlock(); + mutex_unlock(&atmarpd_lock); return 0; } static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct atm_vcc *vcc = ATM_SD(sock); + struct sock *sk = sock->sk; int err = 0; switch (cmd) { @@ -676,14 +703,18 @@ static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = clip_create(arg); break; case ATMARPD_CTRL: + lock_sock(sk); err = atm_init_atmarp(vcc); if (!err) { sock->state = SS_CONNECTED; __module_get(THIS_MODULE); } + release_sock(sk); break; case ATMARP_MKIP: + lock_sock(sk); err = clip_mkip(vcc, arg); + release_sock(sk); break; case ATMARP_SETENTRY: err = clip_setentry(vcc, (__force __be32)arg); diff --git a/net/atm/common.c b/net/atm/common.c index 9b75699992ff9..d7f7976ea13ac 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -635,6 +635,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) skb->dev = NULL; /* for paths shared with net_device interfaces */ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { + atm_return_tx(vcc, skb); kfree_skb(skb); error = -EFAULT; goto out; diff --git a/net/atm/lec.c b/net/atm/lec.c index ded2f0df2ee66..afb8d3eb21850 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -124,6 +124,7 @@ static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; /* Device structures */ static struct net_device *dev_lec[MAX_LEC_ITF]; +static DEFINE_MUTEX(lec_mutex); #if IS_ENABLED(CONFIG_BRIDGE) static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) @@ -685,6 +686,7 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) int bytes_left; struct atmlec_ioc ioc_data; + lockdep_assert_held(&lec_mutex); /* Lecd must be up in this case */ bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc)); if (bytes_left != 0) @@ -710,6 +712,7 @@ static int 
lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) static int lec_mcast_attach(struct atm_vcc *vcc, int arg) { + lockdep_assert_held(&lec_mutex); if (arg < 0 || arg >= MAX_LEC_ITF) return -EINVAL; arg = array_index_nospec(arg, MAX_LEC_ITF); @@ -725,6 +728,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg) int i; struct lec_priv *priv; + lockdep_assert_held(&lec_mutex); if (arg < 0) arg = 0; if (arg >= MAX_LEC_ITF) @@ -742,6 +746,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg) snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i); if (register_netdev(dev_lec[i])) { free_netdev(dev_lec[i]); + dev_lec[i] = NULL; return -EINVAL; } @@ -904,7 +909,6 @@ static void *lec_itf_walk(struct lec_state *state, loff_t *l) v = (dev && netdev_priv(dev)) ? lec_priv_walk(state, l, netdev_priv(dev)) : NULL; if (!v && dev) { - dev_put(dev); /* Partial state reset for the next time we get called */ dev = NULL; } @@ -928,6 +932,7 @@ static void *lec_seq_start(struct seq_file *seq, loff_t *pos) { struct lec_state *state = seq->private; + mutex_lock(&lec_mutex); state->itf = 0; state->dev = NULL; state->locked = NULL; @@ -945,8 +950,9 @@ static void lec_seq_stop(struct seq_file *seq, void *v) if (state->dev) { spin_unlock_irqrestore(&state->locked->lec_arp_lock, state->flags); - dev_put(state->dev); + state->dev = NULL; } + mutex_unlock(&lec_mutex); } static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -1003,6 +1009,7 @@ static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -ENOIOCTLCMD; } + mutex_lock(&lec_mutex); switch (cmd) { case ATMLEC_CTRL: err = lecd_attach(vcc, (int)arg); @@ -1017,6 +1024,7 @@ static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; } + mutex_unlock(&lec_mutex); return err; } @@ -1551,7 +1559,7 @@ static void lec_arp_expire_arp(struct timer_list *t) { struct lec_arp_table *entry; - entry = from_timer(entry, t, timer); + entry = timer_container_of(entry, t, timer); pr_debug("\n"); if (entry->status == ESI_ARP_PENDING) { @@ -1572,7 +1580,8 @@ static void lec_arp_expire_arp(struct timer_list *t) static void lec_arp_expire_vcc(struct timer_list *t) { unsigned long flags; - struct lec_arp_table *to_remove = from_timer(to_remove, t, timer); + struct lec_arp_table *to_remove = timer_container_of(to_remove, t, + timer); struct lec_priv *priv = to_remove->priv; timer_delete(&to_remove->timer); diff --git a/net/atm/raw.c b/net/atm/raw.c index 2b5f78a7ec3e4..1e6511ec842cb 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -36,7 +36,7 @@ static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("(%d) %d -= %d\n", vcc->vci, sk_wmem_alloc_get(sk), ATM_SKB(skb)->acct_truesize); - WARN_ON(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, &sk->sk_wmem_alloc)); + atm_return_tx(vcc, skb); dev_kfree_skb_any(skb); sk->sk_write_space(sk); } diff --git a/net/atm/resources.c b/net/atm/resources.c index 995d29e7fb138..b19d851e1f443 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -146,11 +146,10 @@ void atm_dev_deregister(struct atm_dev *dev) */ mutex_lock(&atm_dev_mutex); list_del(&dev->dev_list); - mutex_unlock(&atm_dev_mutex); - atm_dev_release_vccs(dev); atm_unregister_sysfs(dev); atm_proc_dev_deregister(dev); + mutex_unlock(&atm_dev_mutex); atm_dev_put(dev); } diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index b790bb92ed1c9..6ef8b2a57a9bf 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -287,7 +287,7 @@ void ax25_destroy_socket(ax25_cb *); */ static void 
ax25_destroy_timer(struct timer_list *t) { - ax25_cb *ax25 = from_timer(ax25, t, dtimer); + ax25_cb *ax25 = timer_container_of(ax25, t, dtimer); struct sock *sk; sk=ax25->sk; diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 8d9fba0690011..0c9e7775aa54e 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -64,7 +64,7 @@ void ax25_ds_set_timer(ax25_dev *ax25_dev) static void ax25_ds_timeout(struct timer_list *t) { - ax25_dev *ax25_dev = from_timer(ax25_dev, t, dama.slave_timer); + ax25_dev *ax25_dev = timer_container_of(ax25_dev, t, dama.slave_timer); ax25_cb *ax25; if (ax25_dev == NULL || !ax25_dev->dama.slave) diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c index 3891a3923d6cb..a69bfbc8b6790 100644 --- a/net/ax25/ax25_timer.c +++ b/net/ax25/ax25_timer.c @@ -121,7 +121,7 @@ EXPORT_SYMBOL(ax25_display_timer); static void ax25_heartbeat_expiry(struct timer_list *t) { int proto = AX25_PROTO_STD_SIMPLEX; - ax25_cb *ax25 = from_timer(ax25, t, timer); + ax25_cb *ax25 = timer_container_of(ax25, t, timer); if (ax25->ax25_dev) proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]; @@ -145,7 +145,7 @@ static void ax25_heartbeat_expiry(struct timer_list *t) static void ax25_t1timer_expiry(struct timer_list *t) { - ax25_cb *ax25 = from_timer(ax25, t, t1timer); + ax25_cb *ax25 = timer_container_of(ax25, t, t1timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: @@ -164,7 +164,7 @@ static void ax25_t1timer_expiry(struct timer_list *t) static void ax25_t2timer_expiry(struct timer_list *t) { - ax25_cb *ax25 = from_timer(ax25, t, t2timer); + ax25_cb *ax25 = timer_container_of(ax25, t, t2timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: @@ -183,7 +183,7 @@ static void ax25_t2timer_expiry(struct timer_list *t) static void ax25_t3timer_expiry(struct timer_list *t) { - ax25_cb *ax25 = from_timer(ax25, t, t3timer); + ax25_cb *ax25 = timer_container_of(ax25, t, t3timer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: @@ -204,7 +204,7 @@ static void ax25_t3timer_expiry(struct timer_list *t) static void ax25_idletimer_expiry(struct timer_list *t) { - ax25_cb *ax25 = from_timer(ax25, t, idletimer); + ax25_cb *ax25 = timer_container_of(ax25, t, idletimer); switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { case AX25_PROTO_STD_SIMPLEX: diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index c0c982b6f0292..49e5861b58ec2 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -14,6 +14,7 @@ #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/string.h> +#include <linux/types.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batman_adv.h> diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 2c486374af581..7ce9abbdb4b47 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -11,10 +11,8 @@ #include <linux/netlink.h> #include <linux/skbuff.h> -#include <linux/types.h> extern char batadv_routing_algo[]; -extern struct list_head batadv_hardif_list; void batadv_algo_init(void); struct batadv_algo_ops *batadv_algo_get(const char *name); diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 458879d21d663..54fe38b3b2fd3 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -791,6 +791,7 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) struct batadv_ogm_packet 
*batadv_ogm_packet; struct batadv_hard_iface *primary_if, *tmp_hard_iface; int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len; + struct list_head *iter; u32 seqno; u16 tvlv_len = 0; unsigned long send_time; @@ -847,10 +848,7 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) * interfaces. */ rcu_read_lock(); - list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { - if (tmp_hard_iface->mesh_iface != hard_iface->mesh_iface) - continue; - + netdev_for_each_lower_private_rcu(hard_iface->mesh_iface, tmp_hard_iface, iter) { if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) continue; @@ -1567,6 +1565,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, bool is_my_oldorig = false; bool is_my_addr = false; bool is_my_orig = false; + struct list_head *iter; ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset); ethhdr = eth_hdr(skb); @@ -1603,11 +1602,9 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, ogm_packet->version, has_directlink_flag); rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->if_status != BATADV_IF_ACTIVE) - continue; - if (hard_iface->mesh_iface != if_incoming->mesh_iface) + netdev_for_each_lower_private_rcu(if_incoming->mesh_iface, hard_iface, iter) { + if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (batadv_compare_eth(ethhdr->h_source, @@ -1668,13 +1665,10 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if_incoming, BATADV_IF_DEFAULT); rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - if (!kref_get_unless_zero(&hard_iface->refcount)) continue; @@ -2142,6 +2136,7 @@ batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; + struct list_head *iter; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; @@ -2158,11 +2153,7 @@ batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, i_hardif++; } } else { - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, - list) { - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (i_hardif++ < i_hardif_s) continue; diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index c16c2e60889d2..de94447142642 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -212,6 +212,7 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; + struct list_head *iter; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; @@ -227,10 +228,7 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, i_hardif++; } } else { - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (i_hardif++ < i_hardif_s) continue; diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 70d6778da0d7b..cb16c1ed2a58f 100644 --- a/net/batman-adv/bat_v_elp.c +++ 
b/net/batman-adv/bat_v_elp.c @@ -35,7 +35,6 @@ #include <net/cfg80211.h> #include <uapi/linux/batadv_packet.h> -#include "bat_algo.h" #include "bat_v_ogm.h" #include "hard-interface.h" #include "log.h" @@ -472,15 +471,12 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface) { struct batadv_hard_iface *hard_iface; + struct list_head *iter; /* update orig field of every elp iface belonging to this mesh */ rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (primary_iface->mesh_iface != hard_iface->mesh_iface) - continue; - + netdev_for_each_lower_private_rcu(primary_iface->mesh_iface, hard_iface, iter) batadv_v_elp_iface_activate(primary_iface, hard_iface); - } rcu_read_unlock(); } diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index b86bb647da5b7..e3870492dab77 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -22,7 +22,6 @@ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/random.h> -#include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> @@ -33,7 +32,6 @@ #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> -#include "bat_algo.h" #include "hard-interface.h" #include "hash.h" #include "log.h" @@ -265,6 +263,7 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) struct batadv_ogm2_packet *ogm_packet; struct sk_buff *skb, *skb_tmp; unsigned char *ogm_buff; + struct list_head *iter; int ogm_buff_len; u16 tvlv_len = 0; int ret; @@ -301,10 +300,7 @@ static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) /* broadcast on every interface */ rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (!kref_get_unless_zero(&hard_iface->refcount)) continue; @@ -859,6 +855,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; u32 ogm_throughput, link_throughput, path_throughput; + struct list_head *iter; int ret; ethhdr = eth_hdr(skb); @@ -921,13 +918,10 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, BATADV_IF_DEFAULT); rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - if (!kref_get_unless_zero(&hard_iface->refcount)) continue; diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 558d39dffc233..bace57e4f9a51 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -438,15 +438,13 @@ out: } static struct batadv_hard_iface * -batadv_hardif_get_active(const struct net_device *mesh_iface) +batadv_hardif_get_active(struct net_device *mesh_iface) { struct batadv_hard_iface *hard_iface; + struct list_head *iter; rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->mesh_iface != mesh_iface) - continue; - + netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) { if (hard_iface->if_status == BATADV_IF_ACTIVE && kref_get_unless_zero(&hard_iface->refcount)) goto out; 
@@ -508,19 +506,17 @@ batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface) static void batadv_check_known_mac_addr(const struct batadv_hard_iface *hard_iface) { - const struct net_device *mesh_iface = hard_iface->mesh_iface; + struct net_device *mesh_iface = hard_iface->mesh_iface; const struct batadv_hard_iface *tmp_hard_iface; + struct list_head *iter; if (!mesh_iface) return; - list_for_each_entry(tmp_hard_iface, &batadv_hardif_list, list) { + netdev_for_each_lower_private(mesh_iface, tmp_hard_iface, iter) { if (tmp_hard_iface == hard_iface) continue; - if (tmp_hard_iface->mesh_iface != mesh_iface) - continue; - if (tmp_hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; @@ -545,15 +541,13 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *mesh_iface) unsigned short lower_headroom = 0; unsigned short lower_tailroom = 0; unsigned short needed_headroom; + struct list_head *iter; rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) { if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; - if (hard_iface->mesh_iface != mesh_iface) - continue; - lower_header_len = max_t(unsigned short, lower_header_len, hard_iface->net_dev->hard_header_len); @@ -586,17 +580,15 @@ int batadv_hardif_min_mtu(struct net_device *mesh_iface) { struct batadv_priv *bat_priv = netdev_priv(mesh_iface); const struct batadv_hard_iface *hard_iface; + struct list_head *iter; int min_mtu = INT_MAX; rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE && hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; - if (hard_iface->mesh_iface != mesh_iface) - continue; - min_mtu = min_t(int, hard_iface->net_dev->mtu, min_mtu); } rcu_read_unlock(); @@ -734,7 +726,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->mesh_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - mesh_iface, NULL, NULL, NULL); + mesh_iface, hard_iface, NULL, NULL); if (ret) goto err_dev; @@ -803,18 +795,15 @@ err_dev: * * Return: number of connected/enslaved hard interfaces */ -static size_t batadv_hardif_cnt(const struct net_device *mesh_iface) +static size_t batadv_hardif_cnt(struct net_device *mesh_iface) { struct batadv_hard_iface *hard_iface; + struct list_head *iter; size_t count = 0; rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->mesh_iface != mesh_iface) - continue; - + netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) count++; - } rcu_read_unlock(); return count; diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index a08132888a3d4..20346d7b6b691 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -11,7 +11,7 @@ #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> -#include <linux/crc32c.h> +#include <linux/crc32.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/gfp.h> @@ -27,7 +27,6 @@ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/printk.h> -#include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> @@ -69,8 +68,6 @@ unsigned int batadv_hardif_generation; static int (*batadv_rx_handler[256])(struct sk_buff *skb, struct batadv_hard_iface *recv_if); -unsigned 
char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - struct workqueue_struct *batadv_event_workqueue; static void batadv_recv_handler_init(void); @@ -305,16 +302,14 @@ void batadv_mesh_free(struct net_device *mesh_iface) bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { const struct batadv_hard_iface *hard_iface; + struct list_head *iter; bool is_my_mac = false; rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { is_my_mac = true; break; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 67af435ee04e1..1481eb2bacee3 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2025.1" +#define BATADV_SOURCE_VERSION "2025.3" #endif /* B.A.T.M.A.N. parameters */ @@ -235,7 +235,6 @@ static inline int batadv_print_vid(unsigned short vid) extern struct list_head batadv_hardif_list; extern unsigned int batadv_hardif_generation; -extern unsigned char batadv_broadcast_addr[]; extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *mesh_iface); diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c index 59e7b5aacbc93..de2c2d9c6e4db 100644 --- a/net/batman-adv/mesh-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -36,7 +36,6 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> -#include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -77,18 +76,6 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) return 0; } -static int batadv_interface_open(struct net_device *dev) -{ - netif_start_queue(dev); - return 0; -} - -static int batadv_interface_release(struct net_device *dev) -{ - netif_stop_queue(dev); - return 0; -} - /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' * @bat_priv: the bat priv with all the mesh interface information @@ -890,8 +877,6 @@ out: static const struct net_device_ops batadv_netdev_ops = { .ndo_init = batadv_meshif_init_late, - .ndo_open = batadv_interface_open, - .ndo_stop = batadv_interface_release, .ndo_get_stats = batadv_interface_stats, .ndo_vlan_rx_add_vid = batadv_interface_add_vid, .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid, @@ -1116,9 +1101,9 @@ static void batadv_meshif_destroy_netlink(struct net_device *mesh_iface, struct batadv_hard_iface *hard_iface; struct batadv_meshif_vlan *vlan; - list_for_each_entry(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->mesh_iface == mesh_iface) - batadv_hardif_disable_interface(hard_iface); + while (!list_empty(&mesh_iface->adj_list.lower)) { + hard_iface = netdev_adjacent_get_private(mesh_iface->adj_list.lower.next); + batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 5786680aff30c..e8c6b0bf670ff 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -246,15 +246,13 @@ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) { const struct 
batadv_hard_iface *hard_iface; + struct list_head *iter; rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) { rcu_read_unlock(); return BATADV_NO_FLAGS; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index e7c8f9f2bb1f9..beb181b3a7d88 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -20,7 +20,6 @@ #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/limits.h> -#include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> @@ -968,6 +967,7 @@ batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) struct batadv_priv *bat_priv; int portid = NETLINK_CB(cb->skb).portid; int skip = cb->args[0]; + struct list_head *iter; int i = 0; mesh_iface = batadv_netlink_get_meshif(cb); @@ -979,10 +979,7 @@ batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) rtnl_lock(); cb->seq = batadv_hardif_generation << 1 | 1; - list_for_each_entry(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->mesh_iface != mesh_iface) - continue; - + netdev_for_each_lower_private(mesh_iface, hard_iface, iter) { if (i++ < skip) continue; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index d9cfc5c6b2088..a464ff96b9291 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -29,7 +29,6 @@ #include <linux/workqueue.h> #include <uapi/linux/batadv_packet.h> -#include "bat_algo.h" #include "distributed-arp-table.h" #include "fragmentation.h" #include "gateway_client.h" @@ -1208,6 +1207,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, struct batadv_neigh_node *best_neigh_node; struct batadv_hard_iface *hard_iface; bool changed_ifinfo, changed_neigh; + struct list_head *iter; if (batadv_has_timed_out(orig_node->last_seen, 2 * BATADV_PURGE_TIMEOUT)) { @@ -1232,13 +1232,10 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, /* ... then for all other interfaces. 
*/ rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - if (!kref_get_unless_zero(&hard_iface->refcount)) continue; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 735ac80778219..95849ba004e75 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -21,7 +21,6 @@ #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> -#include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> @@ -124,7 +123,9 @@ send_skb_err: int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { - return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); + static const u8 broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + + return batadv_send_skb_packet(skb, hard_iface, broadcast_addr); } /** @@ -922,6 +923,7 @@ static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, { struct batadv_hard_iface *hard_iface; struct batadv_hard_iface *primary_if; + struct list_head *iter; int ret = NETDEV_TX_OK; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -929,10 +931,7 @@ static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, return NETDEV_TX_BUSY; rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->mesh_iface != bat_priv->mesh_iface) - continue; - + netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (!kref_get_unless_zero(&hard_iface->refcount)) continue; diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index adbadb436033c..350b149e48beb 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -485,7 +485,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars) */ static void batadv_tp_sender_timeout(struct timer_list *t) { - struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer); + struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer); struct batadv_priv *bat_priv = tp_vars->bat_priv; if (atomic_read(&tp_vars->sending) == 0) @@ -1101,7 +1101,7 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars) */ static void batadv_tp_receiver_shutdown(struct timer_list *t) { - struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer); + struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer); struct batadv_tp_unacked *un, *safe; struct batadv_priv *bat_priv; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 4a3165920de19..8d0e04e770cb6 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -14,7 +14,7 @@ #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> -#include <linux/crc32c.h> +#include <linux/crc32.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 5a3835b7dfcdc..a7eede7616d85 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -14,7 +14,8 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o + ecdh_helper.o mgmt_util.o 
mgmt_config.o hci_codec.o eir.o hci_sync.o \ + hci_drv.o bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 0b4d0a8bd3614..2b94e20772038 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -34,6 +34,9 @@ #include <net/bluetooth/bluetooth.h> #include <linux/proc_fs.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> + #include "leds.h" #include "selftest.h" @@ -361,6 +364,13 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, sizeof(pkt_status), &pkt_status); } + + if (test_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags)) { + u16 pkt_seqnum = hci_skb_pkt_seqnum(skb); + + put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_SEQNUM, + sizeof(pkt_seqnum), &pkt_seqnum); + } } skb_free_datagram(sk, skb); @@ -563,6 +573,86 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, } EXPORT_SYMBOL(bt_sock_poll); +static int bt_ethtool_get_ts_info(struct sock *sk, unsigned int index, + void __user *useraddr) +{ + struct ethtool_ts_info info; + struct kernel_ethtool_ts_info ts_info = {}; + int ret; + + ret = hci_ethtool_ts_info(index, sk->sk_protocol, &ts_info); + if (ret == -ENODEV) + return ret; + else if (ret < 0) + return -EIO; + + memset(&info, 0, sizeof(info)); + + info.cmd = ETHTOOL_GET_TS_INFO; + info.so_timestamping = ts_info.so_timestamping; + info.phc_index = ts_info.phc_index; + info.tx_types = ts_info.tx_types; + info.rx_filters = ts_info.rx_filters; + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static int bt_ethtool(struct sock *sk, const struct ifreq *ifr, + void __user *useraddr) +{ + unsigned int index; + u32 ethcmd; + int n; + + if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) + return -EFAULT; + + if (sscanf(ifr->ifr_name, "hci%u%n", &index, &n) != 1 || + n != strlen(ifr->ifr_name)) + return -ENODEV; + + switch (ethcmd) { + case ETHTOOL_GET_TS_INFO: + return bt_ethtool_get_ts_info(sk, index, useraddr); + } + + return -EOPNOTSUPP; +} + +static int bt_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg) +{ + struct sock *sk = sock->sk; + struct ifreq ifr = {}; + void __user *data; + char *colon; + int ret = -ENOIOCTLCMD; + + if (get_user_ifreq(&ifr, &data, arg)) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ - 1] = 0; + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + switch (cmd) { + case SIOCETHTOOL: + ret = bt_ethtool(sk, &ifr, data); + break; + } + + if (colon) + *colon = ':'; + + if (put_user_ifreq(&ifr, arg)) + return -EFAULT; + + return ret; +} + int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -595,6 +685,10 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = put_user(amount, (int __user *)arg); break; + case SIOCETHTOOL: + err = bt_dev_ioctl(sock, cmd, (void __user *)arg); + break; + default: err = -ENOIOCTLCMD; break; @@ -728,7 +822,7 @@ static int bt_seq_show(struct seq_file *seq, void *v) refcount_read(&sk->sk_refcnt), sk_rmem_alloc_get(sk), sk_wmem_alloc_get(sk), - from_kuid(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), bt->parent ?
sock_i_ino(bt->parent) : 0LU); diff --git a/net/bluetooth/aosp.c b/net/bluetooth/aosp.c index 1d67836e95e16..59025771af53a 100644 --- a/net/bluetooth/aosp.c +++ b/net/bluetooth/aosp.c @@ -70,7 +70,7 @@ void aosp_do_open(struct hci_dev *hdev) rp = (struct aosp_rp_le_get_vendor_capa *)skb->data; version_supported = le16_to_cpu(rp->version_supported); - /* AOSP displays the verion number like v0.98, v1.00, etc. */ + /* AOSP displays the version number like v0.98, v1.00, etc. */ bt_dev_info(hdev, "AOSP extensions version v%u.%02u", version_supported >> 8, version_supported & 0xff); diff --git a/net/bluetooth/coredump.c b/net/bluetooth/coredump.c index 819eacb387622..720cb79adf964 100644 --- a/net/bluetooth/coredump.c +++ b/net/bluetooth/coredump.c @@ -249,15 +249,15 @@ static void hci_devcd_dump(struct hci_dev *hdev) size = hdev->dump.tail - hdev->dump.head; - /* Emit a devcoredump with the available data */ - dev_coredumpv(&hdev->dev, hdev->dump.head, size, GFP_KERNEL); - /* Send a copy to monitor as a diagnostic packet */ skb = bt_skb_alloc(size, GFP_ATOMIC); if (skb) { skb_put_data(skb, hdev->dump.head, size); hci_recv_diag(hdev, skb); } + + /* Emit a devcoredump with the available data */ + dev_coredumpv(&hdev->dev, hdev->dump.head, size, GFP_KERNEL); } static void hci_devcd_handle_pkt_complete(struct hci_dev *hdev, diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c index 1bc51e2b05a34..3f72111ba651f 100644 --- a/net/bluetooth/eir.c +++ b/net/bluetooth/eir.c @@ -242,7 +242,7 @@ u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) return ad_len; } -u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) +u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr, u8 size) { struct adv_info *adv = NULL; u8 ad_len = 0, flags = 0; @@ -286,7 +286,7 @@ u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) /* If flags would still be empty, then there is no need to * include the "Flags" AD field". 
*/ - if (flags) { + if (flags && (ad_len + eir_precalc_len(1) <= size)) { ptr[0] = 0x02; ptr[1] = EIR_FLAGS; ptr[2] = flags; @@ -316,7 +316,8 @@ skip_flags: } /* Provide Tx Power only if we can provide a valid value for it */ - if (adv_tx_power != HCI_TX_POWER_INVALID) { + if (adv_tx_power != HCI_TX_POWER_INVALID && + (ad_len + eir_precalc_len(1) <= size)) { ptr[0] = 0x02; ptr[1] = EIR_TX_POWER; ptr[2] = (u8)adv_tx_power; @@ -366,17 +367,19 @@ u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr) void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len) { - while ((eir = eir_get_data(eir, eir_len, EIR_SERVICE_DATA, len))) { + size_t dlen; + + while ((eir = eir_get_data(eir, eir_len, EIR_SERVICE_DATA, &dlen))) { u16 value = get_unaligned_le16(eir); if (uuid == value) { if (len) - *len -= 2; + *len = dlen - 2; return &eir[2]; } - eir += *len; - eir_len -= *len; + eir += dlen; + eir_len -= dlen; } return NULL; diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h index 5c89a05e8b290..9372db83f912f 100644 --- a/net/bluetooth/eir.h +++ b/net/bluetooth/eir.h @@ -9,7 +9,7 @@ void eir_create(struct hci_dev *hdev, u8 *data); -u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr); +u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr, u8 size); u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr); u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 946d2ae551f86..7d1e79f69cd1c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -785,7 +785,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c d->sync_handle = conn->sync_handle; if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { - hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK, + hci_conn_hash_list_flag(hdev, find_bis, PA_LINK, HCI_CONN_PA_SYNC, d); if (!d->count) @@ -795,7 +795,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c } if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) { - hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK, + hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK, HCI_CONN_BIG_SYNC, d); if (!d->count) @@ -814,7 +814,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c * * Detects if there any BIS left connected in a BIG * broadcaster: Remove advertising instance and terminate BIG. - * broadcaster receiver: Teminate BIG sync and terminate PA sync. + * broadcaster receiver: Terminate BIG sync and terminate PA sync. */ static void bis_cleanup(struct hci_conn *conn) { @@ -885,9 +885,11 @@ static void cis_cleanup(struct hci_conn *conn) /* Check if ISO connection is a CIS and remove CIG if there are * no other connections using it. 
*/ - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_BOUND, &d); - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECT, &d); - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_BOUND, &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT, + &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED, + &d); if (d.count) return; @@ -910,7 +912,9 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t if (!hdev->acl_mtu) return ERR_PTR(-ECONNREFUSED); break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: if (hdev->iso_mtu) /* Dedicated ISO Buffer exists */ break; @@ -974,7 +978,9 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t hci_copy_identity_address(hdev, &conn->src, &conn->src_type); conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); @@ -1029,7 +1035,6 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t } hci_conn_init_sysfs(conn); - return conn; } @@ -1071,7 +1076,9 @@ static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) if (HCI_CONN_HANDLE_UNSET(conn->handle)) hci_conn_failed(conn, reason); break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: if ((conn->state != BT_CONNECTED && !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) || test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) @@ -1146,7 +1153,9 @@ void hci_conn_del(struct hci_conn *conn) hdev->acl_cnt += conn->sent; } else { /* Unacked ISO frames */ - if (conn->type == ISO_LINK) { + if (conn->type == CIS_LINK || + conn->type == BIS_LINK || + conn->type == PA_LINK) { if (hdev->iso_pkts) hdev->iso_cnt += conn->sent; else if (hdev->le_pkts) @@ -1495,8 +1504,8 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, - struct bt_iso_qos *qos, __u8 base_len, - __u8 *base) + __u8 sid, struct bt_iso_qos *qos, + __u8 base_len, __u8 *base) { struct hci_conn *conn; int err; @@ -1532,11 +1541,12 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); - conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); + conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; conn->state = BT_CONNECT; + conn->sid = sid; hci_conn_hold(conn); return conn; @@ -1740,7 +1750,7 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) data.count = 0; /* Create a BIS for each bound connection */ - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, + hci_conn_hash_list_state(hdev, bis_list, BIS_LINK, BT_BOUND, &data); cp.handle = qos->bcast.big; @@ -1829,12 +1839,12 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) for (data.cig = 0x00; data.cig < 0xf0; data.cig++) { data.count = 0; - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT, &data); if (data.count) continue; - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED, &data); if (!data.count) 
break; @@ -1884,7 +1894,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig, qos->ucast.cis); if (!cis) { - cis = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); + cis = hci_conn_add_unset(hdev, CIS_LINK, dst, + HCI_ROLE_MASTER); if (IS_ERR(cis)) return cis; cis->cleanup = cis_cleanup; @@ -1976,7 +1987,7 @@ bool hci_iso_setup_path(struct hci_conn *conn) int hci_conn_check_create_cis(struct hci_conn *conn) { - if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY)) + if (conn->type != CIS_LINK) return -EINVAL; if (!conn->parent || conn->parent->state != BT_CONNECTED || @@ -2055,7 +2066,8 @@ static int create_big_sync(struct hci_dev *hdev, void *data) if (qos->bcast.bis) sync_interval = interval * 4; - err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->le_per_adv_data_len, + err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->sid, + conn->le_per_adv_data_len, conn->le_per_adv_data, flags, interval, interval, sync_interval); if (err) @@ -2070,7 +2082,9 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, { struct hci_conn *conn; - conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE); + bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid); + + conn = hci_conn_add_unset(hdev, PA_LINK, dst, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; @@ -2125,7 +2139,7 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) } } -struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, __u8 base_len, __u8 *base) { @@ -2135,7 +2149,8 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, struct hci_link *link; /* Look for any BIS that is open for rebinding */ - conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN); + conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN, + HCI_ROLE_MASTER); if (conn) { memcpy(qos, &conn->iso_qos, sizeof(*qos)); conn->state = BT_CONNECTED; @@ -2147,7 +2162,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, qos, base_len, eir); + conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir); if (IS_ERR(conn)) return conn; @@ -2198,20 +2213,35 @@ static void bis_mark_per_adv(struct hci_conn *conn, void *data) } struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos, + __u8 dst_type, __u8 sid, + struct bt_iso_qos *qos, __u8 base_len, __u8 *base) { struct hci_conn *conn; int err; struct iso_list_data data; - conn = hci_bind_bis(hdev, dst, qos, base_len, base); + conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base); if (IS_ERR(conn)) return conn; if (conn->state == BT_CONNECTED) return conn; + /* Check if SID needs to be allocated then search for the first + * available. 
+ */ + if (conn->sid == HCI_SID_INVALID) { + u8 sid; + + for (sid = 0; sid <= 0x0f; sid++) { + if (!hci_find_adv_sid(hdev, sid)) { + conn->sid = sid; + break; + } + } + } + data.big = qos->bcast.big; data.bis = qos->bcast.bis; @@ -2219,7 +2249,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, * the start periodic advertising and create BIG commands have * been queued */ - hci_conn_hash_list_state(hdev, bis_mark_per_adv, ISO_LINK, + hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ @@ -2951,7 +2981,9 @@ void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb) * TODO: SCO support without flowctl (needs to be done in drivers) */ switch (conn->type) { - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: case ACL_LINK: case LE_LINK: break; @@ -3047,3 +3079,36 @@ u8 *hci_conn_key_enc_size(struct hci_conn *conn) return NULL; } + +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *info) +{ + struct hci_dev *hdev; + + hdev = hci_dev_get(index); + if (!hdev) + return -ENODEV; + + info->so_timestamping = + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + info->phc_index = -1; + info->tx_types = BIT(HWTSTAMP_TX_OFF); + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE); + + switch (sk_proto) { + case BTPROTO_ISO: + case BTPROTO_L2CAP: + info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; + info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION; + break; + case BTPROTO_SCO: + info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) + info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION; + break; + } + + hci_dev_put(hdev); + return 0; +} diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5eb0600bbd03c..55e0722fd0662 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -64,7 +64,7 @@ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. 
*/ -struct hci_dev *hci_dev_get(int index) +static struct hci_dev *__hci_dev_get(int index, int *srcu_index) { struct hci_dev *hdev = NULL, *d; @@ -77,6 +77,8 @@ struct hci_dev *hci_dev_get(int index) list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); + if (srcu_index) + *srcu_index = srcu_read_lock(&d->srcu); break; } } @@ -84,6 +86,22 @@ struct hci_dev *hci_dev_get(int index) return hdev; } +struct hci_dev *hci_dev_get(int index) +{ + return __hci_dev_get(index, NULL); +} + +static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index) +{ + return __hci_dev_get(index, srcu_index); +} + +static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index) +{ + srcu_read_unlock(&hdev->srcu, srcu_index); + hci_dev_put(hdev); +} + /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) @@ -568,9 +586,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev) int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; - int err; + int err, srcu_index; - hdev = hci_dev_get(dev); + hdev = hci_dev_get_srcu(dev, &srcu_index); if (!hdev) return -ENODEV; @@ -592,7 +610,7 @@ int hci_dev_reset(__u16 dev) err = hci_dev_do_reset(hdev); done: - hci_dev_put(hdev); + hci_dev_put_srcu(hdev, srcu_index); return err; } @@ -1238,12 +1256,10 @@ struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, if (addr_type == irk->addr_type && bacmp(bdaddr, &irk->bdaddr) == 0) { irk_to_return = irk; - goto done; + break; } } -done: - if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", @@ -1585,6 +1601,19 @@ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) } /* This function requires the caller holds hdev->lock */ +struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid) +{ + struct adv_info *adv; + + list_for_each_entry(adv, &hdev->adv_instances, list) { + if (adv->sid == sid) + return adv; + } + + return NULL; +} + +/* This function requires the caller holds hdev->lock */ struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *cur_instance; @@ -1736,7 +1765,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, } /* This function requires the caller holds hdev->lock */ -struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, +struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval) { @@ -1748,6 +1777,7 @@ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, if (IS_ERR(adv)) return adv; + adv->sid = sid; adv->periodic = true; adv->per_adv_data_len = data_len; @@ -1877,10 +1907,8 @@ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) if (monitor->handle) idr_remove(&hdev->adv_monitors_idr, monitor->handle); - if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) { + if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) hdev->adv_monitors_cnt--; - mgmt_adv_monitor_removed(hdev, monitor->handle); - } kfree(monitor); } @@ -2421,6 +2449,11 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) if (!hdev) return NULL; + if (init_srcu_struct(&hdev->srcu)) { + kfree(hdev); + return NULL; + } + hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); @@ -2487,6 +2520,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) 
mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); + mutex_init(&hdev->mgmt_pending_lock); ida_init(&hdev->unset_handle_ida); @@ -2618,7 +2652,7 @@ int hci_register_dev(struct hci_dev *hdev) /* Devices that are marked for raw-only usage are unconfigured * and should not be included in normal operation. */ - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* Mark Remote Wakeup connection flag as supported if driver has wakeup @@ -2665,6 +2699,9 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + synchronize_srcu(&hdev->srcu); + cleanup_srcu_struct(&hdev->srcu); + disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); @@ -2745,7 +2782,7 @@ int hci_register_suspend_notifier(struct hci_dev *hdev) int ret = 0; if (!hdev->suspend_notifier.notifier_call && - !test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { + !hci_test_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER)) { hdev->suspend_notifier.notifier_call = hci_suspend_notifier; ret = register_pm_notifier(&hdev->suspend_notifier); } @@ -2898,12 +2935,15 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) break; case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ - if (hci_conn_num(hdev, ISO_LINK)) { + if (hci_conn_num(hdev, CIS_LINK) || + hci_conn_num(hdev, BIS_LINK) || + hci_conn_num(hdev, PA_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); - if (type == ISO_LINK) + if (type == CIS_LINK || type == BIS_LINK || + type == PA_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; @@ -2911,6 +2951,8 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) break; case HCI_ISODATA_PKT: break; + case HCI_DRV_PKT: + break; default: kfree_skb(skb); return -EINVAL; @@ -3019,6 +3061,15 @@ static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) return -EINVAL; } + if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) { + /* Intercept HCI Drv packet here and don't go with hdev->send + * callback. + */ + err = hci_drv_process_cmd(hdev, skb); + kfree_skb(skb); + return err; + } + err = hdev->send(hdev, skb); if (err < 0) { bt_dev_err(hdev, "sending frame failed (%d)", err); @@ -3345,7 +3396,9 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) case LE_LINK: cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: cnt = hdev->iso_mtu ? hdev->iso_cnt : hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; @@ -3371,7 +3424,8 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != type || skb_queue_empty(&c->data_q)) + if (c->type != type || + skb_queue_empty(&c->data_q)) continue; if (c->state != BT_CONNECTED && c->state != BT_CONFIG) @@ -3403,23 +3457,18 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) bt_dev_err(hdev, "link tx timeout"); - rcu_read_lock(); + hci_dev_lock(hdev); /* Kill stalled connections */ - list_for_each_entry_rcu(c, &h->list, list) { + list_for_each_entry(c, &h->list, list) { if (c->type == type && c->sent) { bt_dev_err(hdev, "killing stalled connection %pMR", &c->dst); - /* hci_disconnect might sleep, so, we have to release - * the RCU read lock before calling it. 
- */ - rcu_read_unlock(); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); - rcu_read_lock(); } } - rcu_read_unlock(); + hci_dev_unlock(hdev); } static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type, @@ -3698,8 +3747,8 @@ static void hci_sched_le(struct hci_dev *hdev) hci_prio_recalculate(hdev, LE_LINK); } -/* Schedule CIS */ -static void hci_sched_iso(struct hci_dev *hdev) +/* Schedule iso */ +static void hci_sched_iso(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; @@ -3707,12 +3756,12 @@ static void hci_sched_iso(struct hci_dev *hdev) BT_DBG("%s", hdev->name); - if (!hci_conn_num(hdev, ISO_LINK)) + if (!hci_conn_num(hdev, type)) return; cnt = hdev->iso_pkts ? &hdev->iso_cnt : hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; - while (*cnt && (conn = hci_low_sent(hdev, ISO_LINK, "e))) { + while (*cnt && (conn = hci_low_sent(hdev, type, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); @@ -3737,7 +3786,9 @@ static void hci_tx_work(struct work_struct *work) /* Schedule queues and send stuff to HCI driver */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); - hci_sched_iso(hdev); + hci_sched_iso(hdev, CIS_LINK); + hci_sched_iso(hdev, BIS_LINK); + hci_sched_iso(hdev, PA_LINK); hci_sched_acl(hdev); hci_sched_le(hdev); } @@ -4057,10 +4108,13 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) return; } - err = hci_send_frame(hdev, skb); - if (err < 0) { - hci_cmd_sync_cancel_sync(hdev, -err); - return; + if (hci_skb_opcode(skb) != HCI_OP_NOP) { + err = hci_send_frame(hdev, skb); + if (err < 0) { + hci_cmd_sync_cancel_sync(hdev, -err); + return; + } + atomic_dec(&hdev->cmd_cnt); } if (hdev->req_status == HCI_REQ_PEND && @@ -4068,8 +4122,6 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); } - - atomic_dec(&hdev->cmd_cnt); } static void hci_cmd_work(struct work_struct *work) diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index f625074d1f002..99e2e9fc70e8c 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -38,7 +38,7 @@ static ssize_t __name ## _read(struct file *file, \ struct hci_dev *hdev = file->private_data; \ char buf[3]; \ \ - buf[0] = test_bit(__quirk, &hdev->quirks) ? 'Y' : 'N'; \ + buf[0] = test_bit(__quirk, hdev->quirk_flags) ? 'Y' : 'N'; \ buf[1] = '\n'; \ buf[2] = '\0'; \ return simple_read_from_buffer(user_buf, count, ppos, buf, 2); \ @@ -59,10 +59,10 @@ static ssize_t __name ## _write(struct file *file, \ if (err) \ return err; \ \ - if (enable == test_bit(__quirk, &hdev->quirks)) \ + if (enable == test_bit(__quirk, hdev->quirk_flags)) \ return -EALREADY; \ \ - change_bit(__quirk, &hdev->quirks); \ + change_bit(__quirk, hdev->quirk_flags); \ \ return count; \ } \ @@ -1356,7 +1356,7 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, * for the vendor callback. Instead just store the desired value and * the setting will be programmed when the controller gets powered on. 
*/ - if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) && (!test_bit(HCI_RUNNING, &hdev->flags) || hci_dev_test_flag(hdev, HCI_USER_CHANNEL))) goto done; diff --git a/net/bluetooth/hci_drv.c b/net/bluetooth/hci_drv.c new file mode 100644 index 0000000000000..3dd2d8a006b9e --- /dev/null +++ b/net/bluetooth/hci_drv.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google Corporation + */ + +#include <linux/skbuff.h> +#include <linux/types.h> + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_core.h> +#include <net/bluetooth/hci_drv.h> + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status) +{ + struct hci_drv_ev_hdr *hdr; + struct hci_drv_ev_cmd_status *ev; + struct sk_buff *skb; + + skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_STATUS); + hdr->len = __cpu_to_le16(sizeof(*ev)); + + ev = skb_put(skb, sizeof(*ev)); + ev->opcode = __cpu_to_le16(cmd); + ev->status = status; + + hci_skb_pkt_type(skb) = HCI_DRV_PKT; + + return hci_recv_frame(hdev, skb); +} +EXPORT_SYMBOL(hci_drv_cmd_status); + +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len) +{ + struct hci_drv_ev_hdr *hdr; + struct hci_drv_ev_cmd_complete *ev; + struct sk_buff *skb; + + skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_COMPLETE); + hdr->len = __cpu_to_le16(sizeof(*ev) + rp_len); + + ev = skb_put(skb, sizeof(*ev)); + ev->opcode = __cpu_to_le16(cmd); + ev->status = status; + + skb_put_data(skb, rp, rp_len); + + hci_skb_pkt_type(skb) = HCI_DRV_PKT; + + return hci_recv_frame(hdev, skb); +} +EXPORT_SYMBOL(hci_drv_cmd_complete); + +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_drv_cmd_hdr *hdr; + const struct hci_drv_handler *handler = NULL; + u16 opcode, len, ogf, ocf; + + hdr = skb_pull_data(skb, sizeof(*hdr)); + if (!hdr) + return -EILSEQ; + + opcode = __le16_to_cpu(hdr->opcode); + len = __le16_to_cpu(hdr->len); + if (len != skb->len) + return -EILSEQ; + + ogf = hci_opcode_ogf(opcode); + ocf = hci_opcode_ocf(opcode); + + if (!hdev->hci_drv) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_UNKNOWN_COMMAND); + + if (ogf != HCI_DRV_OGF_DRIVER_SPECIFIC) { + if (opcode < hdev->hci_drv->common_handler_count) + handler = &hdev->hci_drv->common_handlers[opcode]; + } else { + if (ocf < hdev->hci_drv->specific_handler_count) + handler = &hdev->hci_drv->specific_handlers[ocf]; + } + + if (!handler || !handler->func) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_UNKNOWN_COMMAND); + + if (len != handler->data_len) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_INVALID_PARAMETERS); + + return handler->func(hdev, skb->data, len); +} +EXPORT_SYMBOL(hci_drv_process_cmd); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index c38ada69c3d7f..8aa5039b975a2 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -908,8 +908,8 @@ static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data, return rp->status; if (hdev->max_page < rp->max_page) { - if (test_bit(HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2, - &hdev->quirks)) + if (hci_test_quirk(hdev, + 
HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2)) bt_dev_warn(hdev, "broken local ext features page 2"); else hdev->max_page = rp->max_page; @@ -936,7 +936,7 @@ static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data, hdev->acl_pkts = __le16_to_cpu(rp->acl_max_pkt); hdev->sco_pkts = __le16_to_cpu(rp->sco_max_pkt); - if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_FIXUP_BUFFER_SIZE)) { hdev->sco_mtu = 64; hdev->sco_pkts = 8; } @@ -2150,40 +2150,6 @@ static u8 hci_cc_set_adv_param(struct hci_dev *hdev, void *data, return rp->status; } -static u8 hci_cc_set_ext_adv_param(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_rp_le_set_ext_adv_params *rp = data; - struct hci_cp_le_set_ext_adv_params *cp; - struct adv_info *adv_instance; - - bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); - - if (rp->status) - return rp->status; - - cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS); - if (!cp) - return rp->status; - - hci_dev_lock(hdev); - hdev->adv_addr_type = cp->own_addr_type; - if (!cp->handle) { - /* Store in hdev for instance 0 */ - hdev->adv_tx_power = rp->tx_power; - } else { - adv_instance = hci_find_adv_instance(hdev, cp->handle); - if (adv_instance) - adv_instance->tx_power = rp->tx_power; - } - /* Update adv data as tx power is known now */ - hci_update_adv_data(hdev, cp->handle); - - hci_dev_unlock(hdev); - - return rp->status; -} - static u8 hci_cc_read_rssi(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -3005,7 +2971,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data, * state to indicate completion. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || - !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); goto unlock; } @@ -3024,7 +2990,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data, * state to indicate completion. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || - !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); } @@ -3648,8 +3614,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, /* We skip the WRITE_AUTH_PAYLOAD_TIMEOUT for ATS2851 based controllers * to avoid unexpected SMP command errors when pairing. 
*/ - if (test_bit(HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT, - &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT)) goto notify; /* Set the default Authenticated Payload Timeout after @@ -3804,7 +3769,7 @@ static void hci_unbound_cis_failed(struct hci_dev *hdev, u8 cig, u8 status) lockdep_assert_held(&hdev->lock); list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) { - if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY) || + if (conn->type != CIS_LINK || conn->state == BT_OPEN || conn->iso_qos.ucast.cig != cig) continue; @@ -4164,8 +4129,6 @@ static const struct hci_cc { HCI_CC(HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS, hci_cc_le_read_num_adv_sets, sizeof(struct hci_rp_le_read_num_supported_adv_sets)), - HCI_CC(HCI_OP_LE_SET_EXT_ADV_PARAMS, hci_cc_set_ext_adv_param, - sizeof(struct hci_rp_le_set_ext_adv_params)), HCI_CC_STATUS(HCI_OP_LE_SET_EXT_ADV_ENABLE, hci_cc_le_set_ext_adv_enable), HCI_CC_STATUS(HCI_OP_LE_SET_ADV_SET_RAND_ADDR, @@ -4467,7 +4430,9 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: if (hdev->iso_pkts) { hdev->iso_cnt += count; if (hdev->iso_cnt > hdev->iso_pkts) @@ -5753,7 +5718,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->state = BT_CONFIG; /* Store current advertising instance as connection advertising instance - * when sotfware rotation is in use so it can be re-enabled when + * when software rotation is in use so it can be re-enabled when * disconnected. */ if (!ext_adv_capable(hdev)) @@ -5949,7 +5914,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, * while we have an existing one in peripheral role. */ if (hdev->conn_hash.le_num_peripheral > 0 && - (test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) || + (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES) || !(hdev->le_states[3] & 0x10))) return NULL; @@ -6275,6 +6240,11 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data, static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type) { + u16 pdu_type = evt_type & ~LE_EXT_ADV_DATA_STATUS_MASK; + + if (!pdu_type) + return LE_ADV_NONCONN_IND; + if (evt_type & LE_EXT_ADV_LEGACY_PDU) { switch (evt_type) { case LE_LEGACY_ADV_IND: @@ -6306,8 +6276,7 @@ static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type) if (evt_type & LE_EXT_ADV_SCAN_IND) return LE_ADV_SCAN_IND; - if (evt_type == LE_EXT_ADV_NON_CONN_IND || - evt_type & LE_EXT_ADV_DIRECT_IND) + if (evt_type & LE_EXT_ADV_DIRECT_IND) return LE_ADV_NONCONN_IND; invalid: @@ -6345,12 +6314,23 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK; legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); - if (test_bit(HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, - &hdev->quirks)) { + if (hci_test_quirk(hdev, + HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY)) { info->primary_phy &= 0x1f; info->secondary_phy &= 0x1f; } + /* Check if PA Sync is pending and if the hci_conn SID has not + * been set update it. 
+ */ + if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_create_pa_sync(hdev); + if (conn && conn->sid == HCI_SID_INVALID) + conn->sid = info->sid; + } + if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &info->bdaddr, info->bdaddr_type, NULL, 0, @@ -6375,8 +6355,8 @@ static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle) return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp); } -static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) +static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) { struct hci_ev_le_pa_sync_established *ev = data; int mask = hdev->link_mode; @@ -6402,7 +6382,8 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, conn->sync_handle = le16_to_cpu(ev->handle); conn->sid = HCI_SID_INVALID; - mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK, + &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_pa_term_sync(hdev, ev->handle); goto unlock; @@ -6412,7 +6393,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; /* Add connection to indicate PA sync event */ - pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) @@ -6443,7 +6424,7 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, PA_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) goto unlock; @@ -6705,8 +6686,8 @@ unlock: hci_dev_unlock(hdev); } -static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) +static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) { struct hci_evt_le_cis_established *ev = data; struct hci_conn *conn; @@ -6727,7 +6708,7 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; } - if (conn->type != ISO_LINK) { + if (conn->type != CIS_LINK) { bt_dev_err(hdev, "Invalid connection link type handle 0x%4.4x", handle); @@ -6845,7 +6826,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data, if (!acl) goto unlock; - mask = hci_proto_connect_ind(hdev, &acl->dst, ISO_LINK, &flags); + mask = hci_proto_connect_ind(hdev, &acl->dst, CIS_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_reject_cis(hdev, ev->cis_handle); goto unlock; @@ -6853,8 +6834,8 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data, cis = hci_conn_hash_lookup_handle(hdev, cis_handle); if (!cis) { - cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE, - cis_handle); + cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, + HCI_ROLE_SLAVE, cis_handle); if (IS_ERR(cis)) { hci_le_reject_cis(hdev, ev->cis_handle); goto unlock; @@ -6900,7 +6881,8 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, /* Connect all BISes that are bound to the BIG */ while ((conn = hci_conn_hash_lookup_big_state(hdev, ev->handle, - BT_BOUND))) { + BT_BOUND, + HCI_ROLE_MASTER))) { if (ev->status) { hci_connect_cfm(conn, ev->status); hci_conn_del(conn); @@ -6933,7 +6915,7 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, static void hci_le_big_sync_established_evt(struct hci_dev 
*hdev, void *data, struct sk_buff *skb) { - struct hci_evt_le_big_sync_estabilished *ev = data; + struct hci_evt_le_big_sync_established *ev = data; struct hci_conn *bis, *conn; int i; @@ -6969,7 +6951,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "ignore too large handle %u", handle); continue; } - bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY, + bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, HCI_ROLE_SLAVE, handle); if (IS_ERR(bis)) continue; @@ -6989,7 +6971,10 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bis->iso_qos.bcast.in.sdu = le16_to_cpu(ev->max_pdu); if (!ev->status) { + bis->state = BT_CONNECTED; set_bit(HCI_CONN_BIG_SYNC, &bis->flags); + hci_debugfs_create_conn(bis); + hci_conn_add_sysfs(bis); hci_iso_setup_path(bis); } } @@ -7013,6 +6998,37 @@ unlock: hci_dev_unlock(hdev); } +static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_evt_le_big_sync_lost *ev = data; + struct hci_conn *bis, *conn; + + bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle); + + hci_dev_lock(hdev); + + /* Delete the pa sync connection */ + bis = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle); + if (bis) { + conn = hci_conn_hash_lookup_pa_sync_handle(hdev, + bis->sync_handle); + if (conn) + hci_conn_del(conn); + } + + /* Delete each bis connection */ + while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle, + BT_CONNECTED, + HCI_ROLE_SLAVE))) { + clear_bit(HCI_CONN_BIG_SYNC, &bis->flags); + hci_disconn_cfm(bis, ev->reason); + hci_conn_del(bis); + } + + hci_dev_unlock(hdev); +} + static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -7025,7 +7041,7 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) goto unlock; @@ -7064,7 +7080,7 @@ unlock: /* Entries in this table shall have their position according to the subevent * opcode they handle so the use of the macros above is recommend since it does * attempt to initialize at its proper index using Designated Initializers that - * way events without a callback function can be ommited. + * way events without a callback function can be omitted. 
*/ static const struct hci_le_ev { void (*func)(struct hci_dev *hdev, void *data, struct sk_buff *skb); @@ -7110,7 +7126,7 @@ static const struct hci_le_ev { HCI_MAX_EVENT_SIZE), /* [0x0e = HCI_EV_LE_PA_SYNC_ESTABLISHED] */ HCI_LE_EV(HCI_EV_LE_PA_SYNC_ESTABLISHED, - hci_le_pa_sync_estabilished_evt, + hci_le_pa_sync_established_evt, sizeof(struct hci_ev_le_pa_sync_established)), /* [0x0f = HCI_EV_LE_PER_ADV_REPORT] */ HCI_LE_EV_VL(HCI_EV_LE_PER_ADV_REPORT, @@ -7121,7 +7137,7 @@ static const struct hci_le_ev { HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt, sizeof(struct hci_evt_le_ext_adv_set_term)), /* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */ - HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_estabilished_evt, + HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_established_evt, sizeof(struct hci_evt_le_cis_established)), /* [0x1a = HCI_EVT_LE_CIS_REQ] */ HCI_LE_EV(HCI_EVT_LE_CIS_REQ, hci_le_cis_req_evt, @@ -7134,7 +7150,12 @@ static const struct hci_le_ev { /* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABLISHED] */ HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABLISHED, hci_le_big_sync_established_evt, - sizeof(struct hci_evt_le_big_sync_estabilished), + sizeof(struct hci_evt_le_big_sync_established), + HCI_MAX_EVENT_SIZE), + /* [0x1e = HCI_EVT_LE_BIG_SYNC_LOST] */ + HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_LOST, + hci_le_big_sync_lost_evt, + sizeof(struct hci_evt_le_big_sync_lost), HCI_MAX_EVENT_SIZE), /* [0x22 = HCI_EVT_LE_BIG_INFO_ADV_REPORT] */ HCI_LE_EV_VL(HCI_EVT_LE_BIG_INFO_ADV_REPORT, @@ -7155,7 +7176,8 @@ static void hci_le_meta_evt(struct hci_dev *hdev, void *data, /* Only match event if command OGF is for LE */ if (hdev->req_skb && - hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 && + (hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 || + hci_skb_opcode(hdev->req_skb) == HCI_OP_NOP) && hci_skb_event(hdev->req_skb) == ev->subevent) { *opcode = hci_skb_opcode(hdev->req_skb); hci_req_cmd_complete(hdev, *opcode, 0x00, req_complete, @@ -7419,7 +7441,7 @@ static const struct hci_ev { /* [0x2c = HCI_EV_SYNC_CONN_COMPLETE] */ HCI_EV(HCI_EV_SYNC_CONN_COMPLETE, hci_sync_conn_complete_evt, sizeof(struct hci_ev_sync_conn_complete)), - /* [0x2d = HCI_EV_EXTENDED_INQUIRY_RESULT] */ + /* [0x2f = HCI_EV_EXTENDED_INQUIRY_RESULT] */ HCI_EV_VL(HCI_EV_EXTENDED_INQUIRY_RESULT, hci_extended_inquiry_result_evt, sizeof(struct hci_ev_ext_inquiry_result), HCI_MAX_EVENT_SIZE), @@ -7511,8 +7533,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) goto done; } + hci_dev_lock(hdev); kfree_skb(hdev->recv_event); hdev->recv_event = skb_clone(skb, GFP_KERNEL); + hci_dev_unlock(hdev); event = hdr->evt; if (!event) { diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 022b86797acdc..fc866759910d9 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -118,7 +118,7 @@ static void hci_sock_free_cookie(struct sock *sk) int id = hci_pi(sk)->cookie; if (id) { - hci_pi(sk)->cookie = 0xffffffff; + hci_pi(sk)->cookie = 0; ida_free(&sock_cookie_ida, id); } } @@ -234,7 +234,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && - hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT && + hci_skb_pkt_type(skb) != HCI_DRV_PKT) continue; } else { /* Don't send frame to other channel types */ @@ -391,6 +392,12 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) else opcode = 
cpu_to_le16(HCI_MON_ISO_TX_PKT); break; + case HCI_DRV_PKT: + if (bt_cb(skb)->incoming) + opcode = cpu_to_le16(HCI_MON_DRV_RX_PKT); + else + opcode = cpu_to_le16(HCI_MON_DRV_TX_PKT); + break; case HCI_DIAG_PKT: opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); break; @@ -1860,7 +1867,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && - hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT && + hci_skb_pkt_type(skb) != HCI_DRV_PKT) { err = -EINVAL; goto drop; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e56b1cbedab90..2b4f21fbf9c14 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -393,7 +393,7 @@ static void le_scan_disable(struct work_struct *work) if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED) goto _return; - if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) { if (!test_bit(HCI_INQUIRY, &hdev->flags) && hdev->discovery.state != DISCOVERY_RESOLVING) goto discov_stopped; @@ -1205,9 +1205,126 @@ static int hci_set_adv_set_random_addr_sync(struct hci_dev *hdev, u8 instance, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } +static int +hci_set_ext_adv_params_sync(struct hci_dev *hdev, struct adv_info *adv, + const struct hci_cp_le_set_ext_adv_params *cp, + struct hci_rp_le_set_ext_adv_params *rp) +{ + struct sk_buff *skb; + + skb = __hci_cmd_sync(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(*cp), + cp, HCI_CMD_TIMEOUT); + + /* If command return a status event, skb will be set to -ENODATA */ + if (skb == ERR_PTR(-ENODATA)) + return 0; + + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld", + HCI_OP_LE_SET_EXT_ADV_PARAMS, PTR_ERR(skb)); + return PTR_ERR(skb); + } + + if (skb->len != sizeof(*rp)) { + bt_dev_err(hdev, "Invalid response length for 0x%4.4x: %u", + HCI_OP_LE_SET_EXT_ADV_PARAMS, skb->len); + kfree_skb(skb); + return -EIO; + } + + memcpy(rp, skb->data, sizeof(*rp)); + kfree_skb(skb); + + if (!rp->status) { + hdev->adv_addr_type = cp->own_addr_type; + if (!cp->handle) { + /* Store in hdev for instance 0 */ + hdev->adv_tx_power = rp->tx_power; + } else if (adv) { + adv->tx_power = rp->tx_power; + } + } + + return rp->status; +} + +static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) +{ + DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length, + HCI_MAX_EXT_AD_LENGTH); + u8 len; + struct adv_info *adv = NULL; + int err; + + if (instance) { + adv = hci_find_adv_instance(hdev, instance); + if (!adv || !adv->adv_data_changed) + return 0; + } + + len = eir_create_adv_data(hdev, instance, pdu->data, + HCI_MAX_EXT_AD_LENGTH); + + pdu->length = len; + pdu->handle = adv ? 
adv->handle : instance; + pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; + pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; + + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA, + struct_size(pdu, data, len), pdu, + HCI_CMD_TIMEOUT); + if (err) + return err; + + /* Update data if the command succeed */ + if (adv) { + adv->adv_data_changed = false; + } else { + memcpy(hdev->adv_data, pdu->data, len); + hdev->adv_data_len = len; + } + + return 0; +} + +static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance) +{ + struct hci_cp_le_set_adv_data cp; + u8 len; + + memset(&cp, 0, sizeof(cp)); + + len = eir_create_adv_data(hdev, instance, cp.data, sizeof(cp.data)); + + /* There's nothing to do if the data hasn't changed */ + if (hdev->adv_data_len == len && + memcmp(cp.data, hdev->adv_data, len) == 0) + return 0; + + memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); + hdev->adv_data_len = len; + + cp.length = len; + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_DATA, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + +int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance) +{ + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + return 0; + + if (ext_adv_capable(hdev)) + return hci_set_ext_adv_data_sync(hdev, instance); + + return hci_set_adv_data_sync(hdev, instance); +} + int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; + struct hci_rp_le_set_ext_adv_params rp; bool connectable; u32 flags; bdaddr_t random_addr; @@ -1228,7 +1345,7 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) * Command Disallowed error, so we must first disable the * instance if it is active. */ - if (adv && !adv->pending) { + if (adv) { err = hci_disable_ext_adv_instance_sync(hdev, instance); if (err) return err; @@ -1261,10 +1378,12 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) hci_cpu_to_le24(adv->min_interval, cp.min_interval); hci_cpu_to_le24(adv->max_interval, cp.max_interval); cp.tx_power = adv->tx_power; + cp.sid = adv->sid; } else { hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE; + cp.sid = 0x00; } secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); @@ -1314,8 +1433,12 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) cp.secondary_phy = HCI_ADV_PHY_1M; } - err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, - sizeof(cp), &cp, HCI_CMD_TIMEOUT); + err = hci_set_ext_adv_params_sync(hdev, adv, &cp, &rp); + if (err) + return err; + + /* Update adv data as tx power is known now */ + err = hci_set_ext_adv_data_sync(hdev, cp.handle); if (err) return err; @@ -1559,7 +1682,8 @@ static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv) { u8 bid[3]; - u8 ad[4 + 3]; + u8 ad[HCI_MAX_EXT_AD_LENGTH]; + u8 len; /* Skip if NULL adv as instance 0x00 is used for general purpose * advertising so it cannot used for the likes of Broadcast Announcement @@ -1585,14 +1709,16 @@ static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv) /* Generate Broadcast ID */ get_random_bytes(bid, sizeof(bid)); - eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid)); - hci_set_adv_instance_data(hdev, adv->instance, sizeof(ad), ad, 0, NULL); + len = eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid)); + memcpy(ad + len, adv->adv_data, 
adv->adv_data_len); + hci_set_adv_instance_data(hdev, adv->instance, len + adv->adv_data_len, + ad, 0, NULL); return hci_update_adv_data_sync(hdev, adv->instance); } -int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, - u8 *data, u32 flags, u16 min_interval, +int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid, + u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval) { struct adv_info *adv = NULL; @@ -1603,9 +1729,28 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, if (instance) { adv = hci_find_adv_instance(hdev, instance); - /* Create an instance if that could not be found */ - if (!adv) { - adv = hci_add_per_instance(hdev, instance, flags, + if (adv) { + if (sid != HCI_SID_INVALID && adv->sid != sid) { + /* If the SID don't match attempt to find by + * SID. + */ + adv = hci_find_adv_sid(hdev, sid); + if (!adv) { + bt_dev_err(hdev, + "Unable to find adv_info"); + return -EINVAL; + } + } + + /* Turn it into periodic advertising */ + adv->periodic = true; + adv->per_adv_data_len = data_len; + if (data) + memcpy(adv->per_adv_data, data, data_len); + adv->flags = flags; + } else if (!adv) { + /* Create an instance if that could not be found */ + adv = hci_add_per_instance(hdev, instance, sid, flags, data_len, data, sync_interval, sync_interval); @@ -1798,78 +1943,6 @@ int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason) sizeof(cp), &cp, HCI_CMD_TIMEOUT); } -static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) -{ - DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length, - HCI_MAX_EXT_AD_LENGTH); - u8 len; - struct adv_info *adv = NULL; - int err; - - if (instance) { - adv = hci_find_adv_instance(hdev, instance); - if (!adv || !adv->adv_data_changed) - return 0; - } - - len = eir_create_adv_data(hdev, instance, pdu->data); - - pdu->length = len; - pdu->handle = adv ? 
adv->handle : instance; - pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; - pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; - - err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA, - struct_size(pdu, data, len), pdu, - HCI_CMD_TIMEOUT); - if (err) - return err; - - /* Update data if the command succeed */ - if (adv) { - adv->adv_data_changed = false; - } else { - memcpy(hdev->adv_data, pdu->data, len); - hdev->adv_data_len = len; - } - - return 0; -} - -static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance) -{ - struct hci_cp_le_set_adv_data cp; - u8 len; - - memset(&cp, 0, sizeof(cp)); - - len = eir_create_adv_data(hdev, instance, cp.data); - - /* There's nothing to do if the data hasn't changed */ - if (hdev->adv_data_len == len && - memcmp(cp.data, hdev->adv_data, len) == 0) - return 0; - - memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); - hdev->adv_data_len = len; - - cp.length = len; - - return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_DATA, - sizeof(cp), &cp, HCI_CMD_TIMEOUT); -} - -int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance) -{ - if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) - return 0; - - if (ext_adv_capable(hdev)) - return hci_set_ext_adv_data_sync(hdev, instance); - - return hci_set_adv_data_sync(hdev, instance); -} - int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance, bool force) { @@ -1945,13 +2018,10 @@ static int hci_clear_adv_sets_sync(struct hci_dev *hdev, struct sock *sk) static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force) { struct adv_info *adv, *n; - int err = 0; if (ext_adv_capable(hdev)) /* Remove all existing sets */ - err = hci_clear_adv_sets_sync(hdev, sk); - if (ext_adv_capable(hdev)) - return err; + return hci_clear_adv_sets_sync(hdev, sk); /* This is safe as long as there is no command send while the lock is * held. @@ -1979,13 +2049,11 @@ static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force) static int hci_remove_adv_sync(struct hci_dev *hdev, u8 instance, struct sock *sk) { - int err = 0; + int err; /* If we use extended advertising, instance has to be removed first. */ if (ext_adv_capable(hdev)) - err = hci_remove_ext_adv_instance_sync(hdev, instance, sk); - if (ext_adv_capable(hdev)) - return err; + return hci_remove_ext_adv_instance_sync(hdev, instance, sk); /* This is safe as long as there is no command send while the lock is * held. @@ -2084,16 +2152,13 @@ int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type) int hci_disable_advertising_sync(struct hci_dev *hdev) { u8 enable = 0x00; - int err = 0; /* If controller is not advertising we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; if (ext_adv_capable(hdev)) - err = hci_disable_ext_adv_instance_sync(hdev, 0x00); - if (ext_adv_capable(hdev)) - return err; + return hci_disable_ext_adv_instance_sync(hdev, 0x00); return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); @@ -2456,6 +2521,10 @@ static int hci_pause_advertising_sync(struct hci_dev *hdev) int err; int old_state; + /* If controller is not advertising we are done. */ + if (!hci_dev_test_flag(hdev, HCI_LE_ADV)) + return 0; + /* If already been paused there is nothing to do. 
*/ if (hdev->advertising_paused) return 0; @@ -2860,7 +2929,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (sent) { struct hci_conn *conn; - conn = hci_conn_hash_lookup_ba(hdev, ISO_LINK, + conn = hci_conn_hash_lookup_ba(hdev, PA_LINK, &sent->bdaddr); if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; @@ -3518,7 +3587,7 @@ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; - if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BDADDR_PROPERTY_BROKEN)) baswap(&hdev->public_addr, &ba); else bacpy(&hdev->public_addr, &ba); @@ -3593,7 +3662,7 @@ static int hci_init0_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); /* Reset */ - if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { + if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) { err = hci_reset_sync(hdev); if (err) return err; @@ -3606,7 +3675,7 @@ static int hci_unconf_init_sync(struct hci_dev *hdev) { int err; - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) return 0; err = hci_init0_sync(hdev); @@ -3649,7 +3718,7 @@ static int hci_read_local_cmds_sync(struct hci_dev *hdev) * supported commands. */ if (hdev->hci_ver > BLUETOOTH_VER_1_1 && - !test_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS)) return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL, HCI_CMD_TIMEOUT); @@ -3663,7 +3732,7 @@ static int hci_init1_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); /* Reset */ - if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { + if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) { err = hci_reset_sync(hdev); if (err) return err; @@ -3726,7 +3795,7 @@ static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type, if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; - if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; memset(&cp, 0, sizeof(cp)); @@ -3753,7 +3822,7 @@ static int hci_clear_event_filter_sync(struct hci_dev *hdev) * a hci_set_event_filter_sync() call succeeds, but we do * the check both for parity and as a future reminder. 
*/ - if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00, @@ -3777,7 +3846,7 @@ static int hci_write_sync_flowctl_sync(struct hci_dev *hdev) /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */ if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) || - !test_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED)) return 0; memset(&cp, 0, sizeof(cp)); @@ -3852,7 +3921,7 @@ static int hci_write_inquiry_mode_sync(struct hci_dev *hdev) u8 mode; if (!lmp_inq_rssi_capable(hdev) && - !test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE)) return 0; /* If Extended Inquiry Result events are supported, then @@ -4042,7 +4111,7 @@ static int hci_set_event_mask_sync(struct hci_dev *hdev) } if (lmp_inq_rssi_capable(hdev) || - test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE)) events[4] |= 0x02; /* Inquiry Result with RSSI */ if (lmp_ext_feat_capable(hdev)) @@ -4094,7 +4163,7 @@ static int hci_read_stored_link_key_sync(struct hci_dev *hdev) struct hci_cp_read_stored_link_key cp; if (!(hdev->commands[6] & 0x20) || - test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY)) return 0; memset(&cp, 0, sizeof(cp)); @@ -4143,7 +4212,7 @@ static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev) { if (!(hdev->commands[18] & 0x04) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || - test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING, @@ -4157,7 +4226,7 @@ static int hci_read_page_scan_type_sync(struct hci_dev *hdev) * this command in the bit mask of supported commands. */ if (!(hdev->commands[13] & 0x01) || - test_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE, @@ -4352,7 +4421,7 @@ static int hci_le_read_adv_tx_power_sync(struct hci_dev *hdev) static int hci_le_read_tx_power_sync(struct hci_dev *hdev) { if (!(hdev->commands[38] & 0x80) || - test_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_TRANSMIT_POWER, @@ -4395,7 +4464,7 @@ static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev) __le16 timeout = cpu_to_le16(hdev->rpa_timeout); if (!(hdev->commands[35] & 0x04) || - test_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT, @@ -4540,7 +4609,7 @@ static int hci_delete_stored_link_key_sync(struct hci_dev *hdev) * just disable this command. 
*/ if (!(hdev->commands[6] & 0x80) || - test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY)) return 0; memset(&cp, 0, sizeof(cp)); @@ -4666,7 +4735,7 @@ static int hci_set_err_data_report_sync(struct hci_dev *hdev) if (!(hdev->commands[18] & 0x08) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || - test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING)) return 0; if (enabled == hdev->err_data_reporting) @@ -4879,7 +4948,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev) size_t i; if (!hci_dev_test_flag(hdev, HCI_SETUP) && - !test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP)) return 0; bt_dev_dbg(hdev, ""); @@ -4890,7 +4959,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev) ret = hdev->setup(hdev); for (i = 0; i < ARRAY_SIZE(hci_broken_table); i++) { - if (test_bit(hci_broken_table[i].quirk, &hdev->quirks)) + if (hci_test_quirk(hdev, hci_broken_table[i].quirk)) bt_dev_warn(hdev, "%s", hci_broken_table[i].desc); } @@ -4898,10 +4967,10 @@ static int hci_dev_setup_sync(struct hci_dev *hdev) * BD_ADDR invalid before creating the HCI device or in * its setup callback. */ - invalid_bdaddr = test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || - test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + invalid_bdaddr = hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) || + hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY); if (!ret) { - if (test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) && !bacmp(&hdev->public_addr, BDADDR_ANY)) hci_dev_get_bd_addr_from_property(hdev); @@ -4923,7 +4992,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev) * In case any of them is set, the controller has to * start up as unconfigured. */ - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) || invalid_bdaddr) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); @@ -4983,7 +5052,7 @@ static int hci_dev_init_sync(struct hci_dev *hdev) * then they need to be reprogrammed after the init procedure * completed. */ - if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); @@ -5240,7 +5309,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) && !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); hci_reset_sync(hdev); @@ -5424,7 +5493,7 @@ static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn, { struct hci_cp_disconnect cp; - if (test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) { + if (conn->type == BIS_LINK || conn->type == PA_LINK) { /* This is a BIS connection, hci_conn_del will * do the necessary cleanup. 
*/ @@ -5477,7 +5546,7 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, if (conn->type == LE_LINK) return hci_le_connect_cancel_sync(hdev, conn, reason); - if (conn->type == ISO_LINK) { + if (conn->type == CIS_LINK) { /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 1857: * @@ -5490,9 +5559,10 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, return hci_disconnect_sync(hdev, conn, reason); /* CIS with no Create CIS sent have nothing to cancel */ - if (bacmp(&conn->dst, BDADDR_ANY)) - return HCI_ERROR_LOCAL_HOST_TERM; + return HCI_ERROR_LOCAL_HOST_TERM; + } + if (conn->type == BIS_LINK || conn->type == PA_LINK) { /* There is no way to cancel a BIS without terminating the BIG * which is done later on connection cleanup. */ @@ -5554,9 +5624,12 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, { struct hci_cp_reject_conn_req cp; - if (conn->type == ISO_LINK) + if (conn->type == CIS_LINK) return hci_le_reject_cis_sync(hdev, conn, reason); + if (conn->type == BIS_LINK || conn->type == PA_LINK) + return -EINVAL; + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) return hci_reject_sco_sync(hdev, conn, reason); @@ -5604,7 +5677,7 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) } /* Cleanup hci_conn object if it cannot be cancelled as it - * likelly means the controller and host stack are out of sync + * likely means the controller and host stack are out of sync * or in case of LE it was still scanning so it can be cleanup * safely. */ @@ -5886,7 +5959,7 @@ static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval) own_addr_type = ADDR_LE_DEV_PUBLIC; if (hci_is_adv_monitoring(hdev) || - (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) && + (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER) && hdev->discovery.result_filtering)) { /* Duplicate filter should be disabled when some advertisement * monitor is activated, otherwise AdvMon can only receive one @@ -5949,8 +6022,7 @@ int hci_start_discovery_sync(struct hci_dev *hdev) * and LE scanning are done sequentially with separate * timeouts. */ - if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, - &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) { timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); /* During simultaneous discovery, we double LE scan * interval. We must leave some time for the controller @@ -6027,7 +6099,7 @@ static int hci_update_event_filter_sync(struct hci_dev *hdev) /* Some fake CSR controllers lock up after setting this type of * filter, so avoid sending the request altogether. 
*/ - if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; /* Always clear event filter when starting */ @@ -6044,7 +6116,7 @@ static int hci_update_event_filter_sync(struct hci_dev *hdev) &b->bdaddr, HCI_CONN_SETUP_AUTO_ON); if (err) - bt_dev_dbg(hdev, "Failed to set event filter for %pMR", + bt_dev_err(hdev, "Failed to set event filter for %pMR", &b->bdaddr); else scan = SCAN_PAGE; @@ -6248,6 +6320,7 @@ static int hci_le_ext_directed_advertising_sync(struct hci_dev *hdev, struct hci_conn *conn) { struct hci_cp_le_set_ext_adv_params cp; + struct hci_rp_le_set_ext_adv_params rp; int err; bdaddr_t random_addr; u8 own_addr_type; @@ -6289,8 +6362,12 @@ static int hci_le_ext_directed_advertising_sync(struct hci_dev *hdev, if (err) return err; - err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, - sizeof(cp), &cp, HCI_CMD_TIMEOUT); + err = hci_set_ext_adv_params_sync(hdev, NULL, &cp, &rp); + if (err) + return err; + + /* Update adv data as tx power is known now */ + err = hci_set_ext_adv_data_sync(hdev, cp.handle); if (err) return err; @@ -6737,8 +6814,8 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, return 0; } - /* No privacy so use a public address. */ - *own_addr_type = ADDR_LE_DEV_PUBLIC; + /* No privacy, use the current address */ + hci_copy_identity_address(hdev, rand_addr, own_addr_type); return 0; } @@ -6898,20 +6975,37 @@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, static void create_pa_complete(struct hci_dev *hdev, void *data, int err) { + struct hci_conn *conn = data; + struct hci_conn *pa_sync; + bt_dev_dbg(hdev, "err %d", err); - if (!err) + if (err == -ECANCELED) return; + hci_dev_lock(hdev); + hci_dev_clear_flag(hdev, HCI_PA_SYNC); - if (err == -ECANCELED) - return; + if (!hci_conn_valid(hdev, conn)) + clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); - hci_dev_lock(hdev); + if (!err) + goto unlock; - hci_update_passive_scan_sync(hdev); + /* Add connection to indicate PA sync error */ + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, + HCI_ROLE_SLAVE); + if (IS_ERR(pa_sync)) + goto unlock; + + set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags); + + /* Notify iso layer */ + hci_connect_cfm(pa_sync, bt_status(err)); + +unlock: hci_dev_unlock(hdev); } @@ -6925,9 +7019,23 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) if (!hci_conn_valid(hdev, conn)) return -ECANCELED; + if (conn->sync_handle != HCI_SYNC_HANDLE_INVALID) + return -EINVAL; + if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) return -EBUSY; + /* Stop scanning if SID has not been set and active scanning is enabled + * so we use passive scanning which will be scanning using the allow + * list programmed to contain only the connection address. + */ + if (conn->sid == HCI_SID_INVALID && + hci_dev_test_flag(hdev, HCI_LE_SCAN)) { + hci_scan_disable_sync(hdev); + hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + } + /* Mark HCI_CONN_CREATE_PA_SYNC so hci_update_passive_scan_sync can * program the address in the allow list so PA advertisements can be * received. @@ -6936,6 +7044,14 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) hci_update_passive_scan_sync(hdev); + /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update + * it. 
+ */ + if (conn->sid == HCI_SID_INVALID) + __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, + HCI_EV_LE_EXT_ADV_REPORT, + conn->conn_timeout, NULL); + memset(&cp, 0, sizeof(cp)); cp.options = qos->bcast.options; cp.sid = conn->sid; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index fc5af8639b1ef..6724adce615b6 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -405,7 +405,7 @@ static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum, static void hidp_idle_timeout(struct timer_list *t) { - struct hidp_session *session = from_timer(session, t, timer); + struct hidp_session *session = timer_container_of(session, t, timer); /* The HIDP user-space API only contains calls to add and remove * devices. There is no way to forward events of any kind. Therefore, diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 2819cda616bce..7bd3aa0a6db9a 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -336,7 +336,7 @@ static int iso_connect_bis(struct sock *sk) struct hci_dev *hdev; int err; - BT_DBG("%pMR", &iso_pi(sk)->src); + BT_DBG("%pMR (SID 0x%2.2x)", &iso_pi(sk)->src, iso_pi(sk)->bc_sid); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); @@ -365,7 +365,7 @@ static int iso_connect_bis(struct sock *sk) /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { - hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, + hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { @@ -375,12 +375,16 @@ static int iso_connect_bis(struct sock *sk) } else { hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); + iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, + iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } + + /* Update SID if it was not set */ + if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) + iso_pi(sk)->bc_sid = hcon->sid; } conn = iso_conn_add(hcon); @@ -409,7 +413,7 @@ static int iso_connect_bis(struct sock *sk) sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; - iso_sock_set_timer(sk, sk->sk_sndtimeo); + iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } release_sock(sk); @@ -499,7 +503,7 @@ static int iso_connect_cis(struct sock *sk) sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; - iso_sock_set_timer(sk, sk->sk_sndtimeo); + iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } release_sock(sk); @@ -941,7 +945,7 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type; - if (sa->iso_bc->bc_sid > 0x0f) + if (sa->iso_bc->bc_sid > 0x0f && sa->iso_bc->bc_sid != HCI_SID_INVALID) return -EINVAL; iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid; @@ -1330,20 +1334,32 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; + int len = sizeof(struct sockaddr_iso); BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; if (peer) { + struct hci_conn *hcon = iso_pi(sk)->conn ? 
+ iso_pi(sk)->conn->hcon : NULL; + bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; + + if (hcon && hcon->type == BIS_LINK) { + sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid; + sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis; + memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis, + ISO_MAX_NUM_BIS); + len += sizeof(struct sockaddr_iso_bc); + } } else { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src); sa->iso_bdaddr_type = iso_pi(sk)->src_type; } - return sizeof(struct sockaddr_iso); + return len; } static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, @@ -1671,6 +1687,17 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; + case BT_PKT_SEQNUM: + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) + break; + + if (opt) + set_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags); + else + clear_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags); + break; + case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2 && @@ -1875,7 +1902,7 @@ static void iso_sock_ready(struct sock *sk) static bool iso_match_big(struct sock *sk, void *data) { - struct hci_evt_le_big_sync_estabilished *ev = data; + struct hci_evt_le_big_sync_established *ev = data; return ev->handle == iso_pi(sk)->qos.bcast.big; } @@ -1896,7 +1923,7 @@ static void iso_conn_ready(struct iso_conn *conn) { struct sock *parent = NULL; struct sock *sk = conn->sk; - struct hci_ev_le_big_sync_estabilished *ev = NULL; + struct hci_ev_le_big_sync_established *ev = NULL; struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_ev_le_per_adv_report *ev3 = NULL; struct hci_conn *hcon; @@ -1988,11 +2015,13 @@ static void iso_conn_ready(struct iso_conn *conn) hcon->dst_type = iso_pi(parent)->dst_type; } - if (ev3) { + if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { iso_pi(sk)->qos = iso_pi(parent)->qos; hcon->iso_qos = iso_pi(sk)->qos; + iso_pi(sk)->bc_sid = iso_pi(parent)->bc_sid; iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis; - memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS); + memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, + ISO_MAX_NUM_BIS); set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } @@ -2005,7 +2034,7 @@ static void iso_conn_ready(struct iso_conn *conn) hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); - if ((ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) || + if ((ev && ((struct hci_evt_le_big_sync_established *)ev)->status) || (ev2 && ev2->status)) { /* Trigger error signal on child socket */ sk->sk_err = ECONNREFUSED; @@ -2029,6 +2058,9 @@ static bool iso_match_sid(struct sock *sk, void *data) { struct hci_ev_le_pa_sync_established *ev = data; + if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) + return true; + return ev->sid == iso_pi(sk)->bc_sid; } @@ -2061,7 +2093,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) * proceed to establishing a BIG sync: * * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific - * SID to listen to and once sync is estabilished its handle needs to + * SID to listen to and once sync is established its handle needs to * be stored in iso_pi(sk)->sync_handle so it can be matched once * receiving the BIG Info. * 2. 
HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a @@ -2075,8 +2107,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (ev1) { sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); - if (sk && !ev1->status) + if (sk && !ev1->status) { iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); + iso_pi(sk)->bc_sid = ev1->sid; + } goto done; } @@ -2203,7 +2237,8 @@ done: static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { - if (hcon->type != ISO_LINK) { + if (hcon->type != CIS_LINK && hcon->type != BIS_LINK && + hcon->type != PA_LINK) { if (hcon->type != LE_LINK) return; @@ -2244,7 +2279,8 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) { - if (hcon->type != ISO_LINK) + if (hcon->type != CIS_LINK && hcon->type != BIS_LINK && + hcon->type != PA_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); @@ -2255,7 +2291,8 @@ static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct iso_conn *conn = hcon->iso_data; - __u16 pb, ts, len; + struct skb_shared_hwtstamps *hwts; + __u16 pb, ts, len, sn; if (!conn) goto drop; @@ -2278,13 +2315,17 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (ts) { struct hci_iso_ts_data_hdr *hdr; - /* TODO: add timestamp to the packet? */ hdr = skb_pull_data(skb, HCI_ISO_TS_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } + /* Record the timestamp to skb */ + hwts = skb_hwtstamps(skb); + hwts->hwtstamp = us_to_ktime(le32_to_cpu(hdr->ts)); + + sn = __le16_to_cpu(hdr->sn); len = __le16_to_cpu(hdr->slen); } else { struct hci_iso_data_hdr *hdr; @@ -2295,18 +2336,20 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) goto drop; } + sn = __le16_to_cpu(hdr->sn); len = __le16_to_cpu(hdr->slen); } flags = hci_iso_data_flags(len); len = hci_iso_data_len(len); - BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x", len, - skb->len, flags); + BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x sn %d", + len, skb->len, flags, sn); if (len == skb->len) { /* Complete frame received */ hci_skb_pkt_status(skb) = flags & 0x03; + hci_skb_pkt_seqnum(skb) = sn; iso_recv_frame(conn, skb); return; } @@ -2329,9 +2372,17 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) goto drop; hci_skb_pkt_status(conn->rx_skb) = flags & 0x03; + hci_skb_pkt_seqnum(conn->rx_skb) = sn; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len = len - skb->len; + + /* Copy hw timestamp from skb to rx_skb if present */ + if (ts) { + hwts = skb_hwtstamps(conn->rx_skb); + hwts->hwtstamp = skb_hwtstamps(skb)->hwtstamp; + } + break; case ISO_CONT: diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 042d3ac3b4a38..805c752ac0a9d 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -3415,7 +3415,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; struct l2cap_conf_efs efs; u8 remote_efs = 0; - u16 mtu = L2CAP_DEFAULT_MTU; + u16 mtu = 0; u16 result = L2CAP_CONF_SUCCESS; u16 size; @@ -3520,6 +3520,29 @@ done: /* Configure output options and let the other side know * which ones we don't like. 
*/ + /* If MTU is not provided in configure request, try adjusting it + * to the current output MTU if it has been set + * + * Bluetooth Core 6.1, Vol 3, Part A, Section 4.5 + * + * Each configuration parameter value (if any is present) in an + * L2CAP_CONFIGURATION_RSP packet reflects an ‘adjustment’ to a + * configuration parameter value that has been sent (or, in case + * of default values, implied) in the corresponding + * L2CAP_CONFIGURATION_REQ packet. + */ + if (!mtu) { + /* Only adjust for ERTM channels as for older modes the + * remote stack may not be able to detect that the + * adjustment causing it to silently drop packets. + */ + if (chan->mode == L2CAP_MODE_ERTM && + chan->omtu && chan->omtu != L2CAP_DEFAULT_MTU) + mtu = chan->omtu; + else + mtu = L2CAP_DEFAULT_MTU; + } + if (mtu < L2CAP_DEFAULT_MIN_MTU) result = L2CAP_CONF_UNACCEPT; else { @@ -4870,7 +4893,8 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, if (!smp_sufficient_security(conn->hcon, pchan->sec_level, SMP_ALLOW_STK)) { - result = L2CAP_CR_LE_AUTHENTICATION; + result = pchan->sec_level == BT_SECURITY_MEDIUM ? + L2CAP_CR_LE_ENCRYPTION : L2CAP_CR_LE_AUTHENTICATION; chan = NULL; goto response_unlock; } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 5aa55fa695943..f4257c4d30525 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -255,7 +255,7 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), &la.l2_bdaddr, la.l2_bdaddr_type, - sk->sk_sndtimeo); + READ_ONCE(sk->sk_sndtimeo)); if (err) return err; @@ -1703,6 +1703,9 @@ static void l2cap_sock_resume_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (!sk) + return; + if (test_and_clear_bit(FLAG_PENDING_SECURITY, &chan->flags)) { sk->sk_state = BT_CONNECTED; chan->state = BT_CONNECTED; @@ -1725,7 +1728,7 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; - return sk->sk_sndtimeo; + return READ_ONCE(sk->sk_sndtimeo); } static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan) diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 43aa01fd07b98..305044a844783 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL(baswap); * bt_to_errno() - Bluetooth error codes to standard errno * @code: Bluetooth error code to be converted * - * This function takes a Bluetooth error code as input and convets + * This function takes a Bluetooth error code as input and converts * it to an equivalent Unix/standard errno value. * * Return: diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 46b22708dfbd2..1ce682038b519 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -464,7 +464,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, /* Devices marked as raw-only are neither configured * nor unconfigured controllers. */ - if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) + if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE)) continue; if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) { @@ -522,7 +522,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, /* Devices marked as raw-only are neither configured * nor unconfigured controllers. 
*/ - if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) + if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE)) continue; if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) { @@ -576,7 +576,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, /* Devices marked as raw-only are neither configured * nor unconfigured controllers. */ - if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) + if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE)) continue; if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) @@ -612,12 +612,12 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, static bool is_configured(struct hci_dev *hdev) { - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) && !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED)) return false; - if ((test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || - test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) && + if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) || + hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) && !bacmp(&hdev->public_addr, BDADDR_ANY)) return false; @@ -628,12 +628,12 @@ static __le32 get_missing_options(struct hci_dev *hdev) { u32 options = 0; - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) && !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED)) options |= MGMT_OPTION_EXTERNAL_CONFIG; - if ((test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || - test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) && + if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) || + hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) && !bacmp(&hdev->public_addr, BDADDR_ANY)) options |= MGMT_OPTION_PUBLIC_ADDRESS; @@ -669,7 +669,7 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev, memset(&rp, 0, sizeof(rp)); rp.manufacturer = cpu_to_le16(hdev->manufacturer); - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG)) options |= MGMT_OPTION_EXTERNAL_CONFIG; if (hdev->set_bdaddr) @@ -828,8 +828,7 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (lmp_sc_capable(hdev)) settings |= MGMT_SETTING_SECURE_CONN; - if (test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, - &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED)) settings |= MGMT_SETTING_WIDEBAND_SPEECH; } @@ -841,8 +840,7 @@ static u32 get_supported_settings(struct hci_dev *hdev) settings |= MGMT_SETTING_ADVERTISING; } - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || - hdev->set_bdaddr) + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) || hdev->set_bdaddr) settings |= MGMT_SETTING_CONFIGURATION; if (cis_central_capable(hdev)) @@ -1080,7 +1078,8 @@ static int mesh_send_done_sync(struct hci_dev *hdev, void *data) struct mgmt_mesh_tx *mesh_tx; hci_dev_clear_flag(hdev, HCI_MESH_SENDING); - hci_disable_advertising_sync(hdev); + if (list_empty(&hdev->adv_instances)) + hci_disable_advertising_sync(hdev); mesh_tx = mgmt_mesh_next(hdev, NULL); if (mesh_tx) @@ -1447,22 +1446,17 @@ static void settings_rsp(struct mgmt_pending_cmd *cmd, void *data) send_settings_rsp(cmd->sk, cmd->opcode, match->hdev); - list_del(&cmd->list); - if (match->sk == NULL) { match->sk = cmd->sk; sock_hold(match->sk); } - - mgmt_pending_free(cmd); } static void cmd_status_rsp(struct mgmt_pending_cmd *cmd, void *data) { u8 *status = data; - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, *status); - mgmt_pending_remove(cmd); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, *status); 
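The mgmt_pending_foreach() callers above now pass an extra boolean that asks the helper to unlink and free each matched command itself, instead of every callback ending with mgmt_pending_remove(). Below is a minimal userspace sketch of that pattern, assuming a pthread mutex and a hand-rolled singly linked list; the names (pending_cmd, pending_foreach) are illustrative analogues, not the kernel API.

/* Sketch only: lock the pending list, walk it, optionally unlink and
 * free each matched entry around the callback.  Build: cc -pthread
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pending_cmd {
	struct pending_cmd *next;
	unsigned short opcode;
};

static struct pending_cmd *pending_head;
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;

static void pending_add(unsigned short opcode)
{
	struct pending_cmd *cmd = calloc(1, sizeof(*cmd));

	if (!cmd)
		return;

	cmd->opcode = opcode;

	pthread_mutex_lock(&pending_lock);
	cmd->next = pending_head;
	pending_head = cmd;
	pthread_mutex_unlock(&pending_lock);
}

/* opcode == 0 matches every entry, mirroring the callers above */
static void pending_foreach(unsigned short opcode, int remove,
			    void (*cb)(struct pending_cmd *cmd, void *data),
			    void *data)
{
	struct pending_cmd **link, *cmd;

	pthread_mutex_lock(&pending_lock);

	link = &pending_head;
	while ((cmd = *link) != NULL) {
		if (opcode && cmd->opcode != opcode) {
			link = &cmd->next;
			continue;
		}

		if (remove)
			*link = cmd->next;	/* unlink before the callback */
		else
			link = &cmd->next;

		cb(cmd, data);

		if (remove)
			free(cmd);		/* free after the callback */
	}

	pthread_mutex_unlock(&pending_lock);
}

static void print_cmd(struct pending_cmd *cmd, void *data)
{
	printf("opcode 0x%04x (%s)\n", cmd->opcode, (const char *)data);
}

int main(void)
{
	pending_add(0x0005);
	pending_add(0x0019);
	pending_add(0x0005);

	pending_foreach(0x0005, 1, print_cmd, "removed");
	pending_foreach(0, 0, print_cmd, "left in place");
	return 0;
}

Unlinking before the callback and freeing after it mirrors the ordering in the mgmt_util.c hunk in this series, so the callback can still read the entry while no other walker of the list can reach it.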
} static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data) @@ -1476,8 +1470,6 @@ static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data) if (cmd->cmd_complete) { cmd->cmd_complete(cmd, match->mgmt_status); - mgmt_pending_remove(cmd); - return; } @@ -1486,13 +1478,13 @@ static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data) static int generic_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { - return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, + return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, cmd->param, cmd->param_len); } static int addr_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { - return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, + return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, cmd->param, sizeof(struct mgmt_addr_info)); } @@ -1532,7 +1524,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, if (err) { u8 mgmt_err = mgmt_status(err); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); goto done; } @@ -1707,7 +1699,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, if (err) { u8 mgmt_err = mgmt_status(err); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); goto done; } @@ -1943,8 +1935,8 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) new_settings(hdev, NULL); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, cmd_status_rsp, - &mgmt_err); + mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, + cmd_status_rsp, &mgmt_err); return; } @@ -1954,7 +1946,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, settings_rsp, &match); if (changed) new_settings(hdev, match.sk); @@ -2074,12 +2066,12 @@ static void set_le_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); if (status) { - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, cmd_status_rsp, - &status); + mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, cmd_status_rsp, + &status); return; } - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, settings_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, settings_rsp, &match); new_settings(hdev, match.sk); @@ -2138,7 +2130,7 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) struct sock *sk = cmd->sk; if (status) { - mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, + mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, cmd_status_rsp, &status); return; } @@ -2160,6 +2152,9 @@ static int set_mesh_sync(struct hci_dev *hdev, void *data) else hci_dev_clear_flag(hdev, HCI_MESH); + hdev->le_scan_interval = __le16_to_cpu(cp->period); + hdev->le_scan_window = __le16_to_cpu(cp->window); + len -= sizeof(*cp); /* If filters don't fit, forward all adv pkts */ @@ -2174,6 +2169,7 @@ static int set_mesh(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_mesh *cp = data; struct mgmt_pending_cmd *cmd; + __u16 period, window; int err = 0; bt_dev_dbg(hdev, "sock %p", sk); @@ -2187,6 +2183,23 @@ static int set_mesh(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) return 
mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, MGMT_STATUS_INVALID_PARAMS); + /* Keep allowed ranges in sync with set_scan_params() */ + period = __le16_to_cpu(cp->period); + + if (period < 0x0004 || period > 0x4000) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + MGMT_STATUS_INVALID_PARAMS); + + window = __le16_to_cpu(cp->window); + + if (window < 0x0004 || window > 0x4000) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + MGMT_STATUS_INVALID_PARAMS); + + if (window > period) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + MGMT_STATUS_INVALID_PARAMS); + hci_dev_lock(hdev); cmd = mgmt_pending_add(sk, MGMT_OP_SET_MESH_RECEIVER, hdev, data, len); @@ -2566,7 +2579,8 @@ static int mgmt_hci_cmd_sync(struct sock *sk, struct hci_dev *hdev, struct mgmt_pending_cmd *cmd; int err; - if (len < sizeof(*cp)) + if (len != (offsetof(struct mgmt_cp_hci_cmd_sync, params) + + le16_to_cpu(cp->params_len))) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, MGMT_STATUS_INVALID_PARAMS); @@ -2637,7 +2651,7 @@ static void mgmt_class_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), hdev->dev_class, 3); mgmt_pending_free(cmd); @@ -3221,7 +3235,9 @@ failed: static u8 link_to_bdaddr(u8 link_type, u8 addr_type) { switch (link_type) { - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: case LE_LINK: switch (addr_type) { case ADDR_LE_DEV_PUBLIC: @@ -3425,7 +3441,7 @@ static int pairing_complete(struct mgmt_pending_cmd *cmd, u8 status) bacpy(&rp.addr.bdaddr, &conn->dst); rp.addr.type = link_to_bdaddr(conn->type, conn->dst_type); - err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE, + err = mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_PAIR_DEVICE, status, &rp, sizeof(rp)); /* So we don't get further callbacks for this connection */ @@ -4290,7 +4306,7 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, bt_dev_dbg(hdev, "sock %p", sk); - if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks)) + if (!hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_WIDEBAND_SPEECH, MGMT_STATUS_NOT_SUPPORTED); @@ -5106,24 +5122,14 @@ static void mgmt_adv_monitor_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk); } -void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle) +static void mgmt_adv_monitor_removed(struct sock *sk, struct hci_dev *hdev, + __le16 handle) { struct mgmt_ev_adv_monitor_removed ev; - struct mgmt_pending_cmd *cmd; - struct sock *sk_skip = NULL; - struct mgmt_cp_remove_adv_monitor *cp; - - cmd = pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev); - if (cmd) { - cp = cmd->param; - if (cp->monitor_handle) - sk_skip = cmd->sk; - } - - ev.monitor_handle = cpu_to_le16(handle); + ev.monitor_handle = handle; - mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk_skip); + mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk); } static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, @@ -5194,7 +5200,7 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, hci_update_passive_scan(hdev); } - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(status), &rp, sizeof(rp)); 
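set_mesh() above now rejects scan periods and windows outside 0x0004..0x4000 and windows larger than the period, with a comment to keep the ranges in sync with set_scan_params(). Here is a standalone sketch of just that check, using the same bounds; the function name is illustrative, not the kernel helper.

/* Illustrative only: the range check added to set_mesh() above. */
#include <stdint.h>
#include <stdio.h>

static int mesh_scan_params_valid(uint16_t period, uint16_t window)
{
	if (period < 0x0004 || period > 0x4000)
		return 0;

	if (window < 0x0004 || window > 0x4000)
		return 0;

	if (window > period)
		return 0;

	return 1;
}

int main(void)
{
	/* window fits inside the period: accepted */
	printf("%d\n", mesh_scan_params_valid(0x0100, 0x0010));
	/* window larger than the period: rejected */
	printf("%d\n", mesh_scan_params_valid(0x0010, 0x0100));
	/* out-of-range period: rejected */
	printf("%d\n", mesh_scan_params_valid(0x8000, 0x0010));
	return 0;
}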
mgmt_pending_remove(cmd); @@ -5225,8 +5231,7 @@ static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, if (pending_find(MGMT_OP_SET_LE, hdev) || pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) || - pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev) || - pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) { + pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) { status = MGMT_STATUS_BUSY; goto unlock; } @@ -5396,8 +5401,7 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_remove_adv_monitor *cp; - if (status == -ECANCELED || - cmd != pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) + if (status == -ECANCELED) return; hci_dev_lock(hdev); @@ -5406,12 +5410,14 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, rp.monitor_handle = cp->monitor_handle; - if (!status) + if (!status) { + mgmt_adv_monitor_removed(cmd->sk, hdev, cp->monitor_handle); hci_update_passive_scan(hdev); + } - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(status), &rp, sizeof(rp)); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); bt_dev_dbg(hdev, "remove monitor %d complete, status %d", @@ -5421,10 +5427,6 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, static int mgmt_remove_adv_monitor_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - - if (cmd != pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) - return -ECANCELED; - struct mgmt_cp_remove_adv_monitor *cp = cmd->param; u16 handle = __le16_to_cpu(cp->monitor_handle); @@ -5443,14 +5445,13 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, hci_dev_lock(hdev); if (pending_find(MGMT_OP_SET_LE, hdev) || - pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev) || pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) || pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) { status = MGMT_STATUS_BUSY; goto unlock; } - cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len); + cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len); if (!cmd) { status = MGMT_STATUS_NO_RESOURCES; goto unlock; @@ -5460,7 +5461,7 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, mgmt_remove_adv_monitor_complete); if (err) { - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); if (err == -ENOMEM) status = MGMT_STATUS_NO_RESOURCES; @@ -5790,7 +5791,7 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) return; - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); mgmt_pending_remove(cmd); @@ -6011,7 +6012,7 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); mgmt_pending_remove(cmd); @@ -6236,7 +6237,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) u8 status = mgmt_status(err); if (status) { - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, + mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, cmd_status_rsp, &status); return; } @@ -6246,7 +6247,7 @@ static void set_advertising_complete(struct 
hci_dev *hdev, void *data, int err) else hci_dev_clear_flag(hdev, HCI_ADVERTISING); - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, settings_rsp, + mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, settings_rsp, &match); new_settings(hdev, match.sk); @@ -6452,6 +6453,7 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, MGMT_STATUS_NOT_SUPPORTED); + /* Keep allowed ranges in sync with set_mesh() */ interval = __le16_to_cpu(cp->interval); if (interval < 0x0004 || interval > 0x4000) @@ -6590,7 +6592,7 @@ static void set_bredr_complete(struct hci_dev *hdev, void *data, int err) */ hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); } else { send_settings_rsp(cmd->sk, MGMT_OP_SET_BREDR, hdev); new_settings(hdev, cmd->sk); @@ -6727,7 +6729,7 @@ static void set_secure_conn_complete(struct hci_dev *hdev, void *data, int err) if (err) { u8 mgmt_err = mgmt_status(err); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); goto done; } @@ -7174,7 +7176,7 @@ static void get_conn_info_complete(struct hci_dev *hdev, void *data, int err) rp.max_tx_power = HCI_TX_POWER_INVALID; } - mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO, status, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_GET_CONN_INFO, status, &rp, sizeof(rp)); mgmt_pending_free(cmd); @@ -7334,7 +7336,7 @@ static void get_clock_info_complete(struct hci_dev *hdev, void *data, int err) } complete: - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, &rp, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, &rp, sizeof(rp)); mgmt_pending_free(cmd); @@ -7932,7 +7934,7 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, MGMT_STATUS_INVALID_PARAMS); - if (!test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks)) + if (!hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, MGMT_STATUS_NOT_SUPPORTED); @@ -8584,10 +8586,10 @@ static void add_advertising_complete(struct hci_dev *hdev, void *data, int err) rp.instance = cp->instance; if (err) - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err)); else - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), &rp, sizeof(rp)); add_adv_complete(hdev, cmd->sk, cp->instance, err); @@ -8775,10 +8777,10 @@ static void add_ext_adv_params_complete(struct hci_dev *hdev, void *data, hci_remove_adv_instance(hdev, cp->instance); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err)); } else { - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), &rp, sizeof(rp)); } @@ -8925,10 +8927,10 @@ static void add_ext_adv_data_complete(struct hci_dev *hdev, void *data, int err) rp.instance = cp->instance; if (err) - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err)); else - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), &rp, 
sizeof(rp)); mgmt_pending_free(cmd); @@ -9087,10 +9089,10 @@ static void remove_advertising_complete(struct hci_dev *hdev, void *data, rp.instance = cp->instance; if (err) - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err)); else - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); mgmt_pending_free(cmd); @@ -9335,7 +9337,7 @@ void mgmt_index_added(struct hci_dev *hdev) { struct mgmt_ev_ext_index ev; - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) return; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { @@ -9359,10 +9361,10 @@ void mgmt_index_removed(struct hci_dev *hdev) struct mgmt_ev_ext_index ev; struct cmd_lookup match = { NULL, hdev, MGMT_STATUS_INVALID_INDEX }; - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) return; - mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &match); + mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match); if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0, @@ -9400,7 +9402,8 @@ void mgmt_power_on(struct hci_dev *hdev, int err) hci_update_passive_scan(hdev); } - mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp, + &match); new_settings(hdev, match.sk); @@ -9415,7 +9418,8 @@ void __mgmt_power_off(struct hci_dev *hdev) struct cmd_lookup match = { NULL, hdev }; u8 zero_cod[] = { 0, 0, 0 }; - mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp, + &match); /* If the power off is because of hdev unregistration let * use the appropriate INVALID_INDEX status. 
Otherwise use @@ -9429,7 +9433,7 @@ void __mgmt_power_off(struct hci_dev *hdev) else match.mgmt_status = MGMT_STATUS_NOT_POWERED; - mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &match); + mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match); if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, @@ -9670,7 +9674,6 @@ static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data) device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk); cmd->cmd_complete(cmd, 0); - mgmt_pending_remove(cmd); } bool mgmt_powering_down(struct hci_dev *hdev) @@ -9726,8 +9729,8 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, struct mgmt_cp_disconnect *cp; struct mgmt_pending_cmd *cmd; - mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp, - hdev); + mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, true, + unpair_device_rsp, hdev); cmd = pending_find(MGMT_OP_DISCONNECT, hdev); if (!cmd) @@ -9920,7 +9923,7 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) if (status) { u8 mgmt_err = mgmt_status(status); - mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, + mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true, cmd_status_rsp, &mgmt_err); return; } @@ -9930,8 +9933,8 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) else changed = hci_dev_test_and_clear_flag(hdev, HCI_LINK_SECURITY); - mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, settings_rsp, - &match); + mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true, + settings_rsp, &match); if (changed) new_settings(hdev, match.sk); @@ -9955,9 +9958,12 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, { struct cmd_lookup match = { NULL, hdev, mgmt_status(status) }; - mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, sk_lookup, &match); - mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); - mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); + mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, false, sk_lookup, + &match); + mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, false, sk_lookup, + &match); + mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, false, sk_lookup, + &match); if (!status) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, @@ -10082,7 +10088,7 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, if (hdev->discovery.rssi != HCI_RSSI_INVALID && (rssi == HCI_RSSI_INVALID || (rssi < hdev->discovery.rssi && - !test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)))) + !hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER)))) return false; if (hdev->discovery.uuid_count != 0) { @@ -10100,7 +10106,7 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, /* If duplicate filtering does not report RSSI changes, then restart * scanning to ensure updated result with updated RSSI values. */ - if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER)) { /* Validate RSSI value against the RSSI threshold once more. 
*/ if (hdev->discovery.rssi != HCI_RSSI_INVALID && rssi < hdev->discovery.rssi) diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index e5ff65e424b5b..a88a07da39473 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -217,30 +217,47 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, struct hci_dev *hdev) { - struct mgmt_pending_cmd *cmd; + struct mgmt_pending_cmd *cmd, *tmp; + + mutex_lock(&hdev->mgmt_pending_lock); - list_for_each_entry(cmd, &hdev->mgmt_pending, list) { + list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (hci_sock_get_channel(cmd->sk) != channel) continue; - if (cmd->opcode == opcode) + + if (cmd->opcode == opcode) { + mutex_unlock(&hdev->mgmt_pending_lock); return cmd; + } } + mutex_unlock(&hdev->mgmt_pending_lock); + return NULL; } -void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, +void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data) { struct mgmt_pending_cmd *cmd, *tmp; + mutex_lock(&hdev->mgmt_pending_lock); + list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (opcode > 0 && cmd->opcode != opcode) continue; + if (remove) + list_del(&cmd->list); + cb(cmd, data); + + if (remove) + mgmt_pending_free(cmd); } + + mutex_unlock(&hdev->mgmt_pending_lock); } struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, @@ -254,7 +271,7 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, return NULL; cmd->opcode = opcode; - cmd->index = hdev->id; + cmd->hdev = hdev; cmd->param = kmemdup(data, len, GFP_KERNEL); if (!cmd->param) { @@ -280,7 +297,9 @@ struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, if (!cmd) return NULL; + mutex_lock(&hdev->mgmt_pending_lock); list_add_tail(&cmd->list, &hdev->mgmt_pending); + mutex_unlock(&hdev->mgmt_pending_lock); return cmd; } @@ -294,7 +313,10 @@ void mgmt_pending_free(struct mgmt_pending_cmd *cmd) void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) { + mutex_lock(&cmd->hdev->mgmt_pending_lock); list_del(&cmd->list); + mutex_unlock(&cmd->hdev->mgmt_pending_lock); + mgmt_pending_free(cmd); } @@ -304,7 +326,7 @@ void mgmt_mesh_foreach(struct hci_dev *hdev, { struct mgmt_mesh_tx *mesh_tx, *tmp; - list_for_each_entry_safe(mesh_tx, tmp, &hdev->mgmt_pending, list) { + list_for_each_entry_safe(mesh_tx, tmp, &hdev->mesh_pending, list) { if (!sk || mesh_tx->sk == sk) cb(mesh_tx, data); } diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index f2ba994ab1d84..024e51dd69375 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -33,7 +33,7 @@ struct mgmt_mesh_tx { struct mgmt_pending_cmd { struct list_head list; u16 opcode; - int index; + struct hci_dev *hdev; void *param; size_t param_len; struct sock *sk; @@ -54,7 +54,7 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, struct hci_dev *hdev); -void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, +void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data); struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 5a8ccc491b144..c560d84676696 100644 --- a/net/bluetooth/msft.c +++ 
b/net/bluetooth/msft.c @@ -989,7 +989,7 @@ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); - if (!test_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks)) { + if (!hci_test_quirk(hdev, HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER)) { if (!handle_data) return; mgmt_handle = handle_data->mgmt_handle; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 20ea7dba0a9a1..96250807b32b4 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -235,7 +235,7 @@ static int rfcomm_check_security(struct rfcomm_dlc *d) static void rfcomm_session_timeout(struct timer_list *t) { - struct rfcomm_session *s = from_timer(s, t, timer); + struct rfcomm_session *s = timer_container_of(s, t, timer); BT_DBG("session %p state %ld", s, s->state); @@ -260,7 +260,7 @@ static void rfcomm_session_clear_timer(struct rfcomm_session *s) /* ---- RFCOMM DLCs ---- */ static void rfcomm_dlc_timeout(struct timer_list *t) { - struct rfcomm_dlc *d = from_timer(d, t, timer); + struct rfcomm_dlc *d = timer_container_of(d, t, timer); BT_DBG("dlc %p state %ld", d, d->state); @@ -1962,7 +1962,8 @@ static void rfcomm_accept_connection(struct rfcomm_session *s) int err; /* Fast check for a new connection. - * Avoids unnesesary socket allocations. */ + * Avoids unnecessary socket allocations. + */ if (list_empty(&bt_sk(sock->sk)->accept_q)) return; diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 21a5b5535ebce..376ce6de84be8 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -438,7 +438,6 @@ static int __rfcomm_release_dev(void __user *arg) { struct rfcomm_dev_req req; struct rfcomm_dev *dev; - struct tty_struct *tty; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; @@ -464,11 +463,7 @@ static int __rfcomm_release_dev(void __user *arg) rfcomm_dlc_close(dev->dlc, 0); /* Shut down TTY synchronously before freeing rfcomm_dev */ - tty = tty_port_tty_get(&dev->port); - if (tty) { - tty_vhangup(tty); - tty_kref_put(tty); - } + tty_port_tty_vhangup(&dev->port); if (!test_bit(RFCOMM_TTY_OWNED, &dev->status)) tty_port_put(&dev->port); @@ -980,7 +975,7 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, baud = RFCOMM_RPN_BR_230400; break; default: - /* 9600 is standard accordinag to the RFCOMM specification */ + /* 9600 is standard according to the RFCOMM specification */ baud = RFCOMM_RPN_BR_9600; break; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 2945d27e75dce..d382d980fd9a7 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -338,7 +338,7 @@ static int sco_connect(struct sock *sk) hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, sco_pi(sk)->setting, &sco_pi(sk)->codec, - sk->sk_sndtimeo); + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -367,7 +367,7 @@ static int sco_connect(struct sock *sk) sk->sk_state = BT_CONNECTED; } else { sk->sk_state = BT_CONNECT; - sco_sock_set_timer(sk, sk->sk_sndtimeo); + sco_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } release_sock(sk); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 47f359f24d1fd..45512b2ba951c 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1379,7 +1379,7 @@ static void smp_timeout(struct work_struct *work) bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); - hci_disconnect(conn->hcon, HCI_ERROR_REMOTE_USER_TERM); + hci_disconnect(conn->hcon, HCI_ERROR_AUTH_FAILURE); } static struct 
smp_chan *smp_chan_create(struct l2cap_conn *conn) @@ -2977,8 +2977,25 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) if (code > SMP_CMD_MAX) goto drop; - if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) + if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) { + /* If there is a context and the command is not allowed consider + * it a failure so the session is cleanup properly. + */ + switch (code) { + case SMP_CMD_IDENT_INFO: + case SMP_CMD_IDENT_ADDR_INFO: + case SMP_CMD_SIGN_INFO: + /* 3.6.1. Key distribution and generation + * + * A device may reject a distributed key by sending the + * Pairing Failed command with the reason set to + * "Key Rejected". + */ + smp_failure(conn, SMP_KEY_REJECTED); + break; + } goto drop; + } /* If we don't have a context the only allowed commands are * pairing request and security request. @@ -3172,7 +3189,7 @@ static void smp_ready_cb(struct l2cap_chan *chan) /* No need to call l2cap_chan_hold() here since we already own * the reference taken in smp_new_conn_cb(). This is just the * first time that we tie it to a specific pointer. The code in - * l2cap_core.c ensures that there's no risk this function wont + * l2cap_core.c ensures that there's no risk this function won't * get called if smp_new_conn_cb was previously called. */ conn->smp = chan; diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h index 87a59ec2c9f02..c5da53dfab04f 100644 --- a/net/bluetooth/smp.h +++ b/net/bluetooth/smp.h @@ -138,6 +138,7 @@ struct smp_cmd_keypress_notify { #define SMP_NUMERIC_COMP_FAILED 0x0c #define SMP_BREDR_PAIRING_IN_PROGRESS 0x0d #define SMP_CROSS_TRANSP_NOT_ALLOWED 0x0e +#define SMP_KEY_REJECTED 0x0f #define SMP_MIN_ENC_KEY_SIZE 7 #define SMP_MAX_ENC_KEY_SIZE 16 diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index f71f67c6896b3..812457819b5ae 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -171,7 +171,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, } /* prog doesn't take the ownership of the reference from caller */ bpf_prog_inc(prog); - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog, + prog->expected_attach_type); op_idx = prog->expected_attach_type; err = bpf_struct_ops_prepare_trampoline(tlinks, link, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 7cb192cbd65f3..9728dbd4c66c5 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -569,6 +569,11 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } +int noinline bpf_fentry_test10(const void *a) +{ + return (long)a; +} + void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } @@ -699,7 +704,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 || bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 || bpf_fentry_test8(&arg) != 0 || - bpf_fentry_test9(&retval) != 0) + bpf_fentry_test9(&retval) != 0 || + bpf_fentry_test10((void *)0) != 0) goto out; break; case BPF_MODIFY_RETURN: @@ -1249,7 +1255,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, headroom -= ctx->data; } - max_data_sz = 4096 - headroom - tailroom; + max_data_sz = PAGE_SIZE - headroom - tailroom; if (size > max_data_sz) { /* disallow live data mode for jumbo frames */ if (do_live) diff --git a/net/bridge/br.c b/net/bridge/br.c index 
183fcb362f9e6..1885d0c315f02 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -74,9 +74,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v if (br->dev->addr_assign_type == NET_ADDR_SET) break; prechaddr_info = ptr; - err = dev_pre_changeaddr_notify(br->dev, - prechaddr_info->dev_addr, - extack); + err = netif_pre_changeaddr_notify(br->dev, + prechaddr_info->dev_addr, + extack); if (err) return notifier_from_errno(err); break; @@ -284,6 +284,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -302,6 +305,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -363,21 +368,20 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) clear_bit(opt, &br->options); } -static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit br_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { struct net_device *dev; - struct net *net; - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - for_each_netdev(net, dev) - if (netif_is_bridge_master(dev)) - br_dev_delete(dev, dev_to_kill); + ASSERT_RTNL_NET(net); + + for_each_netdev(net, dev) + if (netif_is_bridge_master(dev)) + br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { - .exit_batch_rtnl = br_net_exit_batch_rtnl, + .exit_rtnl = br_net_exit_rtnl, }; static const struct stp_proto br_stp_proto = { @@ -480,3 +484,4 @@ MODULE_LICENSE("GPL"); MODULE_VERSION(BR_VERSION); MODULE_ALIAS_RTNL_LINK("bridge"); MODULE_DESCRIPTION("Ethernet bridge driver"); +MODULE_IMPORT_NS("NETDEV_INTERNAL"); diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 115a23054a58d..1e2b51769eec8 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -160,6 +160,9 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { if (br_is_neigh_suppress_enabled(p, vid)) return; + if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + parp->ar_op == htons(ARPOP_REQUEST)) + return; if (parp->ar_op != htons(ARPOP_RREQUEST) && parp->ar_op != htons(ARPOP_RREPLY) && (ipv4_is_zeronet(sip) || sip == tip)) { @@ -410,6 +413,10 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (br_is_neigh_suppress_enabled(p, vid)) return; + if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) + return; + if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && !msg->icmph.icmp6_solicited) { /* prevent flooding to neigh suppress ports */ diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 2450690f98cfa..98c5b9c3145f3 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -668,7 +668,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, /* Ask for permission to use this MAC address now, even 
if we * don't end up choosing it below. */ - err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); + err = netif_pre_changeaddr_notify(br->dev, dev->dev_addr, + extack); if (err) goto err6; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 232133a0fd21c..5f6ac9bf15275 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -189,7 +189,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(brmctx, skb)) { + br_multicast_is_router(brmctx, skb) || + br->dev->flags & IFF_ALLMULTI) { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 722203b98ff74..400eb872b4032 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -144,6 +144,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; + if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED) + e->flags |= MDB_FLAGS_OFFLOAD_FAILED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, @@ -517,16 +519,17 @@ static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg) rtnl_mdb_nlmsg_pg_size(pg); } -void br_mdb_notify(struct net_device *dev, - struct net_bridge_mdb_entry *mp, - struct net_bridge_port_group *pg, - int type) +static void __br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type, bool notify_switchdev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - br_switchdev_mdb_notify(dev, mp, pg, type); + if (notify_switchdev) + br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) @@ -544,6 +547,21 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } +void br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type) +{ + __br_mdb_notify(dev, mp, pg, type, true); +} + +void br_mdb_flag_change_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg) +{ + __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false); +} + static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 1820f09ff59ce..3f24b4ee49c27 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -80,10 +80,10 @@ static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg, if (br_vlan_get_state(v) == state) return; - br_vlan_set_state(v, state); - if (v->vid == vg->pvid) br_vlan_set_pvid_state(vg, state); + + br_vlan_set_state(v, state); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index dcbf058de1e3b..1377f31b719cd 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -648,7 +648,7 @@ static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp) static void br_multicast_group_expired(struct timer_list *t) { - struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer); + struct net_bridge_mdb_entry *mp = timer_container_of(mp, t, timer); struct net_bridge *br = mp->br; spin_lock(&br->multicast_lock); @@ -856,7 +856,7 @@ static void 
br_multicast_find_del_pg(struct net_bridge *br, static void br_multicast_port_group_expired(struct timer_list *t) { - struct net_bridge_port_group *pg = from_timer(pg, t, timer); + struct net_bridge_port_group *pg = timer_container_of(pg, t, timer); struct net_bridge_group_src *src_ent; struct net_bridge *br = pg->key.port->br; struct hlist_node *tmp; @@ -1314,7 +1314,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br, static void br_multicast_group_src_expired(struct timer_list *t) { - struct net_bridge_group_src *src = from_timer(src, t, timer); + struct net_bridge_group_src *src = timer_container_of(src, t, timer); struct net_bridge_port_group *pg; struct net_bridge *br = src->br; @@ -1667,8 +1667,8 @@ out: static void br_ip4_multicast_router_expired(struct timer_list *t) { - struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, - ip4_mc_router_timer); + struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t, + ip4_mc_router_timer); br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist); } @@ -1676,8 +1676,8 @@ static void br_ip4_multicast_router_expired(struct timer_list *t) #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_router_expired(struct timer_list *t) { - struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, - ip6_mc_router_timer); + struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t, + ip6_mc_router_timer); br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist); } @@ -1713,8 +1713,8 @@ out: static void br_ip4_multicast_local_router_expired(struct timer_list *t) { - struct net_bridge_mcast *brmctx = from_timer(brmctx, t, - ip4_mc_router_timer); + struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, + ip4_mc_router_timer); br_multicast_local_router_expired(brmctx, t); } @@ -1722,8 +1722,8 @@ static void br_ip4_multicast_local_router_expired(struct timer_list *t) #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_local_router_expired(struct timer_list *t) { - struct net_bridge_mcast *brmctx = from_timer(brmctx, t, - ip6_mc_router_timer); + struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, + ip6_mc_router_timer); br_multicast_local_router_expired(brmctx, t); } @@ -1746,8 +1746,8 @@ out: static void br_ip4_multicast_querier_expired(struct timer_list *t) { - struct net_bridge_mcast *brmctx = from_timer(brmctx, t, - ip4_other_query.timer); + struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, + ip4_other_query.timer); br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query); } @@ -1755,8 +1755,8 @@ static void br_ip4_multicast_querier_expired(struct timer_list *t) #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_querier_expired(struct timer_list *t) { - struct net_bridge_mcast *brmctx = from_timer(brmctx, t, - ip6_other_query.timer); + struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, + ip6_other_query.timer); br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query); } @@ -1918,8 +1918,8 @@ out: static void br_ip4_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, - ip4_own_query.timer); + struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t, + ip4_own_query.timer); br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query); } @@ -1927,8 +1927,8 @@ static void br_ip4_multicast_port_query_expired(struct timer_list *t) #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_mcast_port *pmctx = 
from_timer(pmctx, t, - ip6_own_query.timer); + struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t, + ip6_own_query.timer); br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query); } @@ -1936,7 +1936,8 @@ static void br_ip6_multicast_port_query_expired(struct timer_list *t) static void br_multicast_port_group_rexmit(struct timer_list *t) { - struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer); + struct net_bridge_port_group *pg = timer_container_of(pg, t, + rexmit_timer); struct bridge_mcast_other_query *other_query = NULL; struct net_bridge *br = pg->key.port->br; struct net_bridge_mcast_port *pmctx; @@ -2014,10 +2015,19 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { + struct net_bridge *br = pmctx->port->br; + bool del = false; + #if IS_ENABLED(CONFIG_IPV6) timer_delete_sync(&pmctx->ip6_mc_router_timer); #endif timer_delete_sync(&pmctx->ip4_mc_router_timer); + + spin_lock_bh(&br->multicast_lock); + del |= br_ip6_multicast_rport_del(pmctx); + del |= br_ip4_multicast_rport_del(pmctx); + br_multicast_rport_del_notify(pmctx, del); + spin_unlock_bh(&br->multicast_lock); } int br_multicast_add_port(struct net_bridge_port *port) @@ -2061,7 +2071,7 @@ static void br_multicast_enable(struct bridge_mcast_own_query *query) { query->startup_sent = 0; - if (try_to_del_timer_sync(&query->timer) >= 0 || + if (timer_delete_sync_try(&query->timer) >= 0 || timer_delete(&query->timer)) mod_timer(&query->timer, jiffies); } @@ -2105,12 +2115,17 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) } } -void br_multicast_enable_port(struct net_bridge_port *port) +static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; spin_lock_bh(&br->multicast_lock); - __br_multicast_enable_port_ctx(&port->multicast_ctx); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + __br_multicast_enable_port_ctx(pmctx); spin_unlock_bh(&br->multicast_lock); } @@ -2137,11 +2152,67 @@ static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) br_multicast_rport_del_notify(pmctx, del); } +static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) +{ + struct net_bridge *br = pmctx->port->br; + + spin_lock_bh(&br->multicast_lock); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + + __br_multicast_disable_port_ctx(pmctx); + spin_unlock_bh(&br->multicast_lock); +} + +static void br_multicast_toggle_port(struct net_bridge_port *port, bool on) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + rcu_read_lock(); + vg = nbp_vlan_group_rcu(port); + if (!vg) { + rcu_read_unlock(); + return; + } + + /* iterate each vlan, toggle vlan multicast context */ + list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast_port *pmctx = + &vlan->port_mcast_ctx; + u8 state = br_vlan_get_state(vlan); + /* enable vlan multicast context when state is + * LEARNING or FORWARDING + */ + if (on && br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(pmctx); + else + 
br_multicast_disable_port_ctx(pmctx); + } + rcu_read_unlock(); + return; + } +#endif + /* toggle port multicast context when vlan snooping is disabled */ + if (on) + br_multicast_enable_port_ctx(&port->multicast_ctx); + else + br_multicast_disable_port_ctx(&port->multicast_ctx); +} + +void br_multicast_enable_port(struct net_bridge_port *port) +{ + br_multicast_toggle_port(port, true); +} + void br_multicast_disable_port(struct net_bridge_port *port) { - spin_lock_bh(&port->br->multicast_lock); - __br_multicast_disable_port_ctx(&port->multicast_ctx); - spin_unlock_bh(&port->br->multicast_lock); + br_multicast_toggle_port(port, false); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -3480,7 +3551,7 @@ static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : - try_to_del_timer_sync(&mp->timer) >= 0)) + timer_delete_sync_try(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; @@ -3488,7 +3559,7 @@ static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : - try_to_del_timer_sync(&p->timer) >= 0 && + timer_delete_sync_try(&p->timer) >= 0 && (brmctx->multicast_igmp_version == 2 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); @@ -3569,7 +3640,7 @@ static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : - try_to_del_timer_sync(&mp->timer) >= 0)) + timer_delete_sync_try(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; @@ -3577,7 +3648,7 @@ static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : - try_to_del_timer_sync(&p->timer) >= 0 && + timer_delete_sync_try(&p->timer) >= 0 && (brmctx->multicast_mld_version == 1 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); @@ -3649,7 +3720,7 @@ br_multicast_leave_group(struct net_bridge_mcast *brmctx, if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : - try_to_del_timer_sync(&p->timer) >= 0)) { + timer_delete_sync_try(&p->timer) >= 0)) { mod_timer(&p->timer, time); } @@ -3665,7 +3736,7 @@ br_multicast_leave_group(struct net_bridge_mcast *brmctx, if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : - try_to_del_timer_sync(&mp->timer) >= 0)) { + timer_delete_sync_try(&mp->timer) >= 0)) { mod_timer(&mp->timer, time); } @@ -3681,7 +3752,7 @@ br_multicast_leave_group(struct net_bridge_mcast *brmctx, if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? 
time_after(p->timer.expires, time) : - try_to_del_timer_sync(&p->timer) >= 0)) { + timer_delete_sync_try(&p->timer) >= 0)) { mod_timer(&p->timer, time); } @@ -3995,8 +4066,8 @@ out: static void br_ip4_multicast_query_expired(struct timer_list *t) { - struct net_bridge_mcast *brmctx = from_timer(brmctx, t, - ip4_own_query.timer); + struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, + ip4_own_query.timer); br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, &brmctx->ip4_querier); @@ -4005,8 +4076,8 @@ static void br_ip4_multicast_query_expired(struct timer_list *t) #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_expired(struct timer_list *t) { - struct net_bridge_mcast *brmctx = from_timer(brmctx, t, - ip6_own_query.timer); + struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, + ip6_own_query.timer); br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, &brmctx->ip6_querier); @@ -4211,6 +4282,32 @@ static void __br_multicast_stop(struct net_bridge_mcast *brmctx) #endif } +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + struct net_bridge *br; + + if (!br_vlan_should_use(v)) + return; + + if (br_vlan_is_master(v)) + return; + + br = v->port->br; + + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + + if (br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(&v->port_mcast_ctx); + + /* Multicast is not disabled for the vlan when it goes in + * blocking state because the timers will expire and stop by + * themselves without sending more queries. + */ +#endif +} + void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge *br; @@ -4304,9 +4401,9 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, __br_multicast_open(&br->multicast_ctx); list_for_each_entry(p, &br->port_list, list) { if (on) - br_multicast_disable_port(p); + br_multicast_disable_port_ctx(&p->multicast_ctx); else - br_multicast_enable_port(p); + br_multicast_enable_port_ctx(&p->multicast_ctx); } list_for_each_entry(vlan, &vg->vlan_list, vlist) diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c index c126aa4e75512..adfd74102019e 100644 --- a/net/bridge/br_multicast_eht.c +++ b/net/bridge/br_multicast_eht.c @@ -207,7 +207,9 @@ void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg) static void br_multicast_eht_set_entry_expired(struct timer_list *t) { - struct net_bridge_group_eht_set_entry *set_h = from_timer(set_h, t, timer); + struct net_bridge_group_eht_set_entry *set_h = timer_container_of(set_h, + t, + timer); struct net_bridge *br = set_h->br; spin_lock(&br->multicast_lock); @@ -223,8 +225,9 @@ out: static void br_multicast_eht_set_expired(struct timer_list *t) { - struct net_bridge_group_eht_set *eht_set = from_timer(eht_set, t, - timer); + struct net_bridge_group_eht_set *eht_set = timer_container_of(eht_set, + t, + timer); struct net_bridge *br = eht_set->br; spin_lock(&br->multicast_lock); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6e337937d0d7b..4e2d53b272210 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -479,7 +479,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, hdr->__ifi_pad = 0; hdr->ifi_type = dev->type; hdr->ifi_index = dev->ifindex; - hdr->ifi_flags = dev_get_flags(dev); + hdr->ifi_flags = netif_get_flags(dev); hdr->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || diff --git 
a/net/bridge/br_private.h b/net/bridge/br_private.h index 4715a8d6dc326..b159aae594c0b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -306,11 +306,12 @@ struct net_bridge_fdb_flush_desc { u16 vlan_id; }; -#define MDB_PG_FLAGS_PERMANENT BIT(0) -#define MDB_PG_FLAGS_OFFLOAD BIT(1) -#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) -#define MDB_PG_FLAGS_STAR_EXCL BIT(3) -#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_PERMANENT BIT(0) +#define MDB_PG_FLAGS_OFFLOAD BIT(1) +#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define MDB_PG_FLAGS_STAR_EXCL BIT(3) +#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_OFFLOAD_FAILED BIT(5) #define PG_SRC_ENT_LIMIT 32 @@ -483,6 +484,7 @@ enum net_bridge_opts { BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, + BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, }; struct net_bridge { @@ -1003,6 +1005,8 @@ int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); +void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg); void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, @@ -1052,6 +1056,7 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx); void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state); void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); @@ -1343,6 +1348,22 @@ br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); } + +static inline void +br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p, + bool offloaded) +{ + p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED); + p->flags |= (offloaded ? MDB_PG_FLAGS_OFFLOAD : + MDB_PG_FLAGS_OFFLOAD_FAILED); +} + +static inline bool +br_mdb_should_notify(const struct net_bridge *br, u8 changed_flags) +{ + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION) && + (changed_flags & MDB_PG_FLAGS_OFFLOAD_FAILED); +} #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, @@ -1502,6 +1523,11 @@ static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pm { } +static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, + u8 state) +{ +} + static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { @@ -1862,7 +1888,9 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts); -/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ +/* vlan state manipulation helpers using *_ONCE to annotate lock-free access, + * while br_vlan_set_state() may access data protected by multicast_lock. 
+ */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) { return READ_ONCE(v->state); @@ -1871,6 +1899,7 @@ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) { WRITE_ONCE(v->state, state); + br_multicast_update_vlan_mcast_ctx(v, state); } static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index 27bf1979b9099..e5d453305381b 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -29,7 +29,7 @@ static int br_is_designated_for_some_port(const struct net_bridge *br) static void br_hello_timer_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, hello_timer); + struct net_bridge *br = timer_container_of(br, t, hello_timer); br_debug(br, "hello timer expired\n"); spin_lock(&br->lock); @@ -45,7 +45,8 @@ static void br_hello_timer_expired(struct timer_list *t) static void br_message_age_timer_expired(struct timer_list *t) { - struct net_bridge_port *p = from_timer(p, t, message_age_timer); + struct net_bridge_port *p = timer_container_of(p, t, + message_age_timer); struct net_bridge *br = p->br; const bridge_id *id = &p->designated_bridge; int was_root; @@ -78,7 +79,8 @@ static void br_message_age_timer_expired(struct timer_list *t) static void br_forward_delay_timer_expired(struct timer_list *t) { - struct net_bridge_port *p = from_timer(p, t, forward_delay_timer); + struct net_bridge_port *p = timer_container_of(p, t, + forward_delay_timer); struct net_bridge *br = p->br; br_debug(br, "port %u(%s) forward delay timer\n", @@ -102,7 +104,7 @@ static void br_forward_delay_timer_expired(struct timer_list *t) static void br_tcn_timer_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, tcn_timer); + struct net_bridge *br = timer_container_of(br, t, tcn_timer); br_debug(br, "tcn timer expired\n"); spin_lock(&br->lock); @@ -116,7 +118,8 @@ static void br_tcn_timer_expired(struct timer_list *t) static void br_topology_change_timer_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, topology_change_timer); + struct net_bridge *br = timer_container_of(br, t, + topology_change_timer); br_debug(br, "topo change timer expired\n"); spin_lock(&br->lock); @@ -127,7 +130,7 @@ static void br_topology_change_timer_expired(struct timer_list *t) static void br_hold_timer_expired(struct timer_list *t) { - struct net_bridge_port *p = from_timer(p, t, hold_timer); + struct net_bridge_port *p = timer_container_of(p, t, hold_timer); br_debug(p->br, "port %u(%s) hold timer expired\n", (unsigned int) p->port_no, p->dev->name); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 7b41ee8740cbb..fe3f7bbe86ee6 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -17,6 +17,9 @@ static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p, if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) return false; + if (br_multicast_igmp_type(skb)) + return false; + return (p->flags & BR_TX_FWD_OFFLOAD) && (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom); } @@ -504,9 +507,10 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; + u8 old_flags; - if (err) - goto err; + if (err == -EOPNOTSUPP) + goto out_free; spin_lock_bh(&br->multicast_lock); mp = 
br_mdb_ip_get(br, &data->ip); @@ -516,11 +520,15 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri pp = &p->next) { if (p->key.port != port) continue; - p->flags |= MDB_PG_FLAGS_OFFLOAD; + + old_flags = p->flags; + br_multicast_set_pg_offload_flags(p, !err); + if (br_mdb_should_notify(br, old_flags ^ p->flags)) + br_mdb_flag_change_notify(br->dev, mp, p); } out: spin_unlock_bh(&br->multicast_lock); -err: +out_free: kfree(priv); } @@ -829,7 +837,7 @@ int br_switchdev_port_offload(struct net_bridge_port *p, struct netdev_phys_item_id ppid; int err; - err = dev_get_port_parent_id(dev, &ppid, false); + err = netif_get_port_parent_id(dev, &ppid, false); if (err) return err; diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index c1176a5e02c43..cb4855ed95007 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -1026,7 +1026,7 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, static const struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, - .read_new = brforward_read, + .read = brforward_read, }; /* diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index f16bbbbb94817..60f28e4fb5c0a 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -42,8 +42,8 @@ config NF_CONNTRACK_BRIDGE # old sockopt interface and eval loop config BRIDGE_NF_EBTABLES_LEGACY tristate "Legacy EBTABLES support" - depends on BRIDGE && NETFILTER_XTABLES - default n + depends on BRIDGE && NETFILTER_XTABLES_LEGACY + default n help Legacy ebtables packet/frame classifier. This is not needed if you are using ebtables over nftables @@ -65,7 +65,7 @@ if BRIDGE_NF_EBTABLES # config BRIDGE_EBT_BROUTE tristate "ebt: broute table support" - select BRIDGE_NF_EBTABLES_LEGACY + depends on BRIDGE_NF_EBTABLES_LEGACY help The ebtables broute table is used to define rules that decide between bridging and routing frames, giving Linux the functionality of a @@ -76,7 +76,7 @@ config BRIDGE_EBT_BROUTE config BRIDGE_EBT_T_FILTER tristate "ebt: filter table support" - select BRIDGE_NF_EBTABLES_LEGACY + depends on BRIDGE_NF_EBTABLES_LEGACY help The ebtables filter table is used to define frame filtering rules at local input, forwarding and local output. See the man page for @@ -86,7 +86,7 @@ config BRIDGE_EBT_T_FILTER config BRIDGE_EBT_T_NAT tristate "ebt: nat table support" - select BRIDGE_NF_EBTABLES_LEGACY + depends on BRIDGE_NF_EBTABLES_LEGACY help The ebtables nat table is used to define rules that alter the MAC source address (MAC SNAT) or the MAC destination address (MAC DNAT). 
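Many of the bridge hunks above convert timer callbacks from from_timer() to timer_container_of(). A minimal sketch of that callback pattern follows, using a hypothetical struct, field, and function names rather than any of the bridge objects:

#include <linux/timer.h>
#include <linux/spinlock.h>

struct example_applicant {
	spinlock_t lock;
	struct timer_list join_timer;
};

/* The callback receives the timer_list pointer; timer_container_of()
 * recovers the enclosing object from the embedded timer member.
 */
static void example_join_timer(struct timer_list *t)
{
	struct example_applicant *app = timer_container_of(app, t, join_timer);

	spin_lock(&app->lock);
	/* ... handle expiry ... */
	spin_unlock(&app->lock);
}

static void example_applicant_init(struct example_applicant *app)
{
	spin_lock_init(&app->lock);
	timer_setup(&app->join_timer, example_join_timer, 0);
	mod_timer(&app->join_timer, jiffies + HZ);
}

The conversion is mechanical: from_timer() and timer_container_of() take the same three arguments, so only the helper name changes in each hunk.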
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 816bb0fde718e..6482de4d87509 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -60,19 +60,19 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, struct ip_fraglist_iter iter; struct sk_buff *frag; - if (first_len - hlen > mtu || - skb_headroom(skb) < ll_rs) + if (first_len - hlen > mtu) goto blackhole; - if (skb_cloned(skb)) + if (skb_cloned(skb) || + skb_headroom(skb) < ll_rs) goto slow_path; skb_walk_frags(skb, frag) { - if (frag->len > mtu || - skb_headroom(frag) < hlen + ll_rs) + if (frag->len > mtu) goto blackhole; - if (skb_shared(frag)) + if (skb_shared(frag) || + skb_headroom(frag) < hlen + ll_rs) goto slow_path; } diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 20139fa1be1ff..06b604cf9d58c 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -351,17 +351,154 @@ int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer) return found; } +static int cfctrl_link_setup(struct cfctrl *cfctrl, struct cfpkt *pkt, u8 cmdrsp) +{ + u8 len; + u8 linkid = 0; + enum cfctrl_srv serv; + enum cfctrl_srv servtype; + u8 endpoint; + u8 physlinkid; + u8 prio; + u8 tmp; + u8 *cp; + int i; + struct cfctrl_link_param linkparam; + struct cfctrl_request_info rsp, *req; + + memset(&linkparam, 0, sizeof(linkparam)); + + tmp = cfpkt_extr_head_u8(pkt); + + serv = tmp & CFCTRL_SRV_MASK; + linkparam.linktype = serv; + + servtype = tmp >> 4; + linkparam.chtype = servtype; + + tmp = cfpkt_extr_head_u8(pkt); + physlinkid = tmp & 0x07; + prio = tmp >> 3; + + linkparam.priority = prio; + linkparam.phyid = physlinkid; + endpoint = cfpkt_extr_head_u8(pkt); + linkparam.endpoint = endpoint & 0x03; + + switch (serv) { + case CFCTRL_SRV_VEI: + case CFCTRL_SRV_DBG: + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + linkid = cfpkt_extr_head_u8(pkt); + break; + case CFCTRL_SRV_VIDEO: + tmp = cfpkt_extr_head_u8(pkt); + linkparam.u.video.connid = tmp; + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + linkid = cfpkt_extr_head_u8(pkt); + break; + + case CFCTRL_SRV_DATAGRAM: + linkparam.u.datagram.connid = cfpkt_extr_head_u32(pkt); + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + linkid = cfpkt_extr_head_u8(pkt); + break; + case CFCTRL_SRV_RFM: + /* Construct a frame, convert + * DatagramConnectionID + * to network format long and copy it out... + */ + linkparam.u.rfm.connid = cfpkt_extr_head_u32(pkt); + cp = (u8 *) linkparam.u.rfm.volume; + for (tmp = cfpkt_extr_head_u8(pkt); + cfpkt_more(pkt) && tmp != '\0'; + tmp = cfpkt_extr_head_u8(pkt)) + *cp++ = tmp; + *cp = '\0'; + + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + linkid = cfpkt_extr_head_u8(pkt); + + break; + case CFCTRL_SRV_UTIL: + /* Construct a frame, convert + * DatagramConnectionID + * to network format long and copy it out... 
+ */ + /* Fifosize KB */ + linkparam.u.utility.fifosize_kb = cfpkt_extr_head_u16(pkt); + /* Fifosize bufs */ + linkparam.u.utility.fifosize_bufs = cfpkt_extr_head_u16(pkt); + /* name */ + cp = (u8 *) linkparam.u.utility.name; + caif_assert(sizeof(linkparam.u.utility.name) + >= UTILITY_NAME_LENGTH); + for (i = 0; i < UTILITY_NAME_LENGTH && cfpkt_more(pkt); i++) { + tmp = cfpkt_extr_head_u8(pkt); + *cp++ = tmp; + } + /* Length */ + len = cfpkt_extr_head_u8(pkt); + linkparam.u.utility.paramlen = len; + /* Param Data */ + cp = linkparam.u.utility.params; + while (cfpkt_more(pkt) && len--) { + tmp = cfpkt_extr_head_u8(pkt); + *cp++ = tmp; + } + if (CFCTRL_ERR_BIT & cmdrsp) + break; + /* Link ID */ + linkid = cfpkt_extr_head_u8(pkt); + /* Length */ + len = cfpkt_extr_head_u8(pkt); + /* Param Data */ + cfpkt_extr_head(pkt, NULL, len); + break; + default: + pr_warn("Request setup, invalid type (%d)\n", serv); + return -1; + } + + rsp.cmd = CFCTRL_CMD_LINK_SETUP; + rsp.param = linkparam; + spin_lock_bh(&cfctrl->info_list_lock); + req = cfctrl_remove_req(cfctrl, &rsp); + + if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) || + cfpkt_erroneous(pkt)) { + pr_err("Invalid O/E bit or parse error " + "on CAIF control channel\n"); + cfctrl->res.reject_rsp(cfctrl->serv.layer.up, 0, + req ? req->client_layer : NULL); + } else { + cfctrl->res.linksetup_rsp(cfctrl->serv.layer.up, linkid, + serv, physlinkid, + req ? req->client_layer : NULL); + } + + kfree(req); + + spin_unlock_bh(&cfctrl->info_list_lock); + + return 0; +} + static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) { u8 cmdrsp; u8 cmd; - int ret = -1; - u8 len; - u8 param[255]; + int ret = 0; u8 linkid = 0; struct cfctrl *cfctrl = container_obj(layer); - struct cfctrl_request_info rsp, *req; - cmdrsp = cfpkt_extr_head_u8(pkt); cmd = cmdrsp & CFCTRL_CMD_MASK; @@ -374,150 +511,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) switch (cmd) { case CFCTRL_CMD_LINK_SETUP: - { - enum cfctrl_srv serv; - enum cfctrl_srv servtype; - u8 endpoint; - u8 physlinkid; - u8 prio; - u8 tmp; - u8 *cp; - int i; - struct cfctrl_link_param linkparam; - memset(&linkparam, 0, sizeof(linkparam)); - - tmp = cfpkt_extr_head_u8(pkt); - - serv = tmp & CFCTRL_SRV_MASK; - linkparam.linktype = serv; - - servtype = tmp >> 4; - linkparam.chtype = servtype; - - tmp = cfpkt_extr_head_u8(pkt); - physlinkid = tmp & 0x07; - prio = tmp >> 3; - - linkparam.priority = prio; - linkparam.phyid = physlinkid; - endpoint = cfpkt_extr_head_u8(pkt); - linkparam.endpoint = endpoint & 0x03; - - switch (serv) { - case CFCTRL_SRV_VEI: - case CFCTRL_SRV_DBG: - if (CFCTRL_ERR_BIT & cmdrsp) - break; - /* Link ID */ - linkid = cfpkt_extr_head_u8(pkt); - break; - case CFCTRL_SRV_VIDEO: - tmp = cfpkt_extr_head_u8(pkt); - linkparam.u.video.connid = tmp; - if (CFCTRL_ERR_BIT & cmdrsp) - break; - /* Link ID */ - linkid = cfpkt_extr_head_u8(pkt); - break; - - case CFCTRL_SRV_DATAGRAM: - linkparam.u.datagram.connid = - cfpkt_extr_head_u32(pkt); - if (CFCTRL_ERR_BIT & cmdrsp) - break; - /* Link ID */ - linkid = cfpkt_extr_head_u8(pkt); - break; - case CFCTRL_SRV_RFM: - /* Construct a frame, convert - * DatagramConnectionID - * to network format long and copy it out... 
- */ - linkparam.u.rfm.connid = - cfpkt_extr_head_u32(pkt); - cp = (u8 *) linkparam.u.rfm.volume; - for (tmp = cfpkt_extr_head_u8(pkt); - cfpkt_more(pkt) && tmp != '\0'; - tmp = cfpkt_extr_head_u8(pkt)) - *cp++ = tmp; - *cp = '\0'; - - if (CFCTRL_ERR_BIT & cmdrsp) - break; - /* Link ID */ - linkid = cfpkt_extr_head_u8(pkt); - - break; - case CFCTRL_SRV_UTIL: - /* Construct a frame, convert - * DatagramConnectionID - * to network format long and copy it out... - */ - /* Fifosize KB */ - linkparam.u.utility.fifosize_kb = - cfpkt_extr_head_u16(pkt); - /* Fifosize bufs */ - linkparam.u.utility.fifosize_bufs = - cfpkt_extr_head_u16(pkt); - /* name */ - cp = (u8 *) linkparam.u.utility.name; - caif_assert(sizeof(linkparam.u.utility.name) - >= UTILITY_NAME_LENGTH); - for (i = 0; - i < UTILITY_NAME_LENGTH - && cfpkt_more(pkt); i++) { - tmp = cfpkt_extr_head_u8(pkt); - *cp++ = tmp; - } - /* Length */ - len = cfpkt_extr_head_u8(pkt); - linkparam.u.utility.paramlen = len; - /* Param Data */ - cp = linkparam.u.utility.params; - while (cfpkt_more(pkt) && len--) { - tmp = cfpkt_extr_head_u8(pkt); - *cp++ = tmp; - } - if (CFCTRL_ERR_BIT & cmdrsp) - break; - /* Link ID */ - linkid = cfpkt_extr_head_u8(pkt); - /* Length */ - len = cfpkt_extr_head_u8(pkt); - /* Param Data */ - cfpkt_extr_head(pkt, &param, len); - break; - default: - pr_warn("Request setup, invalid type (%d)\n", - serv); - goto error; - } - - rsp.cmd = cmd; - rsp.param = linkparam; - spin_lock_bh(&cfctrl->info_list_lock); - req = cfctrl_remove_req(cfctrl, &rsp); - - if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) || - cfpkt_erroneous(pkt)) { - pr_err("Invalid O/E bit or parse error " - "on CAIF control channel\n"); - cfctrl->res.reject_rsp(cfctrl->serv.layer.up, - 0, - req ? req->client_layer - : NULL); - } else { - cfctrl->res.linksetup_rsp(cfctrl->serv. - layer.up, linkid, - serv, physlinkid, - req ? 
req-> - client_layer : NULL); - } - - kfree(req); - - spin_unlock_bh(&cfctrl->info_list_lock); - } + ret = cfctrl_link_setup(cfctrl, pkt, cmdrsp); break; case CFCTRL_CMD_LINK_DESTROY: linkid = cfpkt_extr_head_u8(pkt); @@ -544,9 +538,9 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) break; default: pr_err("Unrecognized Control Frame\n"); + ret = -1; goto error; } - ret = 0; error: cfpkt_destroy(pkt); return ret; diff --git a/net/can/af_can.c b/net/can/af_can.c index 4aab7033c9330..b2387a46794a5 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -683,7 +683,7 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_CAN_RX_INVALID_FRAME); return NET_RX_DROP; } @@ -698,7 +698,7 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_CANFD_RX_INVALID_FRAME); return NET_RX_DROP; } @@ -713,7 +713,7 @@ static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_CANXL_RX_INVALID_FRAME); return NET_RX_DROP; } diff --git a/net/can/bcm.c b/net/can/bcm.c index 6bc1cc4c94c5e..5e690a2377e48 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -359,6 +359,7 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, unsigned int datalen = head->nframes * op->cfsiz; int err; unsigned int *pflags; + enum skb_drop_reason reason; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); if (!skb) @@ -413,11 +414,11 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, addr->can_family = AF_CAN; addr->can_ifindex = op->rx_ifindex; - err = sock_queue_rcv_skb(sk, skb); + err = sock_queue_rcv_skb_reason(sk, skb, &reason); if (err < 0) { struct bcm_sock *bo = bcm_sk(sk); - kfree_skb(skb); + sk_skb_reason_drop(sk, skb, reason); /* don't care about overflows in this statistic */ bo->dropped_usr_msgs++; } diff --git a/net/can/isotp.c b/net/can/isotp.c index 1efa377f002ed..dee1412b3c9c1 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -278,6 +278,7 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) { struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; + enum skb_drop_reason reason; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); @@ -285,8 +286,8 @@ static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; - if (sock_queue_rcv_skb(sk, skb) < 0) - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) + sk_skb_reason_drop(sk, skb, reason); } static u8 padlen(u8 datalen) diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 6fefe7a687611..3d8b588822f9d 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -311,6 +311,7 @@ static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; + enum skb_drop_reason reason; struct sk_buff *skb; if (oskb->sk == &jsk->sk) @@ -331,8 +332,8 @@ static void j1939_sk_recv_one(struct j1939_sock *jsk, struct 
sk_buff *oskb) if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; - if (sock_queue_rcv_skb(&jsk->sk, skb) < 0) - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(&jsk->sk, skb, &reason) < 0) + sk_skb_reason_drop(&jsk->sk, skb, reason); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) diff --git a/net/can/proc.c b/net/can/proc.c index 25fdf060e30d0..0938bf7dd646a 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -114,7 +114,7 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, void can_stat_update(struct timer_list *t) { - struct net *net = from_timer(net, t, can.stattimer); + struct net *net = timer_container_of(net, t, can.stattimer); struct can_pkg_stats *pkg_stats = net->can.pkg_stats; unsigned long j = jiffies; /* snapshot */ diff --git a/net/can/raw.c b/net/can/raw.c index 020f21430b1d8..76b867d21def2 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -129,6 +129,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data) { struct sock *sk = (struct sock *)data; struct raw_sock *ro = raw_sk(sk); + enum skb_drop_reason reason; struct sockaddr_can *addr; struct sk_buff *skb; unsigned int *pflags; @@ -205,8 +206,8 @@ static void raw_rcv(struct sk_buff *oskb, void *data) if (oskb->sk == sk) *pflags |= MSG_CONFIRM; - if (sock_queue_rcv_skb(sk, skb) < 0) - kfree_skb(skb); + if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) + sk_skb_reason_drop(sk, skb, reason); } static int raw_enable_filters(struct net *net, struct net_device *dev, diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index bd608ffa06279..5483b4eed94e1 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -793,8 +793,8 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_x, secure mode */ } -static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs, - int kvec_cnt, u8 *hmac) +static int ceph_hmac_sha256(struct ceph_connection *con, + const struct kvec *kvecs, int kvec_cnt, u8 *hmac) { SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ int ret; @@ -1462,8 +1462,8 @@ static int prepare_auth_signature(struct ceph_connection *con) if (!buf) return -ENOMEM; - ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, - CTRL_BODY(buf)); + ret = ceph_hmac_sha256(con, con->v2.in_sign_kvecs, + con->v2.in_sign_kvec_cnt, CTRL_BODY(buf)); if (ret) return ret; @@ -2460,8 +2460,8 @@ static int process_auth_signature(struct ceph_connection *con, return -EINVAL; } - ret = hmac_sha256(con, con->v2.out_sign_kvecs, - con->v2.out_sign_kvec_cnt, hmac); + ret = ceph_hmac_sha256(con, con->v2.out_sign_kvecs, + con->v2.out_sign_kvec_cnt, hmac); if (ret) return ret; diff --git a/net/core/datagram.c b/net/core/datagram.c index f0693707aece4..94cc4705e91da 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -52,6 +52,7 @@ #include <linux/pagemap.h> #include <linux/iov_iter.h> #include <linux/indirect_call_wrapper.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -61,7 +62,8 @@ #include <net/tcp_states.h> #include <trace/events/skb.h> #include <net/busy_poll.h> -#include <crypto/hash.h> + +#include "devmem.h" /* * Is a socket 'connection oriented' ? 
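The CAN receive paths above (bcm, isotp, j1939 and raw) all move from a bare kfree_skb() to the drop-reason variants. A small sketch of that pattern, with a hypothetical helper name; it assumes only the two helpers already used in those hunks, sock_queue_rcv_skb_reason() and sk_skb_reason_drop():

#include <linux/skbuff.h>
#include <net/sock.h>

/* Hypothetical receive helper: queue the skb to the socket and, on
 * failure, free it with the specific reason filled in by
 * sock_queue_rcv_skb_reason(), so drop monitors can see why the
 * packet was lost instead of a generic NOT_SPECIFIED reason.
 */
static void example_rcv_to_sock(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;

	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
		sk_skb_reason_drop(sk, skb, reason);
}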
@@ -163,8 +165,7 @@ done: return skb; } -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) @@ -261,7 +262,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) @@ -482,41 +483,37 @@ short_copy: return 0; } -static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, - struct iov_iter *i) +#ifdef CONFIG_NET_CRC32C +static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, + void *_crcp, struct iov_iter *i) { -#ifdef CONFIG_CRYPTO_HASH - struct ahash_request *hash = hashp; - struct scatterlist sg; + u32 *crcp = _crcp; size_t copied; copied = copy_to_iter(addr, bytes, i); - sg_init_one(&sg, addr, copied); - ahash_request_set_crypt(hash, &sg, NULL, copied); - crypto_ahash_update(hash); + *crcp = crc32c(*crcp, addr, copied); return copied; -#else - return 0; -#endif } /** - * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator - * and update a hash. + * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator + * and update a CRC32C value. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec - * @hash: hash request to update + * @crcp: pointer to CRC32C value to update + * + * Return: 0 on success, -EFAULT if there was a fault during copy. */ -int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, - struct ahash_request *hash) +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp) { return __skb_datagram_iter(skb, offset, to, len, true, - hash_and_copy_to_iter, hash); + crc32c_and_copy_to_iter, crcp); } -EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); +EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter); +#endif /* CONFIG_NET_CRC32C */ static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) @@ -692,9 +689,50 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, return 0; } +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. 
+ */ + if (iov_iter_type(from) != ITER_IOVEC && + iov_iter_type(from) != ITER_UBUF) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length) + size_t length, + struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; @@ -702,6 +740,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); + else if (binding) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); @@ -735,7 +775,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/dev.c b/net/core/dev.c index 0d891634c6927..68dc47d7e7004 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -462,7 +462,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -DEFINE_PER_CPU(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #ifdef CONFIG_LOCKDEP /* @@ -828,7 +830,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); if (!dev) return NULL; @@ -1039,10 +1041,11 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. 
*/ -struct net_device *__netdev_put_lock(struct net_device *dev) +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) { netdev_lock(dev); - if (dev->reg_state > NETREG_REGISTERED) { + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { netdev_unlock(dev); dev_put(dev); return NULL; @@ -1051,6 +1054,20 @@ struct net_device *__netdev_put_lock(struct net_device *dev) return dev; } +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace @@ -1070,7 +1087,19 @@ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) if (!dev) return NULL; - return __netdev_put_lock(dev); + return __netdev_put_lock(dev, net); +} + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); } struct net_device * @@ -1090,7 +1119,32 @@ netdev_xa_find_lock(struct net *net, struct net_device *dev, dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); if (dev) return dev; @@ -1213,33 +1267,32 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) EXPORT_SYMBOL(dev_getfirstbyhwtype); /** - * __dev_get_by_flags - find any device with given flags - * @net: the applicable net namespace - * @if_flags: IFF_* values - * @mask: bitmask of bits in if_flags to check + * netdev_get_by_flags_rcu - find any device with given flags + * @net: the applicable net namespace + * @tracker: tracking object for the acquired reference + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check * - * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. Must be called inside - * rtnl_lock(), and result refcount is unchanged. + * Search for any interface with the given flags. + * + * Context: rcu_read_lock() must be held. + * Returns: NULL if a device is not found or a pointer to the device. 
*/ - -struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, - unsigned short mask) +struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, + unsigned short if_flags, unsigned short mask) { - struct net_device *dev, *ret; - - ASSERT_RTNL(); + struct net_device *dev; - ret = NULL; - for_each_netdev(net, dev) { - if (((dev->flags ^ if_flags) & mask) == 0) { - ret = dev; - break; + for_each_netdev_rcu(net, dev) { + if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) { + netdev_hold(dev, tracker, GFP_ATOMIC); + return dev; } } - return ret; + + return NULL; } -EXPORT_SYMBOL(__dev_get_by_flags); +EXPORT_IPV6_MOD(netdev_get_by_flags_rcu); /** * dev_valid_name - check if name is okay for network device @@ -1715,7 +1768,7 @@ static void __dev_close(struct net_device *dev) list_del(&single); } -void dev_close_many(struct list_head *head, bool unlink) +void netif_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; @@ -1733,7 +1786,7 @@ void dev_close_many(struct list_head *head, bool unlink) list_del_init(&dev->close_list); } } -EXPORT_SYMBOL(dev_close_many); +EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL"); void netif_close(struct net_device *dev) { @@ -1741,7 +1794,7 @@ void netif_close(struct net_device *dev) LIST_HEAD(single); list_add(&dev->close_list, &single); - dev_close_many(&single, true); + netif_close_many(&single, true); list_del(&single); } } @@ -3125,7 +3178,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { - ASSERT_RTNL(); netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, @@ -3175,7 +3227,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { - ASSERT_RTNL(); netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, @@ -3542,9 +3593,10 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +#ifdef CONFIG_NET_CRC32C int skb_crc32c_csum_help(struct sk_buff *skb) { - __le32 crc32c_csum; + u32 crc; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -3572,15 +3624,14 @@ int skb_crc32c_csum_help(struct sk_buff *skb) if (ret) goto out; - crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, - skb->len - start, ~(__u32)0, - crc32c_csum_stub)); - *(__le32 *)(skb->data + offset) = crc32c_csum; + crc = ~skb_crc32c(skb, start, skb->len - start, ~0); + *(__le32 *)(skb->data + offset) = cpu_to_le32(crc); skb_reset_csum_not_inet(skb); out: return ret; } EXPORT_SYMBOL(skb_crc32c_csum_help); +#endif /* CONFIG_NET_CRC32C */ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { @@ -3844,12 +3895,42 @@ sw_checksum: } EXPORT_SYMBOL(skb_csum_hwoffload_help); +static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *shinfo; + struct net_iov *niov; + + if (likely(skb_frags_readable(skb))) + goto out; + + if (!dev->netmem_tx) + goto out_free; + + shinfo = skb_shinfo(skb); + + if (shinfo->nr_frags > 0) { + niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); + if (net_is_devmem_iov(niov) && + net_devmem_iov_binding(niov)->dev != dev) + goto out_free; + } + +out: + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { 
netdev_features_t features; - if (!skb_frags_readable(skb)) - goto out_kfree_skb; + skb = validate_xmit_unreadable_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); @@ -3944,7 +4025,10 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) unsigned int hdr_len; /* mac layer + network layer */ - hdr_len = skb_transport_offset(skb); + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { @@ -4714,7 +4798,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in - * napi_enable()/dev_set_threaded(). + * napi_enable()/netif_set_threaded(). * Use READ_ONCE() to guarantee a complete * read on napi->thread. Only call * wake_up_process() when it's not NULL. @@ -4731,6 +4815,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, } use_local_napi: + DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list)); list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() @@ -4946,7 +5031,8 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ @@ -5031,7 +5117,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; @@ -5042,7 +5128,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } @@ -5238,7 +5325,10 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) struct sk_buff *skb = *pskb; int err, hroom, troom; - if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) return 0; /* In case we have to go down the path and also linearize, @@ -5659,6 +5749,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { + enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO; struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; @@ -5750,8 +5841,10 @@ skip_taps: #endif skb_reset_redirect(skb); skip_classify: - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) { + drop_reason = SKB_DROP_REASON_PFMEMALLOC; goto drop; + } if (skb_vlan_tag_present(skb)) { if (pt_prev) { @@ -5849,8 +5942,6 @@ check_vlan_id: } if (pt_prev) { - if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) - goto drop; *ppt_prev = pt_prev; } else 
{ drop: @@ -5858,7 +5949,8 @@ drop: dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); - kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); + + kfree_skb_reason(skb, drop_reason); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ @@ -6486,8 +6578,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) * it, we need to bound somehow the time packets are kept in * the GRO layer. */ - gro_flush(&n->gro, !!timeout); - gro_normal_list(&n->gro); + gro_flush_normal(&n->gro, !!timeout); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6557,8 +6648,7 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) } /* Flush too old packets. If HZ < 1000, flush all packets */ - gro_flush(&napi->gro, HZ >= 1000); - gro_normal_list(&napi->gro); + gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -6836,22 +6926,89 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -int dev_set_threaded(struct net_device *dev, bool threaded) +static void napi_stop_kthread(struct napi_struct *napi) +{ + unsigned long val, new; + + /* Wait until the napi STATE_THREADED is unset. */ + while (true) { + val = READ_ONCE(napi->state); + + /* If napi kthread own this napi or the napi is idle, + * STATE_THREADED can be unset here. + */ + if ((val & NAPIF_STATE_SCHED_THREADED) || + !(val & NAPIF_STATE_SCHED)) { + new = val & (~NAPIF_STATE_THREADED); + } else { + msleep(20); + continue; + } + + if (try_cmpxchg(&napi->state, &val, new)) + break; + } + + /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by + * the kthread. + */ + while (true) { + if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state)) + break; + + msleep(20); + } + + kthread_stop(napi->thread); + napi->thread = NULL; +} + +int napi_set_threaded(struct napi_struct *napi, + enum netdev_napi_threaded threaded) +{ + if (threaded) { + if (!napi->thread) { + int err = napi_kthread_create(napi); + + if (err) + return err; + } + } + + if (napi->config) + napi->config->threaded = threaded; + + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + if (!threaded && napi->thread) { + napi_stop_kthread(napi); + } else { + /* Make sure kthread is created before THREADED bit is set. */ + smp_mb__before_atomic(); + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + } + + return 0; +} + +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) { struct napi_struct *napi; int err = 0; netdev_assert_locked_or_invisible(dev); - if (dev->threaded == threaded) - return 0; - if (threaded) { list_for_each_entry(napi, &dev->napi_list, dev_list) { if (!napi->thread) { err = napi_kthread_create(napi); if (err) { - threaded = false; + threaded = NETDEV_NAPI_THREADED_DISABLED; break; } } @@ -6860,23 +7017,29 @@ int dev_set_threaded(struct net_device *dev, bool threaded) WRITE_ONCE(dev->threaded, threaded); - /* Make sure kthread is created before THREADED bit - * is set. 
- */ - smp_mb__before_atomic(); - - /* Setting/unsetting threaded mode on a napi might not immediately - * take effect, if the current napi instance is actively being - * polled. In this case, the switch between threaded mode and - * softirq mode will happen in the next round of napi_schedule(). - * This should not cause hiccups/stalls to the live traffic. - */ + /* The error should not occur as the kthreads are already created. */ list_for_each_entry(napi, &dev->napi_list, dev_list) - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + WARN_ON_ONCE(napi_set_threaded(napi, threaded)); return err; } -EXPORT_SYMBOL(dev_set_threaded); + +/** + * netif_threaded_enable() - enable threaded NAPIs + * @dev: net_device instance + * + * Enable threaded mode for the NAPI instances of the device. This may be useful + * for devices where multiple NAPI instances get scheduled by a single + * interrupt. Threaded NAPI allows moving the NAPI processing to cores other + * than the core where IRQ is mapped. + * + * This function should be called before @dev is registered. + */ +void netif_threaded_enable(struct net_device *dev) +{ + WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED)); +} +EXPORT_SYMBOL(netif_threaded_enable); /** * netif_queue_set_napi - Associate queue with the napi @@ -7092,6 +7255,8 @@ static void napi_restore_config(struct napi_struct *n) napi_hash_add(n); n->config->napi_id = n->napi_id; } + + WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded)); } static void napi_save_config(struct napi_struct *n) @@ -7189,7 +7354,7 @@ void netif_napi_add_weight_locked(struct net_device *dev, * threaded mode will not be enabled in napi_enable(). */ if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = false; + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); @@ -7358,8 +7523,7 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) } /* Flush too old packets. If HZ < 1000, flush all packets */ - gro_flush(&n->gro, HZ >= 1000); - gro_normal_list(&n->gro); + gro_flush_normal(&n->gro, HZ >= 1000); /* Some drivers may have called napi_schedule * prior to exhausting their budget. @@ -7387,9 +7551,14 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) work = __napi_poll(n, &do_repoll); - if (do_repoll) + if (do_repoll) { +#if defined(CONFIG_DEBUG_NET) + if (unlikely(!napi_is_scheduled(n))) + pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n", + n->dev->name, n->poll); +#endif list_add_tail(&n->poll_list, repoll); - + } netpoll_poll_unlock(have); return work; @@ -7515,7 +7684,8 @@ start: */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { - sd->time_squeeze++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } } @@ -9188,8 +9358,16 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } - if (notify) + if (notify) { + /* The ops lock is only required to ensure consistent locking + * for `NETDEV_CHANGE` notifiers. This function is sometimes + * called without the lock, even for devices that are ops + * locked, such as in `dev_uc_sync_multiple` when using + * bonding or teaming. 
+ */ + netdev_ops_assert_locked(dev); __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); + } return 0; } @@ -9283,12 +9461,12 @@ void dev_set_rx_mode(struct net_device *dev) } /** - * dev_get_flags - get flags reported to userspace - * @dev: device + * netif_get_flags() - get flags reported to userspace + * @dev: device * - * Get the combination of flag bits exported through APIs to userspace. + * Get the combination of flag bits exported through APIs to userspace. */ -unsigned int dev_get_flags(const struct net_device *dev) +unsigned int netif_get_flags(const struct net_device *dev) { unsigned int flags; @@ -9311,7 +9489,7 @@ unsigned int dev_get_flags(const struct net_device *dev) return flags; } -EXPORT_SYMBOL(dev_get_flags); +EXPORT_SYMBOL(netif_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack) @@ -9423,7 +9601,7 @@ int netif_change_flags(struct net_device *dev, unsigned int flags, return ret; } -int __dev_set_mtu(struct net_device *dev, int new_mtu) +int __netif_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; @@ -9434,7 +9612,7 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu) WRITE_ONCE(dev->mtu, new_mtu); return 0; } -EXPORT_SYMBOL(__dev_set_mtu); +EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL"); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) @@ -9453,18 +9631,22 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu, } /** - * netif_set_mtu_ext - Change maximum transfer unit - * @dev: device - * @new_mtu: new transfer unit - * @extack: netlink extended ack + * netif_set_mtu_ext() - Change maximum transfer unit + * @dev: device + * @new_mtu: new transfer unit + * @extack: netlink extended ack + * + * Change the maximum transfer size of the network device. * - * Change the maximum transfer size of the network device. + * Return: 0 on success, -errno on failure. */ int netif_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; + netdev_ops_assert_locked(dev); + if (new_mtu == dev->mtu) return 0; @@ -9481,7 +9663,7 @@ int netif_set_mtu_ext(struct net_device *dev, int new_mtu, return err; orig_mtu = dev->mtu; - err = __dev_set_mtu(dev, new_mtu); + err = __netif_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, @@ -9491,7 +9673,7 @@ int netif_set_mtu_ext(struct net_device *dev, int new_mtu, /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. */ - __dev_set_mtu(dev, orig_mtu); + __netif_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } @@ -9545,13 +9727,15 @@ void netif_set_group(struct net_device *dev, int new_group) } /** - * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR. - * @dev: device - * @addr: new address - * @extack: netlink extended ack + * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR. + * @dev: device + * @addr: new address + * @extack: netlink extended ack + * + * Return: 0 on success, -errno on failure. 
*/ -int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, - struct netlink_ext_ack *extack) +int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr, + struct netlink_ext_ack *extack) { struct netdev_notifier_pre_changeaddr_info info = { .info.dev = dev, @@ -9563,9 +9747,9 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info); return notifier_to_errno(rc); } -EXPORT_SYMBOL(dev_pre_changeaddr_notify); +EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL"); -int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; @@ -9573,15 +9757,15 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; - if (sa->sa_family != dev->type) + if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + err = netif_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; - if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { - err = ops->ndo_set_mac_address(dev, sa); + if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { + err = ops->ndo_set_mac_address(dev, ss); if (err) return err; } @@ -9593,7 +9777,8 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa, DECLARE_RWSEM(dev_addr_sem); -int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) +/* "sa" is a true struct sockaddr with limited "sa_data" member. */ +int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); struct net_device *dev; @@ -9619,7 +9804,7 @@ unlock: up_read(&dev_addr_sem); return ret; } -EXPORT_SYMBOL(dev_get_mac_address); +EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL"); int netif_change_carrier(struct net_device *dev, bool new_carrier) { @@ -9672,16 +9857,17 @@ int dev_get_phys_port_name(struct net_device *dev, } /** - * dev_get_port_parent_id - Get the device's port parent identifier - * @dev: network device - * @ppid: pointer to a storage for the port's parent identifier - * @recurse: allow/disallow recursion to lower devices + * netif_get_port_parent_id() - Get the device's port parent identifier + * @dev: network device + * @ppid: pointer to a storage for the port's parent identifier + * @recurse: allow/disallow recursion to lower devices + * + * Get the devices's port parent identifier. * - * Get the devices's port parent identifier + * Return: 0 on success, -errno on failure. 
*/ -int dev_get_port_parent_id(struct net_device *dev, - struct netdev_phys_item_id *ppid, - bool recurse) +int netif_get_port_parent_id(struct net_device *dev, + struct netdev_phys_item_id *ppid, bool recurse) { const struct net_device_ops *ops = dev->netdev_ops; struct netdev_phys_item_id first = { }; @@ -9700,7 +9886,7 @@ int dev_get_port_parent_id(struct net_device *dev, return err; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = dev_get_port_parent_id(lower_dev, ppid, true); + err = netif_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) @@ -9711,7 +9897,7 @@ int dev_get_port_parent_id(struct net_device *dev, return err; } -EXPORT_SYMBOL(dev_get_port_parent_id); +EXPORT_SYMBOL(netif_get_port_parent_id); /** * netdev_port_same_parent_id - Indicate if two network devices have @@ -9724,8 +9910,8 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) struct netdev_phys_item_id a_id = { }; struct netdev_phys_item_id b_id = { }; - if (dev_get_port_parent_id(a, &a_id, true) || - dev_get_port_parent_id(b, &b_id, true)) + if (netif_get_port_parent_id(a, &a_id, true) || + netif_get_port_parent_id(b, &b_id, true)) return false; return netdev_phys_item_id_same(&a_id, &b_id); @@ -9863,6 +10049,7 @@ int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) return dev->netdev_ops->ndo_bpf(dev, bpf); } +EXPORT_SYMBOL_GPL(netif_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { @@ -10258,7 +10445,8 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } - bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog, + attr->link_create.attach_type); link->dev = dev; link->flags = attr->link_create.flags; @@ -10393,7 +10581,7 @@ static void dev_index_release(struct net *net, int ifindex) static bool from_cleanup_net(void) { #ifdef CONFIG_NET_NS - return current == cleanup_net_task; + return current == READ_ONCE(cleanup_net_task); #else return false; #endif @@ -10624,12 +10812,14 @@ sync_lower: * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ + udp_tunnel_nic_lock(dev); if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } + udp_tunnel_nic_unlock(dev); } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { @@ -11047,8 +11237,7 @@ int register_netdevice(struct net_device *dev) * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. 
*/ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: @@ -11610,7 +11799,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_len = sizeof_priv; - ref_tracker_dir_init(&dev->refcnt_tracker, 128, name); + ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev"); #ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) @@ -11832,21 +12021,8 @@ static void netdev_rss_contexts_free(struct net_device *dev) mutex_lock(&dev->ethtool->rss_lock); xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { - struct ethtool_rxfh_param rxfh; - - rxfh.indir = ethtool_rxfh_context_indir(ctx); - rxfh.key = ethtool_rxfh_context_key(ctx); - rxfh.hfunc = ctx->hfunc; - rxfh.input_xfrm = ctx->input_xfrm; - rxfh.rss_context = context; - rxfh.rss_delete = true; - xa_erase(&dev->ethtool->rss_ctx, context); - if (dev->ethtool_ops->create_rxfh_context) - dev->ethtool_ops->remove_rxfh_context(dev, ctx, - context, NULL); - else - dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL); + dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL); kfree(ctx); } xa_destroy(&dev->ethtool->rss_ctx); @@ -11931,7 +12107,7 @@ void unregister_netdevice_many_notify(struct list_head *head, netdev_lock(dev); } } - dev_close_many(&close_head, true); + netif_close_many(&close_head, true); /* ... now unlock them and go over the rest. */ list_for_each_entry(dev, head, unreg_list) { if (netdev_need_ops_lock(dev)) @@ -11939,7 +12115,7 @@ void unregister_netdevice_many_notify(struct list_head *head, else list_add_tail(&dev->close_list, &close_head); } - dev_close_many(&close_head, true); + netif_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ @@ -11971,8 +12147,7 @@ void unregister_netdevice_many_notify(struct list_head *head, */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); @@ -12146,7 +12321,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); - netdev_unlock_ops(dev); + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); synchronize_net(); @@ -12184,7 +12363,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); dev->ifindex = new_ifindex; if (new_name[0]) { @@ -12210,7 +12391,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = netdev_change_owner(dev, net_old, net); WARN_ON(err); - netdev_lock_ops(dev); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. 
*/ @@ -12621,7 +12806,7 @@ static int net_page_pool_create(int cpuid) return err; } - per_cpu(system_page_pool, cpuid) = pp_ptr; + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; #endif return 0; } @@ -12751,13 +12936,13 @@ out: for_each_possible_cpu(i) { struct page_pool *pp_ptr; - pp_ptr = per_cpu(system_page_pool, i); + pp_ptr = per_cpu(system_page_pool.pool, i); if (!pp_ptr) continue; xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); - per_cpu(system_page_pool, i) = NULL; + per_cpu(system_page_pool.pool, i) = NULL; } } diff --git a/net/core/dev.h b/net/core/dev.h index 7ee203395d8e2..ab69edc0c3e38 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -15,8 +15,9 @@ struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { - u64 count; - unsigned int num_buckets; + struct rcu_head rcu; + unsigned int count; + u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; @@ -29,7 +30,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); -struct net_device *__netdev_put_lock(struct net_device *dev); +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); @@ -41,6 +42,21 @@ DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \ ifindex++) +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex); +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *, + if (_T) netdev_unlock_ops_compat(_T)); + +#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \ + (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \ + &ifindex)); \ + ifindex++) + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else @@ -299,6 +315,20 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, WRITE_ONCE(n->irq_suspend_timeout, timeout); } +static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) +{ + if (test_bit(NAPI_STATE_THREADED, &n->state)) + return NETDEV_NAPI_THREADED_ENABLED; + + return NETDEV_NAPI_THREADED_DISABLED; +} + +int napi_set_threaded(struct napi_struct *n, + enum netdev_napi_threaded threaded); + +int netif_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded); + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 90716bd736f3e..76c91f224886c 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -603,7 +603,7 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr, ASSERT_RTNL(); - err = dev_pre_changeaddr_notify(dev, addr, NULL); + err = netif_pre_changeaddr_notify(dev, addr, NULL); if (err) return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); diff --git a/net/core/dev_api.c b/net/core/dev_api.c index f9a160ab596f3..f28852078aa68 100644 --- a/net/core/dev_api.c +++ b/net/core/dev_api.c @@ 
-84,14 +84,15 @@ void dev_set_group(struct net_device *dev, int new_group) netdev_unlock_ops(dev); } -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address_user(struct net_device *dev, + struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { int ret; down_write(&dev_addr_sem); netdev_lock_ops(dev); - ret = netif_set_mac_address(dev, sa, extack); + ret = netif_set_mac_address(dev, ss, extack); netdev_unlock_ops(dev); up_write(&dev_addr_sem); @@ -319,20 +320,20 @@ EXPORT_SYMBOL(dev_set_allmulti); /** * dev_set_mac_address() - change Media Access Control Address * @dev: device - * @sa: new address + * @ss: new address * @extack: netlink extended ack * * Change the hardware (MAC) address of the device * * Return: 0 on success, -errno on failure. */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, struct netlink_ext_ack *extack) { int ret; netdev_lock_ops(dev); - ret = netif_set_mac_address(dev, sa, extack); + ret = netif_set_mac_address(dev, ss, extack); netdev_unlock_ops(dev); return ret; @@ -366,3 +367,16 @@ void netdev_state_change(struct net_device *dev) netdev_unlock_ops(dev); } EXPORT_SYMBOL(netdev_state_change); + +int dev_set_threaded(struct net_device *dev, + enum netdev_napi_threaded threaded) +{ + int ret; + + netdev_lock(dev); + ret = netif_set_threaded(dev, threaded); + netdev_unlock(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_threaded); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index fff13a8b48f18..9c0ad7f4b5d81 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -147,7 +147,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = (short) dev_get_flags(dev); + ifr->ifr_flags = (short)netif_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface @@ -572,9 +572,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: - if (dev->addr_len > sizeof(struct sockaddr)) + if (dev->addr_len > sizeof(ifr->ifr_hwaddr)) return -EINVAL; - return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); + return dev_set_mac_address_user(dev, + (struct sockaddr_storage *)&ifr->ifr_hwaddr, + NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) @@ -726,7 +728,8 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, switch (cmd) { case SIOCGIFHWADDR: dev_load(net, ifr->ifr_name); - ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); + ret = netif_get_mac_address(&ifr->ifr_hwaddr, net, + ifr->ifr_name); if (colon) *colon = ':'; return ret; diff --git a/net/core/devmem.c b/net/core/devmem.c index 2db428ab6b8be..24c591ab38aec 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/memory_provider.h> +#include <net/sock.h> #include <trace/events/page_pool.h> #include "devmem.h" @@ -30,7 +31,7 @@ static const struct memory_provider_ops dmabuf_devmem_ops; bool net_is_devmem_iov(struct net_iov *niov) { - return niov->pp->mp_ops == &dmabuf_devmem_ops; + return niov->type == NET_IOV_DMABUF; } static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, @@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) 
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + size_t size, avail; gen_pool_for_each_chunk(binding->chunk_pool, @@ -67,10 +70,11 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) gen_pool_destroy(binding->chunk_pool); dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, - DMA_FROM_DEVICE); + binding->direction); dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); kfree(binding); } @@ -117,6 +121,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) unsigned long xa_idx; unsigned int rxq_idx; + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase. + */ + synchronize_net(); + if (binding->list.next) list_del(&binding->list); @@ -131,8 +142,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } - xa_erase(&net_devmem_dmabuf_bindings, binding->id); - net_devmem_dmabuf_binding_put(binding); } @@ -166,7 +175,9 @@ err_close_rxq: } struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { struct net_devmem_dmabuf_binding *binding; @@ -189,13 +200,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, } binding->dev = dev; - - err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, - binding, xa_limit_32b, &id_alloc_next, - GFP_KERNEL); - if (err < 0) - goto err_free_binding; - xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); refcount_set(&binding->ref, 1); @@ -203,31 +207,42 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, mutex_init(&binding->lock); binding->dmabuf = dmabuf; + binding->direction = direction; binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); - goto err_free_id; + goto err_free_binding; } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, - DMA_FROM_DEVICE); + direction); if (IS_ERR(binding->sgt)) { err = PTR_ERR(binding->sgt); NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); goto err_detach; } + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + /* For simplicity we expect to make PAGE_SIZE allocations, but the * binding can be much more flexible than that. We may be able to * allocate MTU sized chunks here. Leave that for future work... 
*/ - binding->chunk_pool = - gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM; - goto err_unmap; + goto err_tx_vec; } virtual = 0; @@ -268,27 +283,38 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, for (i = 0; i < owner->area.num_niovs; i++) { niov = &owner->area.niovs[i]; + niov->type = NET_IOV_DMABUF; niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; } virtual += len; } + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_chunks; + + list_add(&binding->list, &priv->bindings); + return binding; err_free_chunks: gen_pool_for_each_chunk(binding->chunk_pool, net_devmem_dmabuf_free_chunk_owner, NULL); gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, - DMA_FROM_DEVICE); + direction); err_detach: dma_buf_detach(dmabuf, binding->attachment); -err_free_id: - xa_erase(&net_devmem_dmabuf_bindings, binding->id); err_free_binding: kfree(binding); err_put_dmabuf: @@ -296,6 +322,74 @@ err_put_dmabuf: return ERR_PTR(err); } +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); +} + +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct dst_entry *dst = __sk_dst_get(sk); + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. + */ + if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + err = -ENODEV; + goto out_err; + } + + return binding; + +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + if (virt_addr >= binding->dmabuf->size) + return NULL; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[virt_addr / PAGE_SIZE]; +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) diff --git a/net/core/devmem.h b/net/core/devmem.h index a1aabc9685cc6..41cd6e1c91412 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -11,6 +11,7 @@ #define _NET_DEVMEM_H #include <net/netmem.h> +#include <net/netdev_netlink.h> struct netlink_ext_ack; @@ -25,12 +26,20 @@ struct net_devmem_dmabuf_binding { /* The user holds a ref (via the netlink API) for as long as they want * the binding to remain alive. 
Each page pool using this binding holds - * a ref to keep the binding alive. Each allocated net_iov holds a - * ref. + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. * * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. */ refcount_t ref; @@ -46,6 +55,17 @@ struct net_devmem_dmabuf_binding { * active. */ u32 id; + + /* DMA direction, FROM_DEVICE for Rx binding, TO_DEVICE for Tx. */ + enum dma_data_direction direction; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; }; #if defined(CONFIG_NET_DEVMEM) @@ -62,14 +82,18 @@ struct dmabuf_genpool_chunk_owner { dma_addr_t base_dma_addr; }; -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); +void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -98,10 +122,10 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); } -static inline void +static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - refcount_inc(&binding->ref); + return refcount_inc_not_zero(&binding->ref); } static inline void @@ -110,30 +134,55 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) if (!refcount_dec_and_test(&binding->ref)) return; - __net_devmem_dmabuf_binding_free(binding); + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); } +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); bool net_is_devmem_iov(struct net_iov *niov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); #else struct net_devmem_dmabuf_binding; static inline void -__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) 
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) { } static inline struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, + struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { return ERR_PTR(-EOPNOTSUPP); } +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + static inline void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -172,6 +221,25 @@ static inline bool net_is_devmem_iov(struct net_iov *niov) { return false; } + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return NULL; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 8a7ce640f74d8..60d31c2feed3e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -208,7 +208,7 @@ static void send_dm_alert(struct work_struct *work) */ static void sched_send_work(struct timer_list *t) { - struct per_cpu_dm_data *data = from_timer(data, t, send_timer); + struct per_cpu_dm_data *data = timer_container_of(data, t, send_timer); schedule_work(&data->dm_alert_work); } diff --git a/net/core/dst.c b/net/core/dst.c index 795ca07e28a4e..e2de8b68c41d3 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -145,12 +145,12 @@ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; - dst->obsolete = DST_OBSOLETE_DEAD; + WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); - dst->input = dst_discard; - dst->output = dst_discard_out; - dst->dev = blackhole_netdev; + WRITE_ONCE(dst->input, dst_discard); + WRITE_ONCE(dst->output, dst_discard_out); + WRITE_ONCE(dst->dev, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } @@ -263,7 +263,7 @@ unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - return mtu ? : dst->dev->mtu; + return mtu ? 
: dst_dev(dst)->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 70c634b9e7b02..9ab4902324e19 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -17,6 +17,7 @@ struct dst_cache_pcpu { unsigned long refresh_ts; struct dst_entry *dst; + local_lock_t bh_lock; u32 cookie; union { struct in_addr in_saddr; @@ -51,7 +52,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, if (unlikely(!time_after(idst->refresh_ts, READ_ONCE(dst_cache->reset_ts)) || - (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); goto fail; @@ -65,10 +66,15 @@ fail: struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { + struct dst_entry *dst; + if (!dst_cache->cache) return NULL; - return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_lock_nested_bh(&dst_cache->cache->bh_lock); + dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; } EXPORT_SYMBOL_GPL(dst_cache_get); @@ -80,12 +86,16 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in_saddr.s_addr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -98,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, 0); idst->in_saddr.s_addr = saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); @@ -113,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -129,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in6_saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); @@ -142,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6); int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) { + unsigned int i; + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, gfp | __GFP_ZERO); if (!dst_cache->cache) return -ENOMEM; + for_each_possible_cpu(i) + local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); dst_cache_reset(dst_cache); return 0; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7af302080a660..8ca634964e363 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ 
-874,13 +874,14 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *r, *last = NULL; - struct fib_rule_hdr *frh = nlmsg_data(nlh); int err = -EINVAL, unresolved = 0; struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX + 1]; bool user_priority = false; + struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -1002,13 +1003,14 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *nlrule = NULL; - struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX+1]; bool user_priority = false; + struct fib_rule_hdr *frh; int err = -EINVAL; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -1260,12 +1262,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, { struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } - frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, diff --git a/net/core/filter.c b/net/core/filter.c index 577a4504e26fa..da391e2b0788d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -122,6 +122,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet + * @reason: record drop reason on errors (negative return value) * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller @@ -130,7 +131,8 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * be accepted or -EPERM if the packet should be tossed. * */ -int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, + unsigned int cap, enum skb_drop_reason *reason) { int err; struct sk_filter *filter; @@ -142,15 +144,20 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); + *reason = SKB_DROP_REASON_PFMEMALLOC; return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SOCKET_FILTER; return err; + } err = security_sock_rcv_skb(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SECURITY_HOOK; return err; + } rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); @@ -162,6 +169,8 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; + if (err) + *reason = SKB_DROP_REASON_SOCKET_FILTER; } rcu_read_unlock(); @@ -1968,10 +1977,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; + bool is_ipv6 = flags & BPF_F_IPV6; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | - BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1987,7 +1997,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, if (unlikely(from != 0)) return -EINVAL; - inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); @@ -3232,6 +3242,13 @@ static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto) +{ + skb->protocol = htons(proto); + if (skb_valid_dst(skb)) + skb_dst_drop(skb); +} + static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { /* Caller already did skb_cow() with len as headroom, @@ -3328,7 +3345,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) } } - skb->protocol = htons(ETH_P_IPV6); + bpf_skb_change_protocol(skb, ETH_P_IPV6); skb_clear_hash(skb); return 0; @@ -3358,7 +3375,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) } } - skb->protocol = htons(ETH_P_IP); + bpf_skb_change_protocol(skb, ETH_P_IP); skb_clear_hash(skb); return 0; @@ -3549,10 +3566,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) - skb->protocol = htons(ETH_P_IPV6); + bpf_skb_change_protocol(skb, ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) - skb->protocol = htons(ETH_P_IP); + bpf_skb_change_protocol(skb, ETH_P_IP); } if (skb_is_gso(skb)) { @@ -3605,10 +3622,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, /* Match skb->protocol to new outer l3 protocol */ if (skb->protocol == htons(ETH_P_IP) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) - skb->protocol = htons(ETH_P_IPV6); + bpf_skb_change_protocol(skb, ETH_P_IPV6); else if (skb->protocol == htons(ETH_P_IPV6) && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) - skb->protocol = htons(ETH_P_IP); + bpf_skb_change_protocol(skb, ETH_P_IP); if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -8023,10 +8040,6 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; @@ -8052,10 +8065,6 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { @@ -8489,18 +8498,12 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return 
&bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif default: return bpf_sk_base_func_proto(func_id, prog); } @@ -8696,7 +8699,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; - case offsetof(struct __sk_buff, sk): + case bpf_ctx_range_ptr(struct __sk_buff, sk): if (type == BPF_WRITE || size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; @@ -9274,7 +9277,7 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case offsetof(struct bpf_sock_addr, sk): + case bpf_ctx_range_ptr(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; if (size != sizeof(__u64)) @@ -9324,17 +9327,17 @@ static bool sock_ops_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; - case offsetof(struct bpf_sock_ops, sk): + case bpf_ctx_range_ptr(struct bpf_sock_ops, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; - case offsetof(struct bpf_sock_ops, skb_data): + case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct bpf_sock_ops, skb_data_end): + case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; @@ -9343,7 +9346,7 @@ static bool sock_ops_is_valid_access(int off, int size, bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); - case offsetof(struct bpf_sock_ops, skb_hwtstamp): + case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp): if (size != sizeof(__u64)) return false; break; @@ -9413,17 +9416,17 @@ static bool sk_msg_is_valid_access(int off, int size, return false; switch (off) { - case offsetof(struct sk_msg_md, data): + case bpf_ctx_range_ptr(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; - case offsetof(struct sk_msg_md, data_end): + case bpf_ctx_range_ptr(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; - case offsetof(struct sk_msg_md, sk): + case bpf_ctx_range_ptr(struct sk_msg_md, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; @@ -9455,6 +9458,9 @@ static bool flow_dissector_is_valid_access(int off, int size, if (off < 0 || off >= sizeof(struct __sk_buff)) return false; + if (off % size != 0) + return false; + if (type == BPF_WRITE) return false; @@ -11629,7 +11635,7 @@ static bool sk_lookup_is_valid_access(int off, int size, return false; switch (off) { - case offsetof(struct bpf_sk_lookup, sk): + case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk): info->reg_type = PTR_TO_SOCKET_OR_NULL; return size == sizeof(__u64); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 2b821b9a86997..7d426a8e29f30 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -75,7 +75,7 @@ static void est_fetch_counters(struct net_rate_estimator 
*e, static void est_timer(struct timer_list *t) { - struct net_rate_estimator *est = from_timer(est, t, timer); + struct net_rate_estimator *est = timer_container_of(est, t, timer); struct gnet_stats_basic_sync b; u64 b_bytes, b_packets; u64 rate, brate; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 0bc893d5f07b0..95d0a4df10069 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -2,7 +2,9 @@ #include <linux/cache.h> #include <linux/jiffies.h> #include <linux/list.h> +#include <net/aligned_data.h> #include <net/hotdata.h> +#include <net/ip.h> #include <net/proto_memory.h> struct net_hotdata net_hotdata __cacheline_aligned = { @@ -22,3 +24,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); + +struct net_aligned_data net_aligned_data; +EXPORT_IPV6_MOD(net_aligned_data); diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c index 759a9b9f3f898..669b357b73b2d 100644 --- a/net/core/ieee8021q_helpers.c +++ b/net/core/ieee8021q_helpers.c @@ -7,6 +7,11 @@ #include <net/dscp.h> #include <net/ieee8021q.h> +/* verify that table covers all 8 traffic types */ +#define TT_MAP_SIZE_OK(tbl) \ + compiletime_assert(ARRAY_SIZE(tbl) == IEEE8021Q_TT_MAX, \ + #tbl " size mismatch") + /* The following arrays map Traffic Types (TT) to traffic classes (TC) for * different number of queues as shown in the example provided by * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and @@ -101,51 +106,28 @@ int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues) switch (num_queues) { case 8: - compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_8queue_tt_tc_map != max - 1"); + TT_MAP_SIZE_OK(ieee8021q_8queue_tt_tc_map); return ieee8021q_8queue_tt_tc_map[tt]; case 7: - compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_7queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_7queue_tt_tc_map); return ieee8021q_7queue_tt_tc_map[tt]; case 6: - compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_6queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_6queue_tt_tc_map); return ieee8021q_6queue_tt_tc_map[tt]; case 5: - compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_5queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_5queue_tt_tc_map); return ieee8021q_5queue_tt_tc_map[tt]; case 4: - compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_4queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_4queue_tt_tc_map); return ieee8021q_4queue_tt_tc_map[tt]; case 3: - compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_3queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_3queue_tt_tc_map); return ieee8021q_3queue_tt_tc_map[tt]; case 2: - compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_2queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_2queue_tt_tc_map); return ieee8021q_2queue_tt_tc_map[tt]; case 1: - compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) != - IEEE8021Q_TT_MAX - 1, - "ieee8021q_1queue_tt_tc_map != max - 1"); - + TT_MAP_SIZE_OK(ieee8021q_1queue_tt_tc_map); return ieee8021q_1queue_tt_tc_map[tt]; } diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c index 941e26c1343de..9e9fb25314b9c 
100644 --- a/net/core/lock_debug.c +++ b/net/core/lock_debug.c @@ -18,9 +18,12 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, /* Keep enum and don't add default to trigger -Werror=switch */ switch (cmd) { + case NETDEV_XDP_FEAT_CHANGE: + netdev_assert_locked(dev); + fallthrough; + case NETDEV_CHANGE: case NETDEV_REGISTER: case NETDEV_UP: - case NETDEV_CHANGE: netdev_ops_assert_locked(dev); fallthrough; case NETDEV_DOWN: @@ -58,7 +61,6 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: - case NETDEV_XDP_FEAT_CHANGE: ASSERT_RTNL(); break; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 60f27cb4e54f1..f9d76d85d04fd 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -149,8 +149,7 @@ int lwtunnel_build_state(struct net *net, u16 encap_type, } EXPORT_SYMBOL_GPL(lwtunnel_build_state); -int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, - bool rtnl_is_held) +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; @@ -167,12 +166,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - if (rtnl_is_held) - __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); - if (rtnl_is_held) - rtnl_lock(); - ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } @@ -186,8 +180,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; @@ -208,9 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, } encap_type = nla_get_u16(nla_entype); - if (lwtunnel_valid_encap_type(encap_type, - extack, - rtnl_is_held) != 0) + if (lwtunnel_valid_encap_type(encap_type, extack)) return -EOPNOTSUPP; } } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a07249b59ae1e..bddfa389effa7 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -28,6 +28,7 @@ #include <net/neighbour.h> #include <net/arp.h> #include <net/dst.h> +#include <net/ip.h> #include <net/sock.h> #include <net/netevent.h> #include <net/netlink.h> @@ -53,8 +54,8 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev); +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -153,11 +154,12 @@ static void neigh_update_gc_list(struct neighbour *n) if (n->dead) goto out; - /* remove from the gc list if new state is permanent or if neighbor - * is externally learned; otherwise entry should be on the gc list + /* remove from the gc list if new state is permanent or if neighbor is + * externally learned / validated; otherwise entry should be on the gc + * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || - n->flags & NTF_EXT_LEARNED; + n->flags & 
(NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { @@ -204,6 +206,7 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) @@ -221,6 +224,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, *notify = 1; *managed_update = true; } + if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { + if (ndm_flags & NTF_EXT_VALIDATED) + neigh->flags |= NTF_EXT_VALIDATED; + else + neigh->flags &= ~NTF_EXT_VALIDATED; + *notify = 1; + *gc_update = true; + } } bool neigh_remove_one(struct neighbour *n) @@ -368,6 +379,43 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, } } +static void neigh_flush_one(struct neighbour *n) +{ + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + + write_lock(&n->lock); + + neigh_del_timer(n); + neigh_mark_dead(n); + + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + + neigh_dbg(2, "neigh %p is stray\n", n); + } + + write_unlock(&n->lock); + + neigh_cleanup_and_release(n); +} + static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { @@ -378,35 +426,29 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { - if (skip_perm && n->nud_state & NUD_PERMANENT) + if (skip_perm && + (n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_VALIDATED)) continue; - hlist_del_rcu(&n->hash); - hlist_del_rcu(&n->dev_list); - write_lock(&n->lock); - neigh_del_timer(n); - neigh_mark_dead(n); - if (refcount_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - * We must destroy neighbour entry, - * but someone still uses it. - * - * The destroy will be delayed until - * the last user releases us, but - * we must kill timers etc. and move - * it to safe state. 
- */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - WRITE_ONCE(n->output, neigh_blackhole); - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + neigh_flush_one(n); + } +} + +static void neigh_flush_table(struct neigh_table *tbl) +{ + struct neigh_hash_table *nht; + int i; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + for (i = 0; i < (1 << nht->hash_shift); i++) { + struct hlist_node *tmp; + struct neighbour *n; + + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) + neigh_flush_one(n); } } @@ -422,8 +464,15 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev, skip_perm); - pneigh_ifdown_and_unlock(tbl, dev); + if (likely(dev)) { + neigh_flush_dev(tbl, dev, skip_perm); + } else { + DEBUG_NET_WARN_ON_ONCE(skip_perm); + neigh_flush_table(tbl); + } + write_unlock_bh(&tbl->lock); + + pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) @@ -706,54 +755,53 @@ static u32 pneigh_hash(const void *pkey, unsigned int key_len) return hash_val; } -static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, - struct net *net, - const void *pkey, - unsigned int key_len, - struct net_device *dev) +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) { + struct pneigh_entry *n; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(tbl->phash_buckets[hash_val], + lockdep_is_held(&tbl->phash_lock)); + while (n) { if (!memcmp(n->key, pkey, key_len) && net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; - n = n->next; - } - return NULL; -} -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, struct net_device *dev) -{ - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); + } - return __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); + return NULL; } -EXPORT_SYMBOL_GPL(__pneigh_lookup); +EXPORT_IPV6_MOD(pneigh_lookup); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev, int creat) +int pneigh_create(struct neigh_table *tbl, struct net *net, + const void *pkey, struct net_device *dev, + u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + unsigned int key_len; + u32 hash_val; + int err = 0; - read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], - net, pkey, key_len, dev); - read_unlock_bh(&tbl->lock); + mutex_lock(&tbl->phash_lock); - if (n || !creat) - goto out; - - ASSERT_RTNL(); + n = pneigh_lookup(tbl, net, pkey, dev); + if (n) + goto update; + key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); - if (!n) + if (!n) { + err = -ENOBUFS; goto out; + } write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); @@ -763,73 +811,98 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) 
{ netdev_put(dev, &n->dev_tracker); kfree(n); - n = NULL; + err = -ENOBUFS; goto out; } - write_lock_bh(&tbl->lock); + hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; - tbl->phash_buckets[hash_val] = n; - write_unlock_bh(&tbl->lock); + rcu_assign_pointer(tbl->phash_buckets[hash_val], n); +update: + WRITE_ONCE(n->flags, flags); + n->permanent = permanent; + WRITE_ONCE(n->protocol, protocol); out: - return n; + mutex_unlock(&tbl->phash_lock); + return err; } -EXPORT_SYMBOL(pneigh_lookup); +static void pneigh_destroy(struct rcu_head *rcu) +{ + struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); + + netdev_put(n->dev, &n->dev_tracker); + kfree(n); +} int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - struct pneigh_entry *n, **np; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + struct pneigh_entry *n, __rcu **np; + unsigned int key_len; + u32 hash_val; - write_lock_bh(&tbl->lock); - for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + + mutex_lock(&tbl->phash_lock); + + for (np = &tbl->phash_buckets[hash_val]; + (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { - *np = n->next; - write_unlock_bh(&tbl->lock); + rcu_assign_pointer(*np, n->next); + + mutex_unlock(&tbl->phash_lock); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); return 0; } } - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); return -ENOENT; } -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev) +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { - struct pneigh_entry *n, **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np; + LIST_HEAD(head); u32 h; + mutex_lock(&tbl->phash_lock); + for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, 1)) != NULL) { + if (skip_perm && n->permanent) + goto skip; if (!dev || n->dev == dev) { - *np = n->next; - n->next = freelist; - freelist = n; + rcu_assign_pointer(*np, n->next); + list_add(&n->free_node, &head); continue; } +skip: np = &n->next; } } - write_unlock_bh(&tbl->lock); - while ((n = freelist)) { - freelist = n->next; - n->next = NULL; + + mutex_unlock(&tbl->phash_lock); + + while (!list_empty(&head)) { + n = list_first_entry(&head, typeof(*n), free_node); + list_del(&n->free_node); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); } - return -ENOENT; } static inline void neigh_parms_put(struct neigh_parms *parms) @@ -937,7 +1010,8 @@ static void neigh_periodic_work(struct work_struct *work) state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || - (n->flags & NTF_EXT_LEARNED)) { + (n->flags & + (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); continue; } @@ -1031,7 +1105,7 @@ static void neigh_probe(struct neighbour *neigh) static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; - struct neighbour *neigh = from_timer(neigh, t, timer); + struct neighbour *neigh = timer_container_of(neigh, t, timer); unsigned int state; int notify = 0; @@ -1090,9 +1164,15 @@ 
static void neigh_timer_handler(struct timer_list *t) if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { - WRITE_ONCE(neigh->nud_state, NUD_FAILED); + if (neigh->nud_state == NUD_PROBE && + neigh->flags & NTF_EXT_VALIDATED) { + WRITE_ONCE(neigh->nud_state, NUD_STALE); + neigh->updated = jiffies; + } else { + WRITE_ONCE(neigh->nud_state, NUD_FAILED); + neigh_invalidate(neigh); + } notify = 1; - neigh_invalidate(neigh); goto out; } @@ -1240,6 +1320,8 @@ static void neigh_update_hhs(struct neighbour *neigh) NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. + NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed + or invalidated. Caller MUST hold reference count on the entry. */ @@ -1402,7 +1484,8 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, * we can reinject the packet there. */ n2 = NULL; - if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { + if (dst && + READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; @@ -1517,7 +1600,7 @@ out: return rc; out_kfree_skb: rc = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); @@ -1541,7 +1624,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) err = dev_queue_xmit(skb); else { err = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } @@ -1569,7 +1652,7 @@ static void neigh_managed_work(struct work_struct *work) static void neigh_proxy_process(struct timer_list *t) { - struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); + struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer); long sched_next = 0; unsigned long now = jiffies; struct sk_buff *skb, *n; @@ -1756,6 +1839,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, @@ -1972,21 +2056,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; - - if (ndm_flags & NTF_MANAGED) { + if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } - err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); - if (pn) { - pn->flags = ndm_flags; - if (protocol) - pn->protocol = protocol; - err = 0; - } + err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, + !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } @@ -2004,7 +2080,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || - ndm_flags & NTF_EXT_LEARNED; + ndm_flags & (NTF_EXT_LEARNED | + NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; @@ -2015,10 +2092,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED will result in the neighbor + * being created with an invalid state (NUD_NONE). 
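/*
 * The pneigh conversion above lets readers walk phash_buckets[] with only
 * rcu_read_lock() held, so writers unlink entries under tbl->phash_lock and
 * defer the kfree() through call_rcu(). A minimal, self-contained sketch of
 * that unlink-then-defer pattern follows; the demo_* names are hypothetical
 * and not part of the patch.
 */
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct demo_entry {
	struct demo_entry __rcu *next;
	struct rcu_head rcu;
	int key;
};

static DEFINE_MUTEX(demo_lock);			/* serializes writers only */
static struct demo_entry __rcu *demo_head;

static void demo_entry_destroy(struct rcu_head *rcu)
{
	/* Runs after a grace period: no lockless reader still sees it. */
	kfree(container_of(rcu, struct demo_entry, rcu));
}

static int demo_remove(int key)
{
	struct demo_entry __rcu **np;
	struct demo_entry *e;

	mutex_lock(&demo_lock);
	for (np = &demo_head;
	     (e = rcu_dereference_protected(*np, lockdep_is_held(&demo_lock)));
	     np = &e->next) {
		if (e->key != key)
			continue;
		rcu_assign_pointer(*np, e->next);	/* unlink from the chain */
		mutex_unlock(&demo_lock);
		call_rcu(&e->rcu, demo_entry_destroy);	/* free after readers drain */
		return 0;
	}
	mutex_unlock(&demo_lock);
	return -ENOENT;
}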
+ */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = NUD_NONE; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot create externally validated neighbor with an invalid state"); + err = -EINVAL; + goto out; + } + } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & - (NTF_EXT_LEARNED | NTF_MANAGED), + (NTF_EXT_LEARNED | NTF_MANAGED | + NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); @@ -2030,6 +2124,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_release(neigh); goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED do not update the existing + * state other than clearing it if it was + * NUD_PERMANENT. + */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot mark neighbor as externally validated with an invalid state"); + err = -EINVAL; + neigh_release(neigh); + goto out; + } + } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | @@ -2046,13 +2158,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; + if (ndm_flags & NTF_EXT_VALIDATED) + flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); - if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { + if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) neigh_event_send(neigh, NULL); - err = 0; - } neigh_release(neigh); out: return err; @@ -2430,12 +2542,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, { struct ndtmsg *ndtm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); + if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } - ndtm = nlmsg_data(nlh); if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; @@ -2579,13 +2691,15 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; + u8 protocol; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; - neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; - neigh_flags = pn->flags & NTF_OLD_MASK; + neigh_flags = READ_ONCE(pn->flags); + neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT; + neigh_flags &= NTF_OLD_MASK; ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; @@ -2599,7 +2713,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nla_put(skb, NDA_DST, tbl->key_len, pn->key)) goto nla_put_failure; - if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) + protocol = READ_ONCE(pn->protocol); + if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol)) goto nla_put_failure; if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) goto nla_put_failure; @@ -2706,12 +2821,12 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, if (filter->dev_idx || filter->master_idx) flags |= NLM_F_DUMP_FILTERED; - read_lock_bh(&tbl->lock); - for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0; + n; + n = 
rcu_dereference(n->next)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -2720,16 +2835,13 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, flags, tbl); - if (err < 0) { - read_unlock_bh(&tbl->lock); + if (err < 0) goto out; - } next: idx++; } } - read_unlock_bh(&tbl->lock); out: cb->args[3] = h; cb->args[4] = idx; @@ -2747,12 +2859,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, if (strict_check) { struct ndmsg *ndm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); @@ -2846,64 +2958,58 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) return err; } -static int neigh_valid_get_req(const struct nlmsghdr *nlh, - struct neigh_table **tbl, - void **dst, int *dev_idx, u8 *ndm_flags, - struct netlink_ext_ack *extack) +static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) { - struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } if (ndm->ndm_flags & ~NTF_PROXY) { NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); + } + + if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { + NL_SET_ERR_MSG(extack, "No device specified"); + return ERR_PTR(-EINVAL); } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) - return err; - - *ndm_flags = ndm->ndm_flags; - *dev_idx = ndm->ndm_ifindex; - *tbl = neigh_find_table(ndm->ndm_family); - if (*tbl == NULL) { - NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); - return -EAFNOSUPPORT; - } + return ERR_PTR(err); for (i = 0; i <= NDA_MAX; ++i) { - if (!tb[i]) - continue; - switch (i) { case NDA_DST: - if (nla_len(tb[i]) != (int)(*tbl)->key_len) { - NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); - return -EINVAL; + if (!tb[i]) { + NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST); + return ERR_PTR(-EINVAL); } - *dst = nla_data(tb[i]); break; default: + if (!tb[i]) + continue; + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request"); - return -EINVAL; + return ERR_PTR(-EINVAL); } } - return 0; + return ndm; } static inline size_t neigh_nlmsg_size(void) @@ -2917,27 +3023,6 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int neigh_get_reply(struct net *net, struct neighbour *neigh, - u32 pid, u32 seq) -{ - struct sk_buff *skb; - int err = 0; - - skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - err = 
neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); - if (err) { - kfree_skb(skb); - goto errout; - } - - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} - static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) @@ -2946,85 +3031,91 @@ static inline size_t pneigh_nlmsg_size(void) + nla_total_size(1); /* NDA_PROTOCOL */ } -static int pneigh_get_reply(struct net *net, struct pneigh_entry *neigh, - u32 pid, u32 seq, struct neigh_table *tbl) +static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { + struct net *net = sock_net(in_skb->sk); + u32 pid = NETLINK_CB(in_skb).portid; + struct nlattr *tb[NDA_MAX + 1]; + struct net_device *dev = NULL; + u32 seq = nlh->nlmsg_seq; + struct neigh_table *tbl; + struct neighbour *neigh; struct sk_buff *skb; - int err = 0; + struct ndmsg *ndm; + void *dst; + int err; + + ndm = neigh_valid_get_req(nlh, tb, extack); + if (IS_ERR(ndm)) + return PTR_ERR(ndm); - skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + if (ndm->ndm_flags & NTF_PROXY) + skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); + else + skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); if (!skb) return -ENOBUFS; - err = pneigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0, tbl); - if (err) { - kfree_skb(skb); - goto errout; - } + rcu_read_lock(); - err = rtnl_unicast(skb, net, pid); -errout: - return err; -} + tbl = neigh_find_table(ndm->ndm_family); + if (!tbl) { + NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request"); + err = -EAFNOSUPPORT; + goto err_unlock; + } -static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(in_skb->sk); - struct net_device *dev = NULL; - struct neigh_table *tbl = NULL; - struct neighbour *neigh; - void *dst = NULL; - u8 ndm_flags = 0; - int dev_idx = 0; - int err; + if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { + NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request"); + err = -EINVAL; + goto err_unlock; + } - err = neigh_valid_get_req(nlh, &tbl, &dst, &dev_idx, &ndm_flags, - extack); - if (err < 0) - return err; + dst = nla_data(tb[NDA_DST]); - if (dev_idx) { - dev = __dev_get_by_index(net, dev_idx); + if (ndm->ndm_ifindex) { + dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); - return -ENODEV; + err = -ENODEV; + goto err_unlock; } } - if (!dst) { - NL_SET_ERR_MSG(extack, "Network address not specified"); - return -EINVAL; - } - - if (ndm_flags & NTF_PROXY) { + if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - pn = pneigh_lookup(tbl, net, dst, dev, 0); + pn = pneigh_lookup(tbl, net, dst, dev); if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); - return -ENOENT; + err = -ENOENT; + goto err_unlock; } - return pneigh_get_reply(net, pn, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, tbl); - } - if (!dev) { - NL_SET_ERR_MSG(extack, "No device specified"); - return -EINVAL; - } + err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl); + if (err) + goto err_unlock; + } else { + neigh = neigh_lookup(tbl, dst, dev); + if (!neigh) { + NL_SET_ERR_MSG(extack, "Neighbour entry not found"); + err = -ENOENT; + goto err_unlock; + } - neigh = neigh_lookup(tbl, dst, dev); - if (!neigh) { - NL_SET_ERR_MSG(extack, "Neighbour entry not found"); - return -ENOENT; + err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0); + neigh_release(neigh); + if (err) + goto 
err_unlock; } - err = neigh_get_reply(net, neigh, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq); - - neigh_release(neigh); + rcu_read_unlock(); + return rtnl_unicast(skb, net, pid); +err_unlock: + rcu_read_unlock(); + kfree_skb(skb); return err; } @@ -3231,9 +3322,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = tbl->phash_buckets[bucket]; + pn = rcu_dereference(tbl->phash_buckets[bucket]); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3251,15 +3343,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = pn->next; + pn = rcu_dereference(pn->next); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = tbl->phash_buckets[state->bucket]; + + pn = rcu_dereference(tbl->phash_buckets[state->bucket]); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference(pn->next); if (pn) break; } @@ -3823,7 +3917,7 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, - .flags = RTNL_FLAG_DUMP_UNLOCKED}, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, }; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 3e92bf0f9060b..4f0f0709a1cbc 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); + /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) - flow_limit_count = fl->count; + flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif @@ -144,11 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, atomic_read(&sd->dropped), - sd->time_squeeze, 0, + READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count, + READ_ONCE(sd->received_rps), flow_limit_count, input_qlen + process_qlen, (int)seq->index, input_qlen, process_qlen); return 0; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1ace0cd01adce..c28cd66654447 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -256,7 +256,7 @@ static ssize_t name_assign_type_show(struct device *dev, } static DEVICE_ATTR_RO(name_assign_type); -/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ +/* use same locking rules as GIFHWADDR ioctl's (netif_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -641,12 +641,6 @@ static ssize_t phys_port_id_show(struct device *dev, struct netdev_phys_item_id ppid; ssize_t ret; - /* The check is also done in dev_get_phys_port_id; this helps returning - * early without hitting the locking section below. 
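/*
 * The request validators in the neighbour hunks above switch from an
 * open-coded nlmsg_len check plus nlmsg_data() to nlmsg_payload(), which
 * hands back the payload only when the message is large enough for the
 * requested header and NULL otherwise. A small sketch of the idiom;
 * demo_valid_req() is hypothetical, struct ndmsg is the real neighbour
 * header.
 */
#include <net/netlink.h>
#include <linux/netlink.h>
#include <linux/neighbour.h>

static int demo_valid_req(const struct nlmsghdr *nlh,
			  struct netlink_ext_ack *extack)
{
	struct ndmsg *ndm;

	ndm = nlmsg_payload(nlh, sizeof(*ndm));
	if (!ndm) {
		NL_SET_ERR_MSG(extack, "Invalid header for request");
		return -EINVAL;
	}

	/* Reserved fields must stay zero so they can be reused later. */
	if (ndm->ndm_pad1 || ndm->ndm_pad2) {
		NL_SET_ERR_MSG(extack, "Invalid values in header");
		return -EINVAL;
	}

	return 0;
}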
- */ - if (!netdev->netdev_ops->ndo_get_phys_port_id) - return -EOPNOTSUPP; - ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; @@ -668,13 +662,6 @@ static ssize_t phys_port_name_show(struct device *dev, char name[IFNAMSIZ]; ssize_t ret; - /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the locking section below. - */ - if (!netdev->netdev_ops->ndo_get_phys_port_name && - !netdev->devlink_port) - return -EOPNOTSUPP; - ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; @@ -696,19 +683,11 @@ static ssize_t phys_switch_id_show(struct device *dev, struct netdev_phys_item_id ppid = { }; ssize_t ret; - /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the locking section below. This works - * because recurse is false when calling dev_get_port_parent_id. - */ - if (!netdev->netdev_ops->ndo_get_port_parent_id && - !netdev->devlink_port) - return -EOPNOTSUPP; - ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; - ret = dev_get_port_parent_id(netdev, &ppid, false); + ret = netif_get_port_parent_id(netdev, &ppid, false); if (!ret) ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); @@ -718,6 +697,40 @@ static ssize_t phys_switch_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_switch_id); +static struct attribute *netdev_phys_attrs[] __ro_after_init = { + &dev_attr_phys_port_id.attr, + &dev_attr_phys_port_name.attr, + &dev_attr_phys_switch_id.attr, + NULL, +}; + +static umode_t netdev_phys_is_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + struct device *dev = kobj_to_dev(kobj); + struct net_device *netdev = to_net_dev(dev); + + if (attr == &dev_attr_phys_port_id.attr) { + if (!netdev->netdev_ops->ndo_get_phys_port_id) + return 0; + } else if (attr == &dev_attr_phys_port_name.attr) { + if (!netdev->netdev_ops->ndo_get_phys_port_name && + !netdev->devlink_port) + return 0; + } else if (attr == &dev_attr_phys_switch_id.attr) { + if (!netdev->netdev_ops->ndo_get_port_parent_id && + !netdev->devlink_port) + return 0; + } + + return attr->mode; +} + +static const struct attribute_group netdev_phys_group = { + .attrs = netdev_phys_attrs, + .is_visible = netdev_phys_is_visible, +}; + static ssize_t threaded_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -744,7 +757,7 @@ static int modify_napi_threaded(struct net_device *dev, unsigned long val) if (val != 0 && val != 1) return -EOPNOTSUPP; - ret = dev_set_threaded(dev, val); + ret = netif_set_threaded(dev, val); return ret; } @@ -783,9 +796,6 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_napi_defer_hard_irqs.attr, - &dev_attr_phys_port_id.attr, - &dev_attr_phys_port_name.attr, - &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, @@ -1200,12 +1210,21 @@ static int rx_queue_default_mask(struct net_device *dev, struct netdev_rx_queue *queue) { #if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) - struct cpumask *rps_default_mask = READ_ONCE(dev_net(dev)->core.rps_default_mask); + struct cpumask *rps_default_mask; + int res = 0; + mutex_lock(&rps_default_mask_mutex); + + rps_default_mask = dev_net(dev)->core.rps_default_mask; if (rps_default_mask && !cpumask_empty(rps_default_mask)) - return netdev_rx_queue_set_rps_mask(queue, rps_default_mask); 
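/*
 * The sysfs hunk above drops the per-attribute "is this ndo implemented?"
 * checks and instead registers the phys_* files as a group whose
 * .is_visible callback hides unsupported attributes up front. A minimal
 * sketch of the mechanism; demo_*, foo, bar and device_supports_bar() are
 * hypothetical stand-ins, not part of the patch.
 */
#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	return sysfs_emit(buf, "foo\n");
}
static DEVICE_ATTR_RO(foo);

static ssize_t bar_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	return sysfs_emit(buf, "bar\n");
}
static DEVICE_ATTR_RO(bar);

static bool device_supports_bar(struct device *dev)
{
	return false;		/* hypothetical capability check */
}

static struct attribute *demo_attrs[] = {
	&dev_attr_foo.attr,
	&dev_attr_bar.attr,
	NULL,
};

static umode_t demo_is_visible(struct kobject *kobj,
			       struct attribute *attr, int index)
{
	struct device *dev = kobj_to_dev(kobj);

	/* Returning 0 hides the file; attr->mode keeps the default mode. */
	if (attr == &dev_attr_bar.attr && !device_supports_bar(dev))
		return 0;

	return attr->mode;
}

static const struct attribute_group demo_group = {
	.attrs = demo_attrs,
	.is_visible = demo_is_visible,
};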
-#endif + res = netdev_rx_queue_set_rps_mask(queue, rps_default_mask); + + mutex_unlock(&rps_default_mask_mutex); + + return res; +#else return 0; +#endif } static int rx_queue_add_kobject(struct net_device *dev, int index) @@ -2328,6 +2347,7 @@ int netdev_register_kobject(struct net_device *ndev) groups++; *groups++ = &netstat_group; + *groups++ = &netdev_phys_group; if (wireless_group_needed(ndev)) *groups++ = &wireless_group; diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 8a5b04c2699aa..e938f25e8e86f 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -11,4 +11,6 @@ int netdev_queue_update_kobjects(struct net_device *net, int netdev_change_owner(struct net_device *, const struct net *net_old, const struct net *net_new); +extern struct mutex rps_default_mask_mutex; + #endif diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b0dfdf791ece5..1b6f3826dd0e1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -19,9 +19,9 @@ #include <linux/net_namespace.h> #include <linux/sched/task.h> #include <linux/uidgid.h> -#include <linux/cookie.h> #include <linux/proc_fs.h> +#include <net/aligned_data.h> #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> @@ -64,8 +64,6 @@ DECLARE_RWSEM(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; -DEFINE_COOKIE(net_cookie); - static struct net_generic *net_alloc_generic(void) { unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); @@ -163,16 +161,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops, } } +static void ops_exit_rtnl_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + const struct pernet_operations *saved_ops = ops; + LIST_HEAD(dev_kill_list); + struct net *net; + + rtnl_lock(); + + list_for_each_entry(net, net_exit_list, exit_list) { + __rtnl_net_lock(net); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_rtnl) + ops->exit_rtnl(net, &dev_kill_list); + } + + __rtnl_net_unlock(net); + } + + unregister_netdevice_many(&dev_kill_list); + + rtnl_unlock(); +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { - struct net *net; if (ops->exit) { + struct net *net; + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } + if (ops->exit_batch) ops->exit_batch(net_exit_list); } @@ -188,6 +215,56 @@ static void ops_free_list(const struct pernet_operations *ops, } } +static void ops_undo_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list, + bool expedite_rcu) +{ + const struct pernet_operations *saved_ops; + bool hold_rtnl = false; + + if (!ops) + ops = list_entry(ops_list, typeof(*ops), list); + + saved_ops = ops; + + list_for_each_entry_continue_reverse(ops, ops_list, list) { + hold_rtnl |= !!ops->exit_rtnl; + ops_pre_exit_list(ops, net_exit_list); + } + + /* Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so the + * rcu_barrier() after ops_undo_list() isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. 
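/*
 * ops_exit_rtnl_list() above gives pernet subsystems an ->exit_rtnl() hook
 * that runs with RTNL (and the per-netns lock) already held, so device
 * teardown can be queued into one batched unregister_netdevice_many() call.
 * Assuming the ->exit_rtnl member added by this series, a user would look
 * roughly like the sketch below; the demo_* names are hypothetical.
 */
#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <linux/netdevice.h>

static struct rtnl_link_ops demo_rtnl_ops = {
	.kind = "demo",
};

static void demo_exit_rtnl(struct net *net, struct list_head *dev_kill_list)
{
	struct net_device *dev, *next;

	/* RTNL is held by the caller; just queue our devices for the
	 * single batched unregister that follows.
	 */
	for_each_netdev_safe(net, dev, next)
		if (dev->rtnl_link_ops == &demo_rtnl_ops)
			unregister_netdevice_queue(dev, dev_kill_list);
}

static struct pernet_operations demo_net_ops = {
	.exit_rtnl = demo_exit_rtnl,
};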
+ */ + if (expedite_rcu) + synchronize_rcu_expedited(); + else + synchronize_rcu(); + + if (hold_rtnl) + ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_exit_list(ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_free_list(ops, net_exit_list); +} + +static void ops_undo_single(struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + LIST_HEAD(ops_list); + + list_add(&ops->list, &ops_list); + ops_undo_list(&ops_list, NULL, net_exit_list, false); + list_del(&ops->list); +} + /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { @@ -240,10 +317,10 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) if (refcount_read(&net->ns.count) == 0) return NETNSA_NSID_NOT_ASSIGNED; - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); id = __peernet2id(net, peer); if (id >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); return id; } @@ -253,12 +330,12 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) * just been idr_remove()'d from there in cleanup_net(). */ if (!maybe_get_net(peer)) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); return NETNSA_NSID_NOT_ASSIGNED; } id = alloc_netid(net, peer, -1); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); put_net(peer); if (id < 0) @@ -324,8 +401,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ { refcount_set(&net->passive, 1); refcount_set(&net->ns.count, 1); - ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); - ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt"); + ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); + ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); get_random_bytes(&net->hash_mix, sizeof(u32)); net->dev_base_seq = 1; @@ -351,14 +428,11 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ - const struct pernet_operations *ops, *saved_ops; + const struct pernet_operations *ops; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); int error = 0; - preempt_disable(); - net->net_cookie = gen_cookie_next(&net_cookie); - preempt_enable(); + net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -376,29 +450,7 @@ out_undo: * for the pernet modules whose init functions did not fail. 
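/*
 * Both setup_net() and ops_undo_list() above rely on the same rollback
 * idiom: on failure, resume from the entry that failed and walk the list
 * backwards with list_for_each_entry_continue_reverse(), undoing only what
 * was actually initialized. The idiom in isolation, with hypothetical
 * demo_ops/demo_setup names:
 */
#include <linux/list.h>

struct demo_ops {
	struct list_head list;
	int (*init)(void);
	void (*exit)(void);
};

static int demo_setup(struct list_head *ops_list)
{
	struct demo_ops *ops;
	int err = 0;

	list_for_each_entry(ops, ops_list, list) {
		err = ops->init();
		if (err)
			goto undo;
	}
	return 0;

undo:
	/* Starts just before the failed entry and walks back to the head,
	 * so the failed (never-initialized) entry itself is skipped.
	 */
	list_for_each_entry_continue_reverse(ops, ops_list, list)
		ops->exit();
	return err;
}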
*/ list_add(&net->exit_list, &net_exit_list); - saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - synchronize_rcu(); - - ops = saved_ops; - rtnl_lock(); - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); - + ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } @@ -572,20 +624,20 @@ static void unhash_nsid(struct net *net, struct net *last) for_each_net(tmp) { int id; - spin_lock_bh(&tmp->nsid_lock); + spin_lock(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); + spin_unlock(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, GFP_KERNEL); if (tmp == last) break; } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); @@ -594,13 +646,11 @@ struct task_struct *cleanup_net_task; static void cleanup_net(struct work_struct *work) { - const struct pernet_operations *ops; - struct net *net, *tmp, *last; struct llist_node *net_kill_list; + struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); - cleanup_net_task = current; + WRITE_ONCE(cleanup_net_task, current); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -629,33 +679,7 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } - /* Run all of the network namespace pre_exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - /* - * Another CPU might be rcu-iterating the list, wait for it. - * This needs to be before calling the exit() notifiers, so - * the rcu_barrier() below isn't sufficient alone. - * Also the pre_exit() and exit() methods need this barrier. 
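/*
 * cleanup_net() above drains the pending namespaces with llist_del_all(),
 * detaching the whole lock-free list in one atomic exchange before touching
 * any element. The producer/consumer idiom in miniature; the demo_* names
 * are hypothetical.
 */
#include <linux/llist.h>
#include <linux/slab.h>

struct demo_item {
	struct llist_node node;
	int payload;
};

static LLIST_HEAD(demo_pending);

/* Producer side: safe from any context, no lock taken. */
static void demo_queue(struct demo_item *item)
{
	llist_add(&item->node, &demo_pending);
}

/* Consumer side (e.g. a work item): snapshot first, then process. */
static void demo_drain(void)
{
	struct llist_node *batch = llist_del_all(&demo_pending);
	struct demo_item *item, *tmp;

	llist_for_each_entry_safe(item, tmp, batch, node)
		kfree(item);	/* detached from the shared list, safe to free */
}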
- */ - synchronize_rcu_expedited(); - - rtnl_lock(); - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); + ops_undo_list(&pernet_list, NULL, &net_exit_list, true); up_read(&pernet_ops_rwsem); @@ -676,7 +700,7 @@ static void cleanup_net(struct work_struct *work) put_user_ns(net->user_ns); net_passive_dec(net); } - cleanup_net_task = NULL; + WRITE_ONCE(cleanup_net_task, NULL); } /** @@ -763,16 +787,50 @@ struct net *get_net_ns_by_pid(pid_t pid) } EXPORT_SYMBOL_GPL(get_net_ns_by_pid); +#ifdef CONFIG_NET_NS_REFCNT_TRACKER +static void net_ns_net_debugfs(struct net *net) +{ + ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt", + net->net_cookie, net->ns.inum); + ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt", + net->net_cookie, net->ns.inum); +} + +static int __init init_net_debugfs(void) +{ + ref_tracker_dir_debugfs(&init_net.refcnt_tracker); + ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker); + net_ns_net_debugfs(&init_net); + return 0; +} +late_initcall(init_net_debugfs); +#else +static void net_ns_net_debugfs(struct net *net) +{ +} +#endif + static __net_init int net_ns_net_init(struct net *net) { #ifdef CONFIG_NET_NS net->ns.ops = &netns_operations; #endif - return ns_alloc_inum(&net->ns); + net->ns.inum = PROC_NET_INIT_INO; + if (net != &init_net) { + int ret = ns_alloc_inum(&net->ns); + if (ret) + return ret; + } + net_ns_net_debugfs(net); + return 0; } static __net_exit void net_ns_net_exit(struct net *net) { + /* + * Initial network namespace doesn't exit so we don't need any + * special checks here. 
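/*
 * The new debugfs hooks above expose the two ref_tracker directories that
 * preinit_net() sets up for each namespace. For context, the tracker API
 * pairs every acquired reference with a cookie so leaks can be attributed
 * to a call site. A minimal usage sketch with hypothetical demo_* names;
 * the ref_tracker_* calls are assumed to match linux/ref_tracker.h.
 */
#include <linux/ref_tracker.h>
#include <linux/gfp.h>

static struct ref_tracker_dir demo_dir;

static void demo_init(void)
{
	/* 16 freed trackers are quarantined for use-after-free reports;
	 * the name is what shows up in the stats and debugfs output.
	 */
	ref_tracker_dir_init(&demo_dir, 16, "demo");
}

static int demo_hold(struct ref_tracker **trackerp)
{
	return ref_tracker_alloc(&demo_dir, trackerp, GFP_KERNEL);
}

static void demo_put(struct ref_tracker **trackerp)
{
	ref_tracker_free(&demo_dir, trackerp);
}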
+ */ ns_free_inum(&net->ns); } @@ -824,9 +882,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(peer); } - spin_lock_bh(&net->nsid_lock); + spin_lock(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); err = -EEXIST; NL_SET_BAD_ATTR(extack, nla); NL_SET_ERR_MSG(extack, @@ -835,7 +893,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, } err = alloc_netid(net, peer, nsid); - spin_unlock_bh(&net->nsid_lock); + spin_unlock(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid, nlh, GFP_KERNEL); @@ -1239,31 +1297,13 @@ void __init net_ns_init(void) rtnl_register_many(net_ns_rtnl_msg_handlers); } -static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) -{ - ops_pre_exit_list(ops, net_exit_list); - synchronize_rcu(); - - if (ops->exit_batch_rtnl) { - LIST_HEAD(dev_kill_list); - - rtnl_lock(); - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - ops_exit_list(ops, net_exit_list); - - ops_free_list(ops, net_exit_list); -} - #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + LIST_HEAD(net_exit_list); struct net *net; int error; - LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); if (ops->init || ops->id) { @@ -1282,21 +1322,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { - struct net *net; LIST_HEAD(net_exit_list); + struct net *net; - list_del(&ops->list); /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + list_del(&ops->list); + ops_undo_single(ops, &net_exit_list); } #else @@ -1318,8 +1358,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) list_del(&ops->list); } else { LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); } } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index d22f0919821e9..dff66d8fb325d 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -21,7 +21,9 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state struct cgroup_cls_state *task_cls_state(struct task_struct *p) { return css_cls_state(task_css_check(p, net_cls_cgrp_id, - rcu_read_lock_bh_held())); + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_trace_held())); } EXPORT_SYMBOL_GPL(task_cls_state); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 739f7b6506a6a..e9a2a6f26cb7d 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -92,11 +92,18 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] }; /* NETDEV_CMD_NAPI_SET - do */ -static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED + 1] = { [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, 
[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), +}; + +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, }; /* Ops table for netdev */ @@ -187,9 +194,16 @@ static const struct genl_split_ops netdev_nl_ops[] = { .cmd = NETDEV_CMD_NAPI_SET, .doit = netdev_nl_napi_set_doit, .policy = netdev_napi_set_nl_policy, - .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + .maxattr = NETDEV_A_NAPI_THREADED, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 17d39fd64c948..cf3fad74511f5 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -34,6 +34,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a877693fecd65..6314eb7bdf69b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -38,6 +38,8 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, u64 xdp_rx_meta = 0; void *hdr; + netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! 
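/*
 * netdev_nl_dev_fill() above now asserts the per-device instance lock
 * rather than RTNL, and the doit/dumpit callers below switch to
 * netdev_get_by_index_lock()/netdev_unlock(). Reduced to its locking
 * skeleton, such a handler looks roughly like this sketch; the demo_*
 * names are hypothetical, the helpers are the ones the patch uses.
 */
#include <net/genetlink.h>
#include <linux/netdevice.h>
#include <linux/netdev.h>

static int demo_fill(struct net_device *netdev, struct sk_buff *rsp,
		     const struct genl_info *info)
{
	void *hdr = genlmsg_iput(rsp, info);

	if (!hdr)
		return -EMSGSIZE;
	/* ... put attributes describing 'netdev' here ... */
	genlmsg_end(rsp, hdr);
	return 0;
}

static int demo_get_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct net_device *netdev;
	struct sk_buff *rsp;
	u32 ifindex;
	int err;

	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX))
		return -EINVAL;
	ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);

	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!rsp)
		return -ENOMEM;

	/* Returns the device with its instance lock held, or NULL. */
	netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
	if (!netdev) {
		err = -ENODEV;
		goto err_free;
	}

	err = demo_fill(netdev, rsp, info);
	netdev_unlock(netdev);
	if (err)
		goto err_free;

	return genlmsg_reply(rsp, info);

err_free:
	nlmsg_free(rsp);
	return err;
}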
*/ + hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; @@ -122,15 +124,14 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = __dev_get_by_index(genl_info_net(info), ifindex); - if (netdev) - err = netdev_nl_dev_fill(netdev, rsp, info); - else + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { err = -ENODEV; + goto err_free_msg; + } - rtnl_unlock(); + err = netdev_nl_dev_fill(netdev, rsp, info); + netdev_unlock(netdev); if (err) goto err_free_msg; @@ -146,18 +147,15 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); - struct net_device *netdev; - int err = 0; + int err; - rtnl_lock(); - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) - break; + return err; } - rtnl_unlock(); - return err; + return 0; } static int @@ -186,6 +184,10 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; + if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED, + napi_get_threaded(napi))) + goto nla_put_failure; + if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) @@ -324,8 +326,18 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; + u8 threaded = 0; u32 defer = 0; + if (info->attrs[NETDEV_A_NAPI_THREADED]) { + int ret; + + threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); + ret = napi_set_threaded(napi, threaded); + if (ret) + return ret; + } + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); napi_set_defer_hard_irqs(napi, defer); @@ -481,18 +493,15 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), + ifindex); if (netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); - netdev_unlock(netdev); + netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } - rtnl_unlock(); - if (err) goto err_free_msg; @@ -541,17 +550,17 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = netdev_get_by_index_lock(net, ifindex); + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); - netdev_unlock(netdev); + netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } } else { - for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { + for_each_netdev_lock_ops_compat_scoped(net, netdev, + ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; @@ -559,7 +568,6 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) ctx->txq_idx = 0; } } - rtnl_unlock(); return err; } @@ -832,26 +840,31 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = 
nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev && netdev->stat_ops) { - err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, - info, ctx); - } else { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (!netdev) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); - err = netdev ? -EOPNOTSUPP : -ENODEV; + return -ENODEV; } - } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + if (netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); - if (err < 0) - break; + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = -EOPNOTSUPP; } + netdev_unlock_ops_compat(netdev); + return err; + } + + for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; } - rtnl_unlock(); return err; } @@ -908,7 +921,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); + binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, + priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; @@ -943,8 +957,6 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unbind; } - list_add(&binding->list, &priv->bindings); - nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); @@ -969,6 +981,81 @@ err_genlmsg_free: return err; } +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_devmem_dmabuf_binding *binding; + struct netdev_nl_sock *priv; + struct net_device *netdev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + + if (!netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + if (!netdev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver does not support netmem TX"); + goto err_unlock_netdev; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv, + info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock_netdev; + } + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + netdev_unlock(netdev); + mutex_unlock(&priv->lock); + + return genlmsg_reply(rsp, info); + +err_unlock_netdev: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); @@ -1009,10 +1096,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_REGISTER: + 
netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: + netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index d126f10197bf4..3bf1151d80610 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -97,14 +97,12 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; - if (rxq_idx >= dev->real_num_rx_queues) - return -EINVAL; - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); - if (rxq_idx >= dev->real_num_rx_queues) { NL_SET_ERR_MSG(extack, "rx queue index out of range"); return -ERANGE; } + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); return -EINVAL; diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 7eadb8393e002..cd95394399b40 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,7 +5,7 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic; + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) @@ -15,9 +15,16 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) static inline void netmem_clear_pp_magic(netmem_ref netmem) { + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + __netmem_clear_lsb(netmem)->pp_magic = 0; } +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { __netmem_clear_lsb(netmem)->pp = pool; @@ -28,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem, { __netmem_clear_lsb(netmem)->dma_addr = dma_addr; } + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = __netmem_clear_lsb(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + __netmem_clear_lsb(netmem)->pp_magic = magic; +} #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4ddb7490df4b8..5f65b62346d4e 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -58,13 +58,6 @@ static void zap_completion_queue(void); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); -#define np_info(np, fmt, ...) \ - pr_info("%s: " fmt, np->name, ##__VA_ARGS__) -#define np_err(np, fmt, ...) \ - pr_err("%s: " fmt, np->name, ##__VA_ARGS__) -#define np_notice(np, fmt, ...) 
\ - pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) - static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) @@ -379,6 +372,31 @@ out: return ret; } +static void netpoll_udp_checksum(struct netpoll *np, struct sk_buff *skb, + int len) +{ + struct udphdr *udph; + int udp_len; + + udp_len = len + sizeof(struct udphdr); + udph = udp_hdr(skb); + + /* check needs to be set, since it will be consumed in csum_partial */ + udph->check = 0; + if (np->ipv6) + udph->check = csum_ipv6_magic(&np->local_ip.in6, + &np->remote_ip.in6, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + else + udph->check = csum_tcpudp_magic(np->local_ip.ip, + np->remote_ip.ip, + udp_len, IPPROTO_UDP, + csum_partial(udph, udp_len, 0)); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; +} + netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; @@ -396,24 +414,101 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) } EXPORT_SYMBOL(netpoll_send_skb); +static void push_ipv6(struct netpoll *np, struct sk_buff *skb, int len) +{ + struct ipv6hdr *ip6h; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + + /* ip6h->version = 6; ip6h->priority = 0; */ + *(unsigned char *)ip6h = 0x60; + ip6h->flow_lbl[0] = 0; + ip6h->flow_lbl[1] = 0; + ip6h->flow_lbl[2] = 0; + + ip6h->payload_len = htons(sizeof(struct udphdr) + len); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->hop_limit = 32; + ip6h->saddr = np->local_ip.in6; + ip6h->daddr = np->remote_ip.in6; + + skb->protocol = htons(ETH_P_IPV6); +} + +static void push_ipv4(struct netpoll *np, struct sk_buff *skb, int len) +{ + static atomic_t ip_ident; + struct iphdr *iph; + int ip_len; + + ip_len = len + sizeof(struct udphdr) + sizeof(struct iphdr); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + + /* iph->version = 4; iph->ihl = 5; */ + *(unsigned char *)iph = 0x45; + iph->tos = 0; + put_unaligned(htons(ip_len), &iph->tot_len); + iph->id = htons(atomic_inc_return(&ip_ident)); + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(np->local_ip.ip, &iph->saddr); + put_unaligned(np->remote_ip.ip, &iph->daddr); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + skb->protocol = htons(ETH_P_IP); +} + +static void push_udp(struct netpoll *np, struct sk_buff *skb, int len) +{ + struct udphdr *udph; + int udp_len; + + udp_len = len + sizeof(struct udphdr); + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + udph = udp_hdr(skb); + udph->source = htons(np->local_port); + udph->dest = htons(np->remote_port); + udph->len = htons(udp_len); + + netpoll_udp_checksum(np, skb, len); +} + +static void push_eth(struct netpoll *np, struct sk_buff *skb) +{ + struct ethhdr *eth; + + eth = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + ether_addr_copy(eth->h_source, np->dev->dev_addr); + ether_addr_copy(eth->h_dest, np->remote_mac); + if (np->ipv6) + eth->h_proto = htons(ETH_P_IPV6); + else + eth->h_proto = htons(ETH_P_IP); +} + int netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; - struct udphdr *udph; - struct iphdr *iph; - struct ethhdr *eth; - static atomic_t ip_ident; - struct ipv6hdr *ip6h; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!irqs_disabled()); - udp_len = len + sizeof(*udph); + udp_len = len + sizeof(struct udphdr); 
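/*
 * The netmem_priv.h hunk above starts carrying a page_pool DMA index in the
 * otherwise-unused upper bits of pp_magic, masking it off whenever the
 * signature is compared. The same "spare bits of a tagged word" trick in a
 * self-contained form; the DEMO_* constants and helpers are hypothetical,
 * not the real PP_* layout.
 */
#include <linux/bits.h>
#include <linux/types.h>

#define DEMO_SIGNATURE		0x40UL				/* stand-in for PP_SIGNATURE */
#define DEMO_MAGIC_MASK		GENMASK(15, 0)			/* low bits: signature */
#define DEMO_INDEX_SHIFT	16
#define DEMO_INDEX_MASK		GENMASK(BITS_PER_LONG - 1, DEMO_INDEX_SHIFT)

static inline unsigned long demo_get_magic(unsigned long word)
{
	return word & ~DEMO_INDEX_MASK;		/* hide the packed index */
}

static inline bool demo_is_ours(unsigned long word)
{
	return (demo_get_magic(word) & DEMO_MAGIC_MASK) == DEMO_SIGNATURE;
}

static inline unsigned long demo_get_index(unsigned long word)
{
	return (word & DEMO_INDEX_MASK) >> DEMO_INDEX_SHIFT;
}

static inline unsigned long demo_set_index(unsigned long word, unsigned long id)
{
	return demo_get_magic(word) | (id << DEMO_INDEX_SHIFT);
}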
if (np->ipv6) - ip_len = udp_len + sizeof(*ip6h); + ip_len = udp_len + sizeof(struct ipv6hdr); else - ip_len = udp_len + sizeof(*iph); + ip_len = udp_len + sizeof(struct iphdr); total_len = ip_len + LL_RESERVED_SPACE(np->dev); @@ -425,117 +520,18 @@ int netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); - skb_push(skb, sizeof(*udph)); - skb_reset_transport_header(skb); - udph = udp_hdr(skb); - udph->source = htons(np->local_port); - udph->dest = htons(np->remote_port); - udph->len = htons(udp_len); - - if (np->ipv6) { - udph->check = csum_ipv6_magic(&np->local_ip.in6, - &np->remote_ip.in6, - udp_len, IPPROTO_UDP, - csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*ip6h)); - skb_reset_network_header(skb); - ip6h = ipv6_hdr(skb); - - /* ip6h->version = 6; ip6h->priority = 0; */ - *(unsigned char *)ip6h = 0x60; - ip6h->flow_lbl[0] = 0; - ip6h->flow_lbl[1] = 0; - ip6h->flow_lbl[2] = 0; - - ip6h->payload_len = htons(sizeof(struct udphdr) + len); - ip6h->nexthdr = IPPROTO_UDP; - ip6h->hop_limit = 32; - ip6h->saddr = np->local_ip.in6; - ip6h->daddr = np->remote_ip.in6; - - eth = skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IPV6); - } else { - udph->check = 0; - udph->check = csum_tcpudp_magic(np->local_ip.ip, - np->remote_ip.ip, - udp_len, IPPROTO_UDP, - csum_partial(udph, udp_len, 0)); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - - skb_push(skb, sizeof(*iph)); - skb_reset_network_header(skb); - iph = ip_hdr(skb); - - /* iph->version = 4; iph->ihl = 5; */ - *(unsigned char *)iph = 0x45; - iph->tos = 0; - put_unaligned(htons(ip_len), &(iph->tot_len)); - iph->id = htons(atomic_inc_return(&ip_ident)); - iph->frag_off = 0; - iph->ttl = 64; - iph->protocol = IPPROTO_UDP; - iph->check = 0; - put_unaligned(np->local_ip.ip, &(iph->saddr)); - put_unaligned(np->remote_ip.ip, &(iph->daddr)); - iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - - eth = skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb->protocol = eth->h_proto = htons(ETH_P_IP); - } - - ether_addr_copy(eth->h_source, np->dev->dev_addr); - ether_addr_copy(eth->h_dest, np->remote_mac); - + push_udp(np, skb, len); + if (np->ipv6) + push_ipv6(np, skb, len); + else + push_ipv4(np, skb, len); + push_eth(np, skb); skb->dev = np->dev; return (int)netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); -void netpoll_print_options(struct netpoll *np) -{ - np_info(np, "local port %d\n", np->local_port); - if (np->ipv6) - np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); - else - np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); - np_info(np, "interface name '%s'\n", np->dev_name); - np_info(np, "local ethernet address '%pM'\n", np->dev_mac); - np_info(np, "remote port %d\n", np->remote_port); - if (np->ipv6) - np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); - else - np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip); - np_info(np, "remote ethernet address %pM\n", np->remote_mac); -} -EXPORT_SYMBOL(netpoll_print_options); - -static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) -{ - const char *end; - - if (!strchr(str, ':') && - in4_pton(str, -1, (void *)addr, -1, &end) > 0) { - if (!*end) - return 0; - } - if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) { -#if IS_ENABLED(CONFIG_IPV6) - if (!*end) - return 1; -#else - return -1; -#endif - } - return -1; 
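/*
 * netpoll_udp_checksum() above folds the IPv4 and IPv6 pseudo-header
 * checksums into one helper. One rule it preserves is worth spelling out:
 * for UDP a checksum field of zero means "no checksum", so a computed value
 * of zero must go on the wire as all-ones (CSUM_MANGLED_0). A stripped-down
 * IPv4-only sketch; demo_udp4_csum() is hypothetical.
 */
#include <net/checksum.h>
#include <linux/skbuff.h>
#include <linux/udp.h>
#include <linux/in.h>

static void demo_udp4_csum(struct udphdr *udph, __be32 saddr, __be32 daddr,
			   int udp_len)
{
	udph->check = 0;	/* must be zero while csum_partial() runs */
	udph->check = csum_tcpudp_magic(saddr, daddr, udp_len, IPPROTO_UDP,
					csum_partial(udph, udp_len, 0));
	if (udph->check == 0)
		udph->check = CSUM_MANGLED_0;	/* zero would mean "no checksum" */
}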
-} static void skb_pool_flush(struct netpoll *np) { @@ -546,95 +542,6 @@ static void skb_pool_flush(struct netpoll *np) skb_queue_purge_reason(skb_pool, SKB_CONSUMED); } -int netpoll_parse_options(struct netpoll *np, char *opt) -{ - char *cur=opt, *delim; - int ipv6; - bool ipversion_set = false; - - if (*cur != '@') { - if ((delim = strchr(cur, '@')) == NULL) - goto parse_failed; - *delim = 0; - if (kstrtou16(cur, 10, &np->local_port)) - goto parse_failed; - cur = delim; - } - cur++; - - if (*cur != '/') { - ipversion_set = true; - if ((delim = strchr(cur, '/')) == NULL) - goto parse_failed; - *delim = 0; - ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip); - if (ipv6 < 0) - goto parse_failed; - else - np->ipv6 = (bool)ipv6; - cur = delim; - } - cur++; - - if (*cur != ',') { - /* parse out dev_name or dev_mac */ - if ((delim = strchr(cur, ',')) == NULL) - goto parse_failed; - *delim = 0; - - np->dev_name[0] = '\0'; - eth_broadcast_addr(np->dev_mac); - if (!strchr(cur, ':')) - strscpy(np->dev_name, cur, sizeof(np->dev_name)); - else if (!mac_pton(cur, np->dev_mac)) - goto parse_failed; - - cur = delim; - } - cur++; - - if (*cur != '@') { - /* dst port */ - if ((delim = strchr(cur, '@')) == NULL) - goto parse_failed; - *delim = 0; - if (*cur == ' ' || *cur == '\t') - np_info(np, "warning: whitespace is not allowed\n"); - if (kstrtou16(cur, 10, &np->remote_port)) - goto parse_failed; - cur = delim; - } - cur++; - - /* dst ip */ - if ((delim = strchr(cur, '/')) == NULL) - goto parse_failed; - *delim = 0; - ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip); - if (ipv6 < 0) - goto parse_failed; - else if (ipversion_set && np->ipv6 != (bool)ipv6) - goto parse_failed; - else - np->ipv6 = (bool)ipv6; - cur = delim + 1; - - if (*cur != 0) { - /* MAC address */ - if (!mac_pton(cur, np->remote_mac)) - goto parse_failed; - } - - netpoll_print_options(np); - - return 0; - - parse_failed: - np_info(np, "couldn't parse config at '%s'!\n", cur); - return -1; -} -EXPORT_SYMBOL(netpoll_parse_options); - static void refill_skbs_work_handler(struct work_struct *work) { struct netpoll *np = @@ -716,13 +623,97 @@ static char *egress_dev(struct netpoll *np, char *buf) return buf; } +static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev, + unsigned int timeout) +{ + unsigned long atmost; + + atmost = jiffies + timeout * HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + np_notice(np, "timeout waiting for carrier\n"); + break; + } + msleep(1); + } +} + +/* + * Take the IPv6 from ndev and populate local_ip structure in netpoll + */ +static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) +{ + char buf[MAC_ADDR_STR_LEN + 1]; + int err = -EDESTADDRREQ; + struct inet6_dev *idev; + + if (!IS_ENABLED(CONFIG_IPV6)) { + np_err(np, "IPv6 is not supported %s, aborting\n", + egress_dev(np, buf)); + return -EINVAL; + } + + idev = __in6_dev_get(ndev); + if (idev) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != + !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) + continue; + /* Got the IP, let's return */ + np->local_ip.in6 = ifp->addr; + err = 0; + break; + } + read_unlock_bh(&idev->lock); + } + if (err) { + np_err(np, "no IPv6 address for %s, aborting\n", + egress_dev(np, buf)); + return err; + } + + np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); + return 0; +} + +/* + * Take the IPv4 from ndev and populate 
local_ip structure in netpoll + */ +static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) +{ + char buf[MAC_ADDR_STR_LEN + 1]; + const struct in_ifaddr *ifa; + struct in_device *in_dev; + + in_dev = __in_dev_get_rtnl(ndev); + if (!in_dev) { + np_err(np, "no IP address for %s, aborting\n", + egress_dev(np, buf)); + return -EDESTADDRREQ; + } + + ifa = rtnl_dereference(in_dev->ifa_list); + if (!ifa) { + np_err(np, "no IP address for %s, aborting\n", + egress_dev(np, buf)); + return -EDESTADDRREQ; + } + + np->local_ip.ip = ifa->ifa_local; + np_info(np, "local IP %pI4\n", &np->local_ip.ip); + + return 0; +} + int netpoll_setup(struct netpoll *np) { struct net *net = current->nsproxy->net_ns; char buf[MAC_ADDR_STR_LEN + 1]; struct net_device *ndev = NULL; bool ip_overwritten = false; - struct in_device *in_dev; int err; rtnl_lock(); @@ -746,91 +737,44 @@ int netpoll_setup(struct netpoll *np) } if (!netif_running(ndev)) { - unsigned long atmost; - np_info(np, "device %s not up yet, forcing it\n", egress_dev(np, buf)); err = dev_open(ndev, NULL); - if (err) { np_err(np, "failed to open %s\n", ndev->name); goto put; } rtnl_unlock(); - atmost = jiffies + carrier_timeout * HZ; - while (!netif_carrier_ok(ndev)) { - if (time_after(jiffies, atmost)) { - np_notice(np, "timeout waiting for carrier\n"); - break; - } - msleep(1); - } - + netpoll_wait_carrier(np, ndev, carrier_timeout); rtnl_lock(); } if (!np->local_ip.ip) { if (!np->ipv6) { - const struct in_ifaddr *ifa; - - in_dev = __in_dev_get_rtnl(ndev); - if (!in_dev) - goto put_noaddr; - - ifa = rtnl_dereference(in_dev->ifa_list); - if (!ifa) { -put_noaddr: - np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); - err = -EDESTADDRREQ; + err = netpoll_take_ipv4(np, ndev); + if (err) goto put; - } - - np->local_ip.ip = ifa->ifa_local; - ip_overwritten = true; - np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_dev *idev; - - err = -EDESTADDRREQ; - idev = __in6_dev_get(ndev); - if (idev) { - struct inet6_ifaddr *ifp; - - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != - !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) - continue; - np->local_ip.in6 = ifp->addr; - ip_overwritten = true; - err = 0; - break; - } - read_unlock_bh(&idev->lock); - } - if (err) { - np_err(np, "no IPv6 address for %s, aborting\n", - egress_dev(np, buf)); + err = netpoll_take_ipv6(np, ndev); + if (err) goto put; - } else - np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); -#else - np_err(np, "IPv6 is not supported %s, aborting\n", - egress_dev(np, buf)); - err = -EINVAL; - goto put; -#endif } + ip_overwritten = true; } err = __netpoll_setup(np, ndev); if (err) goto flush; rtnl_unlock(); + + /* Make sure all NAPI polls which started before dev->npinfo + * was visible have exited before we start calling NAPI poll. + * NAPI skips locking if dev->npinfo is NULL. 
+ */ + synchronize_rcu(); + return 0; flush: @@ -863,7 +807,7 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) kfree(npinfo); } -void __netpoll_cleanup(struct netpoll *np) +static void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; @@ -885,7 +829,6 @@ void __netpoll_cleanup(struct netpoll *np) skb_pool_flush(np); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); void __netpoll_free(struct netpoll *np) { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 7745ad924ae2d..05e2e22a8f7c2 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -153,9 +153,9 @@ u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else -#define alloc_stat_inc(pool, __stat) -#define recycle_stat_inc(pool, __stat) -#define recycle_stat_add(pool, __stat, val) +#define alloc_stat_inc(...) do { } while (0) +#define recycle_stat_inc(...) do { } while (0) +#define recycle_stat_add(...) do { } while (0) #endif static bool page_pool_producer_lock(struct page_pool *pool) @@ -276,8 +276,7 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->dma_map) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { netdev_assert_locked(pool->slow.netdev); @@ -320,9 +319,7 @@ free_ptr_ring: static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->dma_map) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) @@ -374,7 +371,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); -static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); +static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem); static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { @@ -412,7 +409,7 @@ static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. 
*/ - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); alloc_stat_inc(pool, waive); netmem = 0; break; @@ -463,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } } -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -483,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr_netmem(netmem, dma)) + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; + } + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto unset_failed; + } + + netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: - WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); @@ -508,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } @@ -524,8 +544,8 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, } /* slow path */ -static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, - gfp_t gfp) +static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool, + gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; unsigned int pp_order = pool->p.order; @@ -554,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; - if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } @@ -595,7 +615,7 @@ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) netmem = pool->mp_ops->alloc_netmems(pool, gfp); else - netmem = __page_pool_alloc_pages_slow(pool, gfp); + netmem = __page_pool_alloc_netmems_slow(pool, gfp); return netmem; } EXPORT_SYMBOL(page_pool_alloc_netmems); @@ -653,9 +673,11 @@ void page_pool_clear_pp_info(netmem_ref netmem) 
netmem_set_pp(netmem, NULL); } -static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, - netmem_ref netmem) +static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, + netmem_ref netmem) { + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -664,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, */ return; + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ @@ -671,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -678,7 +712,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) +static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem) { int count; bool put; @@ -687,7 +721,7 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) put = pool->mp_ops->release_netmem(pool, netmem); else - __page_pool_release_page_dma(pool, netmem); + __page_pool_release_netmem_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. @@ -707,19 +741,16 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { - int ret; - /* BH protection not needed if current is softirq */ - if (in_softirq()) - ret = ptr_ring_produce(&pool->ring, (__force void *)netmem); - else - ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem); + bool in_softirq, ret; - if (!ret) { + /* BH protection not needed if current is softirq */ + in_softirq = page_pool_producer_lock(pool); + ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); + if (ret) recycle_stat_inc(pool, ring); - return true; - } + page_pool_producer_unlock(pool, in_softirq); - return false; + return ret; } /* Only allow direct recycling in special circumstances, into the @@ -795,7 +826,7 @@ __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, * will be invoking put_page. 
*/ recycle_stat_inc(pool, released_refcnt); - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); return 0; } @@ -805,6 +836,10 @@ static bool page_pool_napi_local(const struct page_pool *pool) const struct napi_struct *napi; u32 cpuid; + /* On PREEMPT_RT the softirq can be preempted by the consumer */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + if (unlikely(!in_softirq())) return false; @@ -829,12 +864,12 @@ void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, if (!allow_direct) allow_direct = page_pool_napi_local(pool); - netmem = - __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct); + netmem = __page_pool_put_page(pool, netmem, dma_sync_size, + allow_direct); if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_put_unrefed_netmem); @@ -877,7 +912,7 @@ static void page_pool_recycle_ring_bulk(struct page_pool *pool, * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) - page_pool_return_page(pool, bulk[i]); + page_pool_return_netmem(pool, bulk[i]); } /** @@ -960,7 +995,7 @@ static netmem_ref page_pool_drain_frag(struct page_pool *pool, return netmem; } - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); return 0; } @@ -974,7 +1009,7 @@ static void page_pool_free_frag(struct page_pool *pool) if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, @@ -1041,7 +1076,7 @@ static void page_pool_empty_ring(struct page_pool *pool) pr_crit("%s() page_pool refcnt %d violation\n", __func__, netmem_ref_count(netmem)); - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } } @@ -1074,14 +1109,35 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } } static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_netmem_dma(pool, page_to_netmem((struct page *)ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. @@ -1091,10 +1147,14 @@ static void page_pool_scrub(struct page_pool *pool) static int page_pool_release(struct page_pool *pool) { + bool in_softirq; int inflight; page_pool_scrub(pool); inflight = page_pool_inflight(pool, true); + /* Acquire producer lock to make sure producers have exited. 
*/ + in_softirq = page_pool_producer_lock(pool); + page_pool_producer_unlock(pool, in_softirq); if (!inflight) __page_pool_destroy(pool); @@ -1193,7 +1253,7 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, netmem); + page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fe7fdefab994c..0ebe5461d4d96 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -177,7 +177,7 @@ #define MAX_IMIX_ENTRIES 20 #define IMIX_PRECISION 100 /* Precision of IMIX distribution */ -#define func_enter() pr_debug("entering %s\n", __func__); +#define func_enter() pr_debug("entering %s\n", __func__) #define PKT_FLAGS \ pf(IPV6) /* Interface in IPV6 Mode */ \ @@ -227,12 +227,12 @@ static char *pkt_flag_names[] = { /* Xmit modes */ #define M_START_XMIT 0 /* Default normal TX */ -#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ -#define if_lock(t) mutex_lock(&(t->if_lock)); -#define if_unlock(t) mutex_unlock(&(t->if_lock)); +#define if_lock(t) mutex_lock(&(t->if_lock)) +#define if_unlock(t) mutex_unlock(&(t->if_lock)) /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 @@ -283,7 +283,8 @@ struct pktgen_dev { int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ + * removal by worker thread + */ struct page *page; u64 delay; /* nano-seconds */ @@ -346,10 +347,12 @@ struct pktgen_dev { __u16 udp_dst_max; /* exclusive, dest UDP port */ /* DSCP + ECN */ - __u8 tos; /* six MSB of (former) IPv4 TOS - are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 - (see RFC 3260, sec. 4) */ + __u8 tos; /* six MSB of (former) IPv4 TOS + * are for dscp codepoint + */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + * (see RFC 3260, sec. 4) + */ /* IMIX */ unsigned int n_imix_entries; @@ -389,12 +392,12 @@ struct pktgen_dev { __u8 hh[14]; /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; + * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + * + * We fill in SRC address later + * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + * 0x08, 0x00 + * }; */ __u16 pad; /* pad out the hh struct to an even 16 bytes */ @@ -458,7 +461,8 @@ struct pktgen_thread { char result[512]; /* Field for thread to receive "posted" events terminate, - stop ifs etc. */ + * stop ifs etc. + */ u32 control; int cpu; @@ -472,8 +476,7 @@ struct pktgen_thread { #define FIND 0 static const char version[] = - "Packet Generator for packet performance testing. " - "Version: " VERSION "\n"; + "Packet Generator for packet performance testing. 
Version: " VERSION "\n"; static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); @@ -624,8 +627,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, - " udp_src_min: %d udp_src_max: %d" - " udp_dst_min: %d udp_dst_max: %d\n", + " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); @@ -754,6 +756,7 @@ static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen, for (; i < maxlen; i++) { int value; char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); @@ -773,6 +776,7 @@ static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen) for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -799,6 +803,7 @@ static ssize_t num_arg(const char __user *user_buffer, size_t maxlen, for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; if ((c >= '0') && (c <= '9')) { @@ -816,6 +821,7 @@ static ssize_t strn_len(const char __user *user_buffer, size_t maxlen) for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -974,8 +980,8 @@ static __u32 pktgen_read_flag(const char *f, bool *disable) } static ssize_t pktgen_if_write(struct file *file, - const char __user * user_buffer, size_t count, - loff_t * offset) + const char __user *user_buffer, size_t count, + loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; @@ -1307,9 +1313,9 @@ static ssize_t pktgen_if_write(struct file *file, put_page(pkt_dev->page); pkt_dev->page = NULL; } - } - else + } else { sprintf(pg_result, "ERROR: node not possible"); + } return count; } if (!strcmp(name, "xmit_mode")) { @@ -1413,8 +1419,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { - memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strcpy(pkt_dev->dst_min, buf); + strscpy_pad(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } @@ -1434,8 +1439,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { - memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strcpy(pkt_dev->dst_max, buf); + strscpy_pad(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } @@ -1544,8 +1548,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { - memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strcpy(pkt_dev->src_min, buf); + strscpy_pad(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } @@ -1565,8 +1568,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { - memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strcpy(pkt_dev->src_max, buf); + strscpy_pad(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } @@ -1909,8 +1911,8 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) } static ssize_t 
pktgen_thread_write(struct file *file, - const char __user * user_buffer, - size_t count, loff_t * offset) + const char __user *user_buffer, + size_t count, loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; @@ -1962,6 +1964,7 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; + memset(f, 0, 32); max = min(sizeof(f) - 1, count - i); len = strn_len(&user_buffer[i], max); @@ -2397,13 +2400,14 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... -*/ + */ #define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { #ifdef CONFIG_XFRM struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { if (pkt_dev->spi) { @@ -2436,6 +2440,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { t = get_random_u32_inclusive(pkt_dev->queue_map_min, pkt_dev->queue_map_max); @@ -2517,6 +2522,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_MPLS_RND) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | @@ -2561,6 +2567,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) imx = ntohl(pkt_dev->saddr_max); if (imn < imx) { __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) t = get_random_u32_inclusive(imn, imx - 1); else { @@ -2581,6 +2588,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { do { @@ -2628,6 +2636,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { t = get_random_u32_inclusive(pkt_dev->min_pkt_size, pkt_dev->max_pkt_size - 1); @@ -2694,7 +2703,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) if (!x) return 0; /* XXX: we dont support tunnel mode for now until - * we resolve the dst issue */ + * we resolve the dst issue + */ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; @@ -2729,8 +2739,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { xfrm_state_put(x); pkt_dev->flows[i].x = NULL; @@ -2745,6 +2757,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; + if (x) { struct ethhdr *eth; struct iphdr *iph; @@ -2788,6 +2801,7 @@ err: static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; @@ -2900,7 +2914,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb->dev = dev; } } else { - skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } /* the caller pre-fetches from skb->data and reserves for the mac hdr */ @@ -2981,7 +2995,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->priority = pkt_dev->skb_priority; memcpy(eth, pkt_dev->hh, 12); - 
*(__be16 *) & eth[12] = protocol; + *(__be16 *)ð[12] = protocol; /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - @@ -3210,11 +3224,11 @@ static void pktgen_run(struct pktgen_thread *t) set_pkt_overhead(pkt_dev); - strcpy(pkt_dev->result, "Starting"); + strscpy(pkt_dev->result, "Starting"); pkt_dev->running = 1; /* Cranke yeself! */ started++; } else - strcpy(pkt_dev->result, "Error starting"); + strscpy(pkt_dev->result, "Error starting"); } rcu_read_unlock(); if (started) @@ -3473,6 +3487,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static void pktgen_resched(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); + schedule(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } @@ -3788,7 +3803,8 @@ static int add_dev_to_thread(struct pktgen_thread *t, * userspace on another CPU than the kthread. The if_lock() * is used here to sync with concurrent instances of * _rem_dev_from_if_list() invoked via kthread, which is also - * updating the if_list */ + * updating the if_list + */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3826,7 +3842,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) if (!pkt_dev) return -ENOMEM; - strcpy(pkt_dev->odevname, ifname); + strscpy(pkt_dev->odevname, ifname); pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, sizeof(struct flow_state)), node); @@ -3983,7 +3999,8 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race - * with proc_create_data() */ + * with proc_create_data() + */ proc_remove(pkt_dev->entry); /* And update the thread if_list */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c5a7f41982a57..094b085cff206 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1026,9 +1026,11 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_error = error, .rta_id = id, }; + unsigned long delta; if (dst) { - ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + delta = jiffies - READ_ONCE(dst->lastuse); + ci.rta_lastuse = jiffies_delta_to_clock_t(delta); ci.rta_used = dst->__use; ci.rta_clntref = rcuref_read(&dst->__rcuref); } @@ -1446,7 +1448,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) struct netdev_phys_item_id ppid = { }; int err; - err = dev_get_port_parent_id(dev, &ppid, false); + err = netif_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -2036,7 +2038,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, ifm->__ifi_pad = 0; ifm->ifi_type = READ_ONCE(dev->type); ifm->ifi_index = READ_ONCE(dev->ifindex); - ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) @@ -2390,12 +2392,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); @@ -3080,17 +3082,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, } if (tb[IFLA_ADDRESS]) { - 
struct sockaddr *sa; - int len; - - len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, - sizeof(*sa)); - sa = kmalloc(len, GFP_KERNEL); - if (!sa) { - err = -ENOMEM; - goto errout; - } - sa->sa_family = dev->type; + struct sockaddr_storage ss = { }; netdev_unlock_ops(dev); @@ -3098,10 +3090,9 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, down_write(&dev_addr_sem); netdev_lock_ops(dev); - memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), - dev->addr_len); - err = netif_set_mac_address(dev, sa, extack); - kfree(sa); + ss.ss_family = dev->type; + memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); + err = netif_set_mac_address(dev, &ss, extack); if (err) { up_write(&dev_addr_sem); goto errout; @@ -3580,7 +3571,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { - unsigned int old_flags; + unsigned int old_flags, changed; int err; old_flags = dev->flags; @@ -3591,12 +3582,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, return err; } - if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); - } else { - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); + changed = old_flags ^ dev->flags; + if (dev->rtnl_link_initializing) { + dev->rtnl_link_initializing = false; + changed = ~0U; } + + __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); @@ -3654,7 +3646,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev_net_set(dev, net); dev->rtnl_link_ops = ops; - dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); @@ -3681,7 +3673,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) @@ -4083,7 +4075,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, struct ifinfomsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } @@ -4092,7 +4085,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); @@ -4883,12 +4875,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); @@ -5051,12 +5043,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr 
*nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); @@ -5235,7 +5227,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; - ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_flags = netif_get_flags(dev); ifm->ifi_change = 0; @@ -5323,12 +5315,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); @@ -6220,7 +6212,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, { struct if_stats_msg *ifsm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { + ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); + if (!ifsm) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } @@ -6228,8 +6221,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, if (!strict_check) return 0; - ifsm = nlmsg_data(nlh); - /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. */ @@ -6457,12 +6448,12 @@ static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, { struct br_port_msg *bpm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { + bpm = nlmsg_payload(nlh, sizeof(*bpm)); + if (!bpm) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } - bpm = nlmsg_data(nlh); if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; diff --git a/net/core/scm.c b/net/core/scm.c index 733c0cbd393d2..072d5742440ad 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -23,6 +23,8 @@ #include <linux/security.h> #include <linux/pid_namespace.h> #include <linux/pid.h> +#include <uapi/linux/pidfd.h> +#include <linux/pidfs.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/errqueue.h> @@ -145,6 +147,22 @@ void __scm_destroy(struct scm_cookie *scm) } EXPORT_SYMBOL(__scm_destroy); +static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid) +{ + int err; + + /* drop all previous references */ + scm_destroy_cred(scm); + + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + scm->pid = pid; + scm->creds.pid = pid_vnr(pid); + return 0; +} + int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { const struct proto_ops *ops = READ_ONCE(sock->ops); @@ -189,15 +207,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) if (err) goto error; - p->creds.pid = creds.pid; if (!p->pid || pid_vnr(p->pid) != creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(creds.pid); if (!pid) goto error; - put_pid(p->pid); - p->pid = pid; + + /* pass a struct pid reference from + * find_get_pid() to scm_replace_pid(). 
+ */ + err = scm_replace_pid(p, pid); + if (err) { + put_pid(pid); + goto error; + } } err = -EINVAL; @@ -404,3 +428,125 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) return new_fpl; } EXPORT_SYMBOL(scm_fp_dup); + +#ifdef CONFIG_SECURITY_NETWORK +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ + struct lsm_context ctx; + int err; + + if (sk->sk_scm_security) { + err = security_secid_to_secctx(scm->secid, &ctx); + + if (err >= 0) { + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, + ctx.context); + + security_release_secctx(&ctx); + } + } +} + +static bool scm_has_secdata(struct sock *sk) +{ + return sk->sk_scm_security; +} +#else +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ +} + +static bool scm_has_secdata(struct sock *sk) +{ + return false; +} +#endif + +static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) +{ + struct file *pidfd_file = NULL; + int len, pidfd; + + /* put_cmsg() doesn't return an error if CMSG is truncated, + * that's why we need to opencode these checks here. + */ + if (msg->msg_flags & MSG_CMSG_COMPAT) + len = sizeof(struct compat_cmsghdr) + sizeof(int); + else + len = sizeof(struct cmsghdr) + sizeof(int); + + if (msg->msg_controllen < len) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + + if (!scm->pid) + return; + + pidfd = pidfd_prepare(scm->pid, PIDFD_STALE, &pidfd_file); + + if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + + return; + } + + if (pidfd_file) + fd_install(pidfd, pidfd_file); +} + +static bool __scm_recv_common(struct sock *sk, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!msg->msg_control) { + if (sk->sk_scm_credentials || sk->sk_scm_pidfd || + scm->fp || scm_has_secdata(sk)) + msg->msg_flags |= MSG_CTRUNC; + + scm_destroy(scm); + return false; + } + + if (sk->sk_scm_credentials) { + struct user_namespace *current_ns = current_user_ns(); + struct ucred ucreds = { + .pid = scm->creds.pid, + .uid = from_kuid_munged(current_ns, scm->creds.uid), + .gid = from_kgid_munged(current_ns, scm->creds.gid), + }; + + put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); + } + + scm_passec(sk, msg, scm); + + if (scm->fp) + scm_detach_fds(msg, scm); + + return true; +} + +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + scm_destroy_cred(scm); +} +EXPORT_SYMBOL(scm_recv); + +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + if (sock->sk->sk_scm_pidfd) + scm_pidfd_recv(msg, scm); + + scm_destroy_cred(scm); +} diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 568779d5a0efa..9a39656804513 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif - -#if IS_ENABLED(CONFIG_IP_DCCP) -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) -{ - u64 seq; - net_secret_init(); - seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccp_sequence_number); - 
-#if IS_ENABLED(CONFIG_IPV6) -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - __be16 sport; - __be16 dport; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - .sport = sport, - .dport = dport - }; - u64 seq; - net_secret_init(); - seq = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccpv6_sequence_number); -#endif -#endif diff --git a/net/core/selftests.c b/net/core/selftests.c index 35f807ea99523..3d79133a91a61 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -27,6 +27,7 @@ struct net_packet_attrs { int max_size; u8 id; u16 queue_mapping; + bool bad_csum; }; struct net_test_priv { @@ -160,10 +161,25 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, skb->csum = 0; skb->ip_summed = CHECKSUM_PARTIAL; if (attr->tcp) { - thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, - ihdr->daddr, 0); + int l4len = skb->len - skb_transport_offset(skb); + + thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); + + if (attr->bad_csum) { + /* Force mangled checksum */ + if (skb_checksum_help(skb)) { + kfree_skb(skb); + return NULL; + } + + if (thdr->check != CSUM_MANGLED_0) + thdr->check = CSUM_MANGLED_0; + else + thdr->check = csum16_sub(thdr->check, + cpu_to_be16(1)); + } } else { udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); } @@ -238,7 +254,11 @@ static int net_test_loopback_validate(struct sk_buff *skb, if (tpriv->packet->id != shdr->id) goto out; - tpriv->ok = true; + if (tpriv->packet->bad_csum && skb->ip_summed == CHECKSUM_UNNECESSARY) + tpriv->ok = -EIO; + else + tpriv->ok = true; + complete(&tpriv->comp); out: kfree_skb(skb); @@ -284,7 +304,12 @@ static int __net_test_loopback(struct net_device *ndev, attr->timeout = NET_LB_TIMEOUT; wait_for_completion_timeout(&tpriv->comp, attr->timeout); - ret = tpriv->ok ? 0 : -ETIMEDOUT; + if (tpriv->ok < 0) + ret = tpriv->ok; + else if (!tpriv->ok) + ret = -ETIMEDOUT; + else + ret = 0; cleanup: dev_remove_pack(&tpriv->pt); @@ -344,6 +369,42 @@ static int net_test_phy_loopback_tcp(struct net_device *ndev) return __net_test_loopback(ndev, &attr); } +/** + * net_test_phy_loopback_tcp_bad_csum - PHY loopback test with a deliberately + * corrupted TCP checksum + * @ndev: the network device to test + * + * Builds the same minimal Ethernet/IPv4/TCP frame as + * net_test_phy_loopback_tcp(), then flips the least-significant bit of the TCP + * checksum so the resulting value is provably invalid (neither 0 nor 0xFFFF). + * The frame is transmitted through the device’s internal PHY loopback path: + * + * test code -> MAC driver -> MAC HW -> xMII -> PHY -> + * internal PHY loopback -> xMII -> MAC HW -> MAC driver -> test code + * + * Result interpretation + * --------------------- + * 0 The frame is delivered to the stack and the driver reports + * ip_summed as CHECKSUM_NONE or CHECKSUM_COMPLETE - both are + * valid ways to indicate “bad checksum, let the stack verify.” + * -ETIMEDOUT The MAC/PHY silently dropped the frame; hardware checksum + * verification filtered it out before the driver saw it. 
+ * -EIO The driver returned the frame with ip_summed == + * CHECKSUM_UNNECESSARY, falsely claiming a valid checksum and + * indicating a serious RX-path defect. + * + * Return: 0 on success or a negative error code on failure. + */ +static int net_test_phy_loopback_tcp_bad_csum(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + attr.bad_csum = true; + return __net_test_loopback(ndev, &attr); +} + static const struct net_test { char name[ETH_GSTRING_LEN]; int (*fn)(struct net_device *ndev); @@ -368,6 +429,9 @@ static const struct net_test { .name = "PHY internal loopback, TCP ", .fn = net_test_phy_loopback_tcp, }, { + .name = "PHY loopback, bad TCP csum ", + .fn = net_test_phy_loopback_tcp_bad_csum, + }, { /* This test should be done after all PHY loopback test */ .name = "PHY internal loopback, disable", .fn = net_test_phy_loopback_disable, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6cbf77bc61fce..ee0274417948e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -64,6 +64,7 @@ #include <linux/mpls.h> #include <linux/kcov.h> #include <linux/iov_iter.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <net/dst.h> @@ -89,6 +90,7 @@ #include <linux/textsearch.h> #include "dev.h" +#include "devmem.h" #include "netmem_priv.h" #include "sock_destructor.h" @@ -382,8 +384,7 @@ static inline void __finalize_skb_around(struct sk_buff *skb, void *data, skb_set_kcov_handle(skb, kcov_common_handle()); } -static inline void *__slab_build_skb(struct sk_buff *skb, void *data, - unsigned int *size) +static inline void *__slab_build_skb(void *data, unsigned int *size) { void *resized; @@ -416,7 +417,7 @@ struct sk_buff *slab_build_skb(void *data) return NULL; memset(skb, 0, offsetof(struct sk_buff, tail)); - data = __slab_build_skb(skb, data, &size); + data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); return skb; @@ -433,7 +434,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, * using slab buffer should use slab_build_skb() instead. */ if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) - data = __slab_build_skb(skb, data, &size); + data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); } @@ -893,11 +894,6 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool is_pp_netmem(netmem_ref netmem) -{ - return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { @@ -995,14 +991,7 @@ bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. 
- */ - if (unlikely(!is_pp_netmem(netmem))) + if (unlikely(!netmem_is_pp(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); @@ -1042,7 +1031,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); - if (likely(is_pp_netmem(head_netmem))) + if (likely(netmem_is_pp(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); @@ -1666,7 +1655,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1681,7 +1671,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } @@ -1704,7 +1694,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1734,7 +1724,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1749,7 +1740,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ -1853,7 +1844,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; @@ -1872,7 +1864,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return -EEXIST; } - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; @@ -3066,10 +3059,8 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd, /* * Fill page/offset/length into spd, if it can hold more pages. 
*/ -static bool spd_fill_page(struct splice_pipe_desc *spd, - struct pipe_inode_info *pipe, struct page *page, - unsigned int *len, unsigned int offset, - bool linear, +static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page, + unsigned int *len, unsigned int offset, bool linear, struct sock *sk) { if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) @@ -3097,8 +3088,7 @@ static bool __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct splice_pipe_desc *spd, bool linear, - struct sock *sk, - struct pipe_inode_info *pipe) + struct sock *sk) { if (!*len) return true; @@ -3117,8 +3107,7 @@ static bool __splice_segment(struct page *page, unsigned int poff, do { unsigned int flen = min(*len, plen); - if (spd_fill_page(spd, pipe, page, &flen, poff, - linear, sk)) + if (spd_fill_page(spd, page, &flen, poff, linear, sk)) return true; poff += flen; plen -= flen; @@ -3136,8 +3125,8 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd, struct sock *sk) { - int seg; struct sk_buff *iter; + int seg; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a @@ -3149,7 +3138,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, skb_headlen(skb), offset, len, spd, skb_head_is_locked(skb), - sk, pipe)) + sk)) return true; /* @@ -3166,7 +3155,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, if (__splice_segment(skb_frag_page(f), skb_frag_off(f), skb_frag_size(f), - offset, len, spd, false, sk, pipe)) + offset, len, spd, false, sk)) return true; } @@ -3239,8 +3228,9 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, - int len, sendmsg_func sendmsg) + int len, sendmsg_func sendmsg, int flags) { + int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0; unsigned int orig_len = len; struct sk_buff *head = skb; unsigned short fragidx; @@ -3257,7 +3247,9 @@ do_frag_list: kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + msg.msg_flags = MSG_DONTWAIT | flags; + if (slen < len) + msg.msg_flags |= more_hint; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, @@ -3294,9 +3286,12 @@ do_frag_list: while (slen) { struct bio_vec bvec; struct msghdr msg = { - .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | + flags, }; + if (slen < len) + msg.msg_flags |= more_hint; bvec_set_page(&bvec, skb_frag_page(frag), slen, skb_frag_off(frag) + offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, @@ -3340,14 +3335,21 @@ error: int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); + /* Send skb data on a socket. Socket must be unlocked. 
*/ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** @@ -3443,8 +3445,7 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. */ -__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum, const struct skb_checksum_ops *ops) +__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -3455,8 +3456,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, if (copy > 0) { if (copy > len) copy = len; - csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, - skb->data + offset, copy, csum); + csum = csum_partial(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -3486,13 +3486,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); - csum2 = INDIRECT_CALL_1(ops->update, - csum_partial_ext, - vaddr + p_off, p_len, 0); + csum2 = csum_partial(vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); - csum = INDIRECT_CALL_1(ops->combine, - csum_block_add_ext, csum, - csum2, pos, p_len); + csum = csum_block_add(csum, csum2, pos); pos += p_len; } @@ -3513,10 +3509,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum2; if (copy > len) copy = len; - csum2 = __skb_checksum(frag_iter, offset - start, - copy, 0, ops); - csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, - csum, csum2, pos, copy); + csum2 = skb_checksum(frag_iter, offset - start, copy, + 0); + csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; @@ -3528,18 +3523,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, return csum; } -EXPORT_SYMBOL(__skb_checksum); - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) -{ - const struct skb_checksum_ops ops = { - .update = csum_partial_ext, - .combine = csum_block_add_ext, - }; - - return __skb_checksum(skb, offset, len, csum, &ops); -} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. 
*/ @@ -3632,6 +3615,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +#ifdef CONFIG_NET_CRC32C +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + if (copy > 0) { + copy = min(copy, len); + crc = crc32c(crc, skb->data + offset, copy); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + copy = end - offset; + if (copy > 0) { + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + copy = min(copy, len); + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + crc = crc32c(crc, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + copy = min(copy, len); + crc = skb_crc32c(frag_iter, offset - start, copy, crc); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + BUG_ON(len); + + return crc; +} +EXPORT_SYMBOL(skb_crc32c); +#endif /* CONFIG_NET_CRC32C */ + __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; @@ -3691,32 +3746,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) } EXPORT_SYMBOL(__skb_checksum_complete); -static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static const struct skb_checksum_ops default_crc32c_ops = { - .update = warn_crc32c_csum_update, - .combine = warn_crc32c_csum_combine, -}; - -const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = - &default_crc32c_ops; -EXPORT_SYMBOL(crc32c_csum_stub); - /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer @@ -6232,9 +6261,6 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) if (!pskb_may_pull(skb, write_len)) return -ENOMEM; - if (!skb_frags_readable(skb)) - return -EFAULT; - if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; @@ -6737,8 +6763,7 @@ static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); /* carve out the first eat bytes from skb's frag_list. 
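The skb_crc32c() helper added above under CONFIG_NET_CRC32C replaces the old crc32c_csum_stub indirection with a direct crc32c() walk over the skb. A hedged sketch of how a caller might use it, following the standard CRC-32C convention of an all-ones seed and a final inversion; the helper and its parameters are illustrative:

#include <linux/skbuff.h>

/* Hypothetical helper: CRC-32C over an skb payload region.
 * Assumes the kernel was built with CONFIG_NET_CRC32C. */
static u32 example_payload_crc32c(const struct sk_buff *skb, int off, int len)
{
	/* Standard CRC-32C convention: seed with all-ones, invert at the end. */
	return ~skb_crc32c(skb, off, len, ~0U);
}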
May recurse into * pskb_carve() */ -static int pskb_carve_frag_list(struct sk_buff *skb, - struct skb_shared_info *shinfo, int eat, +static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat, gfp_t gfp_mask) { struct sk_buff *list = shinfo->frag_list; @@ -6843,7 +6868,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_clone_fraglist(skb); /* split line is in frag list */ - if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) { + if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) { /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ if (skb_has_frag_list(skb)) kfree_skb_list(skb_shinfo(skb)->frag_list); @@ -7208,7 +7233,6 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, * @skb: The buffer to add pages to * @iter: Iterator representing the pages to be added * @maxsize: Maximum amount of pages to be added - * @gfp: Allocation flags * * This is a common helper function for supporting MSG_SPLICE_PAGES. It * extracts pages from an iterator and adds them to the socket buffer if @@ -7219,7 +7243,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, * insufficient space in the buffer to transfer anything. */ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, - ssize_t maxsize, gfp_t gfp) + ssize_t maxsize) { size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); struct page *pages[8], **ppages = pages; @@ -7317,3 +7341,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(put_netmem); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 0ddc4c7188332..83c78379932e2 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -530,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, - struct sk_msg *msg) + struct sk_msg *msg, + bool take_ref) { int num_sge, copied; + /* skb_to_sgvec will fail when the total number of fragments in + * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the + * caller may aggregate multiple skbs. + */ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. + * Note that skb_linearize requires the skb not to be shared. */ if (skb_linearize(skb)) return -EAGAIN; @@ -556,7 +562,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; - msg->skb = skb; + msg->skb = take_ref ? 
skb_get(skb) : skb; sk_psock_queue_msg(psock, msg); sk_psock_data_ready(sk, psock); @@ -564,7 +570,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, } static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len); + u32 off, u32 len, bool take_ref); static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) @@ -578,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * correctly. */ if (unlikely(skb->sk == sk)) - return sk_psock_skb_ingress_self(psock, skb, off, len); + return sk_psock_skb_ingress_self(psock, skb, off, len, true); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; @@ -590,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * into user buffers. */ skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true); if (err < 0) kfree(msg); return err; @@ -601,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, * because the skb is already accounted for here. */ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, - u32 off, u32 len) + u32 off, u32 len, bool take_ref) { struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; @@ -610,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb if (unlikely(!msg)) return -EAGAIN; skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref); if (err < 0) kfree(msg); return err; @@ -619,18 +625,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { - int err = 0; - if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } - skb_get(skb); - err = sk_psock_skb_ingress(psock, skb, off, len); - if (err < 0) - kfree_skb(skb); - return err; + + return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, @@ -655,12 +656,21 @@ static void sk_psock_backlog(struct work_struct *work) bool ingress; int ret; - mutex_lock(&psock->work_mutex); - if (unlikely(state->len)) { - len = state->len; - off = state->off; - } + /* If sk is quickly removed from the map and then added back, the old + * psock should not be scheduled, because there are now two psocks + * pointing to the same sk. + */ + if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + return; + /* Increment the psock refcnt to synchronize with close(fd) path in + * sock_map_close(), ensuring we wait for backlog thread completion + * before sk_socket freed. If refcnt increment fails, it indicates + * sock_map_close() completed with sk_socket potentially already freed. 
+ */ + if (!sk_psock_get(psock->sk)) + return; + mutex_lock(&psock->work_mutex); while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; @@ -670,6 +680,13 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } + + /* Resume processing from previous partial state */ + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -680,7 +697,8 @@ static void sk_psock_backlog(struct work_struct *work) if (ret <= 0) { if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, len, off); - + /* Restore redir info we cleared before */ + skb_bpf_set_redir(skb, psock->sk, ingress); /* Delay slightly to prioritize any * other work that might be here. */ @@ -697,11 +715,14 @@ static void sk_psock_backlog(struct work_struct *work) len -= ret; } while (len); + /* The entire skb sent, clear state */ + sk_psock_skb_state(psock, state, 0, 0); skb = skb_dequeue(&psock->ingress_skb); kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); + sk_psock_put(psock->sk, psock); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -1014,7 +1035,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, off = stm->offset; len = stm->full_len; } - err = sk_psock_skb_ingress_self(psock, skb, off, len); + err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); diff --git a/net/core/sock.c b/net/core/sock.c index e54449c9ab0ba..7c26ec8dce630 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -148,6 +148,8 @@ #include <linux/ethtool.h> +#include <uapi/linux/pidfd.h> + #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); @@ -524,11 +526,10 @@ int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason drop_reason; int err; - err = sk_filter(sk, skb); - if (err) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + err = sk_filter_reason(sk, skb, &drop_reason); + if (err) goto out; - } + err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: @@ -551,15 +552,18 @@ EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; int rc = NET_RX_SUCCESS; + int err; - if (sk_filter_trim_cap(sk, skb, trim_cap)) + if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { atomic_inc(&sk->sk_drops); + reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } if (nested) @@ -575,8 +579,12 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); - } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { + } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { bh_unlock_sock(sk); + if (err == -ENOMEM) + reason = SKB_DROP_REASON_PFMEMALLOC; + if (err == -ENOBUFS) + reason = SKB_DROP_REASON_SOCKET_BACKLOG; atomic_inc(&sk->sk_drops); goto discard_and_relse; } @@ -587,7 +595,7 @@ out: sock_put(sk); return rc; discard_and_relse: - kfree_skb(skb); + sk_skb_reason_drop(sk, skb, reason); goto out; } EXPORT_SYMBOL(__sk_receive_skb); @@ -600,7 +608,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->obsolete && + if (dst && 
READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); @@ -618,7 +626,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); - if (dst && dst->obsolete && + if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); @@ -816,12 +824,10 @@ EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { - lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); - release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); @@ -835,14 +841,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) } } -void sock_enable_timestamps(struct sock *sk) -{ - lock_sock(sk); - __sock_set_timestamps(sk, true, false, true); - release_sock(sk); -} -EXPORT_SYMBOL(sock_enable_timestamps); - void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { @@ -1220,15 +1218,6 @@ int sk_setsockopt(struct sock *sk, int level, int optname, return 0; } return -EPERM; - case SO_PASSSEC: - assign_bit(SOCK_PASSSEC, &sock->flags, valbool); - return 0; - case SO_PASSCRED: - assign_bit(SOCK_PASSCRED, &sock->flags, valbool); - return 0; - case SO_PASSPIDFD: - assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); - return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: @@ -1276,6 +1265,8 @@ int sk_setsockopt(struct sock *sk, int level, int optname, return 0; } case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) @@ -1300,6 +1291,14 @@ int sk_setsockopt(struct sock *sk, int level, int optname, case SO_DEVMEM_DONTNEED: return sock_devmem_dontneed(sk, optval, optlen); #endif + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + return sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + return sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); } sockopt_lock_sock(sk); @@ -1455,18 +1454,6 @@ set_sndbuf: WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); break; } - case SO_RCVTIMEO_OLD: - case SO_RCVTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, - optlen, optname == SO_RCVTIMEO_OLD); - break; - - case SO_SNDTIMEO_OLD: - case SO_SNDTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, - optlen, optname == SO_SNDTIMEO_OLD); - break; - case SO_ATTACH_FILTER: { struct sock_fprog fprog; @@ -1557,6 +1544,33 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; + case SO_PASSCRED: + if (sk_may_scm_recv(sk)) + sk->sk_scm_credentials = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSSEC: + if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) + sk->sk_scm_security = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSPIDFD: + if (sk_is_unix(sk)) + sk->sk_scm_pidfd = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSRIGHTS: + if (sk_is_unix(sk)) + sk->sk_scm_rights = valbool; + else + ret = -EOPNOTSUPP; + break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); @@ -1853,11 +1867,24 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSCRED: - v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_credentials; break; case SO_PASSPIDFD: - v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_pidfd; + break; + + case SO_PASSRIGHTS: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_rights; break; case SO_PEERCRED: @@ -1879,6 +1906,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, { struct pid *peer_pid; struct file *pidfd_file = NULL; + unsigned int flags = 0; int pidfd; if (len > sizeof(pidfd)) @@ -1891,7 +1919,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!peer_pid) return -ENODATA; - pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); + /* The use of PIDFD_STALE requires stashing of struct pid + * on pidfs with pidfs_register_pid() and only AF_UNIX + * were prepared for this. + */ + if (sk->sk_family == AF_UNIX) + flags = PIDFD_STALE; + + pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; @@ -1954,7 +1989,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSSEC: - v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_security; break; case SO_PEERSEC: @@ -2102,6 +2140,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; + /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; @@ -2494,17 +2535,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; + + goto free; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. 
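The sk_setsockopt()/sk_getsockopt() hunks above turn SO_PASSCRED, SO_PASSSEC and SO_PASSPIDFD into per-socket fields, add SO_PASSRIGHTS, and fail with EOPNOTSUPP on socket families that cannot receive the matching SCM_* control messages. A hedged userspace sketch of switching off SCM_RIGHTS descriptor passing on an AF_UNIX socket; SO_PASSRIGHTS may have to be taken from the kernel UAPI headers if the libc does not define it yet:

#include <sys/socket.h>

/* Hypothetical helper: refuse incoming SCM_RIGHTS file descriptors on an
 * AF_UNIX socket. On non-unix sockets this setsockopt() now fails with
 * EOPNOTSUPP. */
static int example_disable_fd_passing(int unix_fd)
{
	int off = 0;

	return setsockopt(unix_fd, SOL_SOCKET, SO_PASSRIGHTS,
			  &off, sizeof(off));
}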
@@ -2534,18 +2572,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) net_enable_timestamp(); out: return newsk; -} -EXPORT_SYMBOL_GPL(sk_clone_lock); - -void sk_free_unlock_clone(struct sock *sk) -{ +free: /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - sk->sk_destruct = NULL; - bh_unlock_sock(sk); - sk_free(sk); + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; } -EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { @@ -2557,8 +2594,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : - READ_ONCE(dst->dev->gso_ipv4_max_size); + max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) : + READ_ONCE(dst_dev(dst)->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2569,7 +2606,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - sk->sk_route_caps = dst->dev->features; + sk->sk_route_caps = dst_dev(dst)->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2587,7 +2624,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; @@ -2743,17 +2780,6 @@ void sock_pfree(struct sk_buff *skb) EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ -kuid_t sock_i_uid(struct sock *sk) -{ - kuid_t uid; - - read_lock_bh(&sk->sk_callback_lock); - uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; - read_unlock_bh(&sk->sk_callback_lock); - return uid; -} -EXPORT_SYMBOL(sock_i_uid); - unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; @@ -3022,6 +3048,11 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EPERM; sockc->priority = *(u32 *)CMSG_DATA(cmsg); break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } @@ -3234,16 +3265,16 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; - bool charged = false; + bool charged = true; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg) { - if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) + charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + if (!charged) goto suppress_allocation; - charged = true; } /* Under limit. 
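The __sock_cmsg_send() hunk above gains a SCM_DEVMEM_DMABUF case that copies a u32 dma-buf binding id into the sockcm for device-memory TX. A rough userspace-style sketch of attaching that control message to a sendmsg() call; the binding id is assumed to come from the devmem setup (not shown), and SCM_DEVMEM_DMABUF may need the kernel UAPI headers:

#include <string.h>
#include <sys/socket.h>

/* Hypothetical helper: place the dma-buf binding id in a SOL_SOCKET
 * control message. cbuf must hold at least CMSG_SPACE(sizeof(u32)) bytes. */
static void example_attach_dmabuf_id(struct msghdr *msg, char *cbuf,
				     unsigned int dmabuf_id)
{
	struct cmsghdr *cm;

	msg->msg_control = cbuf;
	msg->msg_controllen = CMSG_SPACE(sizeof(dmabuf_id));

	cm = CMSG_FIRSTHDR(msg);
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_DEVMEM_DMABUF;
	/* The kernel accepts exactly CMSG_LEN(sizeof(u32)) here. */
	cm->cmsg_len = CMSG_LEN(sizeof(dmabuf_id));
	memcpy(CMSG_DATA(cm), &dmabuf_id, sizeof(dmabuf_id));
}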
*/ @@ -3328,7 +3359,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (charged) + if (memcg && charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; @@ -4004,7 +4035,7 @@ static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } @@ -4015,7 +4046,7 @@ static int assign_proto_idx(struct proto *prot) static void release_proto_idx(struct proto *prot) { - if (prot->inuse_idx != PROTO_INUSE_NR - 1) + if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a08eed9b9142e..b23594c767f2e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -264,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: - case DCCPDIAG_GETSOCK: - if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 82a14f131d00c..5947b38e4f8b6 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1709,7 +1709,6 @@ EXPORT_SYMBOL_GPL(sock_map_close); struct sockmap_link { struct bpf_link link; struct bpf_map *map; - enum bpf_attach_type attach_type; }; static void sock_map_link_release(struct bpf_link *link) @@ -1721,7 +1720,7 @@ static void sock_map_link_release(struct bpf_link *link) goto out; WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link, - sockmap_link->attach_type)); + link->attach_type)); bpf_map_put_with_uref(sockmap_link->map); sockmap_link->map = NULL; @@ -1772,7 +1771,7 @@ static int sock_map_link_update_prog(struct bpf_link *link, } ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink, - sockmap_link->attach_type); + link->attach_type); if (ret) goto out; @@ -1817,7 +1816,7 @@ static int sock_map_link_fill_info(const struct bpf_link *link, u32 map_id = sock_map_link_get_map_id(sockmap_link); info->sockmap.map_id = map_id; - info->sockmap.attach_type = sockmap_link->attach_type; + info->sockmap.attach_type = link->attach_type; return 0; } @@ -1828,7 +1827,7 @@ static void sock_map_link_show_fdinfo(const struct bpf_link *link, u32 map_id = sock_map_link_get_map_id(sockmap_link); seq_printf(seq, "map_id:\t%u\n", map_id); - seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type); + seq_printf(seq, "attach_type:\t%u\n", link->attach_type); } static const struct bpf_link_ops sock_map_link_ops = { @@ -1866,9 +1865,9 @@ int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog) } attach_type = attr->link_create.attach_type; - bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog); + bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog, + attach_type); sockmap_link->map = map; - sockmap_link->attach_type = attach_type; ret = bpf_link_prime(&sockmap_link->link, &link_primer); if (ret) { diff --git a/net/core/stream.c b/net/core/stream.c index b16dfa568a2d5..7a37e7dd2c43e 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -23,9 +23,13 @@ /** * sk_stream_write_space - stream socket write_space callback. 
- * @sk: socket + * @sk: pointer to the socket structure * - * FIXME: write proper description + * This function is invoked when there's space available in the socket's + * send buffer for writing. It first checks if the socket is writable, + * clears the SOCK_NOSPACE flag indicating that memory for writing + * is now available, wakes up any processes waiting for write operations + * and sends asynchronous notifications if needed. */ void sk_stream_write_space(struct sock *sk) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c7769ee0d9c55..8cf04b57ade1e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -28,6 +28,7 @@ #include <net/rps.h> #include "dev.h" +#include "net-sysfs.h" static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; @@ -96,50 +97,40 @@ free_buf: #ifdef CONFIG_RPS -static struct cpumask *rps_default_mask_cow_alloc(struct net *net) -{ - struct cpumask *rps_default_mask; - - if (net->core.rps_default_mask) - return net->core.rps_default_mask; - - rps_default_mask = kzalloc(cpumask_size(), GFP_KERNEL); - if (!rps_default_mask) - return NULL; - - /* pairs with READ_ONCE in rx_queue_default_mask() */ - WRITE_ONCE(net->core.rps_default_mask, rps_default_mask); - return rps_default_mask; -} +DEFINE_MUTEX(rps_default_mask_mutex); static int rps_default_mask_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->data; + struct cpumask *mask; int err = 0; - rtnl_lock(); + mutex_lock(&rps_default_mask_mutex); + mask = net->core.rps_default_mask; if (write) { - struct cpumask *rps_default_mask = rps_default_mask_cow_alloc(net); - + if (!mask) { + mask = kzalloc(cpumask_size(), GFP_KERNEL); + net->core.rps_default_mask = mask; + } err = -ENOMEM; - if (!rps_default_mask) + if (!mask) goto done; - err = cpumask_parse(buffer, rps_default_mask); + err = cpumask_parse(buffer, mask); if (err) goto done; - err = rps_cpumask_housekeeping(rps_default_mask); + err = rps_cpumask_housekeeping(mask); if (err) goto done; } else { err = dump_cpumask(buffer, lenp, ppos, - net->core.rps_default_mask ? 
: cpu_none_mask); + mask ?: cpu_none_mask); } done: - rtnl_unlock(); + mutex_unlock(&rps_default_mask_mutex); return err; } @@ -201,7 +192,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu_mightsleep(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -239,7 +230,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu_mightsleep(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); @@ -248,7 +239,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } diff --git a/net/core/utils.c b/net/core/utils.c index 27f4cffaae05d..5e63b0ea21f3f 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -399,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, } EXPORT_SYMBOL(inet_pton_with_scope); -bool inet_addr_is_any(struct sockaddr *addr) +bool inet_addr_is_any(struct sockaddr_storage *addr) { - if (addr->sa_family == AF_INET6) { + if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; const struct sockaddr_in6 in6_any = { .sin6_addr = IN6ADDR_ANY_INIT }; @@ -409,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr) if (!memcmp(in6->sin6_addr.s6_addr, in6_any.sin6_addr.s6_addr, 16)) return true; - } else if (addr->sa_family == AF_INET) { + } else if (addr->ss_family == AF_INET) { struct sockaddr_in *in = (struct sockaddr_in *)addr; if (in->sin_addr.s_addr == htonl(INADDR_ANY)) return true; } else { - pr_warn("unexpected address family %u\n", addr->sa_family); + pr_warn("unexpected address family %u\n", addr->ss_family); } return false; @@ -473,11 +473,11 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, EXPORT_SYMBOL(inet_proto_csum_replace16); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr) + __wsum diff, bool pseudohdr, bool ipv6) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace_by_diff(sum, diff); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6) skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); diff --git a/net/core/xdp.c b/net/core/xdp.c index f86eedad586a7..491334b9b8bec 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include <net/page_pool/helpers.h> #include <net/hotdata.h> +#include <net/netdev_lock.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> @@ -437,8 +438,8 @@ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as mem->type knows this a page_pool page + /* No need to check netmem_is_pp() as mem->type knows this a + * page_pool page */ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi_direct); @@ -697,23 +698,23 @@ static noinline bool 
xdp_copy_frags_from_zc(struct sk_buff *skb, nr_frags = xinfo->nr_frags; for (u32 i = 0; i < nr_frags; i++) { - u32 len = skb_frag_size(&xinfo->frags[i]); + const skb_frag_t *frag = &xinfo->frags[i]; + u32 len = skb_frag_size(frag); u32 offset, truesize = len; - netmem_ref netmem; + struct page *page; - netmem = page_pool_dev_alloc_netmem(pp, &offset, &truesize); - if (unlikely(!netmem)) { + page = page_pool_dev_alloc(pp, &offset, &truesize); + if (unlikely(!page)) { sinfo->nr_frags = i; return false; } - memcpy(__netmem_address(netmem), - __netmem_address(xinfo->frags[i].netmem), + memcpy(page_address(page) + offset, skb_frag_address(frag), LARGEST_ALIGN(len)); - __skb_fill_netmem_desc_noacc(sinfo, i, netmem, offset, len); + __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); tsize += truesize; - pfmemalloc |= netmem_is_pfmemalloc(netmem); + pfmemalloc |= page_is_pfmemalloc(page); } xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, @@ -737,25 +738,27 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, */ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) { - struct page_pool *pp = this_cpu_read(system_page_pool); const struct xdp_rxq_info *rxq = xdp->rxq; u32 len = xdp->data_end - xdp->data_meta; u32 truesize = xdp->frame_sz; - struct sk_buff *skb; + struct sk_buff *skb = NULL; + struct page_pool *pp; int metalen; void *data; if (!IS_ENABLED(CONFIG_PAGE_POOL)) return NULL; + local_lock_nested_bh(&system_page_pool.bh_lock); + pp = this_cpu_read(system_page_pool.pool); data = page_pool_dev_alloc_va(pp, &truesize); if (unlikely(!data)) - return NULL; + goto out; skb = napi_build_skb(data, truesize); if (unlikely(!skb)) { page_pool_free_va(pp, data, true); - return NULL; + goto out; } skb_mark_for_recycle(skb); @@ -774,13 +777,16 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) if (unlikely(xdp_buff_has_frags(xdp)) && unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { napi_consume_skb(skb, true); - return NULL; + skb = NULL; + goto out; } xsk_buff_free(xdp); skb->protocol = eth_type_trans(skb, rxq->dev); +out: + local_unlock_nested_bh(&system_page_pool.bh_lock); return skb; } EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); @@ -991,34 +997,60 @@ static int __init xdp_metadata_init(void) } late_initcall(xdp_metadata_init); -void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) { val &= NETDEV_XDP_ACT_MASK; if (dev->xdp_features == val) return; + netdev_assert_locked_or_invisible(dev); dev->xdp_features = val; if (dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } +EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); + +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + netdev_lock(dev); + xdp_set_features_flag_locked(dev, val); + netdev_unlock(dev); +} EXPORT_SYMBOL_GPL(xdp_set_features_flag); -void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg) { xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); if (support_sg) val |= NETDEV_XDP_ACT_NDO_XMIT_SG; - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked); + +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ + netdev_lock(dev); + xdp_features_set_redirect_target_locked(dev, support_sg); + 
netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); -void xdp_features_clear_redirect_target(struct net_device *dev) +void xdp_features_clear_redirect_target_locked(struct net_device *dev) { xdp_features_t val = dev->xdp_features; val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked); + +void xdp_features_clear_redirect_target(struct net_device *dev) +{ + netdev_lock(dev); + xdp_features_clear_redirect_target_locked(dev); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig deleted file mode 100644 index 0c7d2f66ba278..0000000000000 --- a/net/dccp/Kconfig +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menuconfig IP_DCCP - tristate "The DCCP Protocol" - depends on INET - help - Datagram Congestion Control Protocol (RFC 4340) - - From https://www.ietf.org/rfc/rfc4340.txt: - - The Datagram Congestion Control Protocol (DCCP) is a transport - protocol that implements bidirectional, unicast connections of - congestion-controlled, unreliable datagrams. It should be suitable - for use by applications such as streaming media, Internet telephony, - and on-line games. - - To compile this protocol support as a module, choose M here: the - module will be called dccp. - - If in doubt, say N. - -if IP_DCCP - -config INET_DCCP_DIAG - depends on INET_DIAG - def_tristate y if (IP_DCCP = y && INET_DIAG = y) - def_tristate m - -source "net/dccp/ccids/Kconfig" - -menu "DCCP Kernel Hacking" - depends on DEBUG_KERNEL=y - -config IP_DCCP_DEBUG - bool "DCCP debug messages" - help - Only use this if you're hacking DCCP. - - When compiling DCCP as a module, this debugging output can be toggled - by setting the parameter dccp_debug of the `dccp' module to 0 or 1. - - Just say N. - - -endmenu - -endif # IP_DDCP diff --git a/net/dccp/Makefile b/net/dccp/Makefile deleted file mode 100644 index 5b4ff37bc8061..0000000000000 --- a/net/dccp/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o - -dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \ - qpolicy.o -# -# CCID algorithms to be used by dccp.ko -# -# CCID-2 is default (RFC 4340, p. 
77) and has Ack Vectors as dependency -dccp-y += ccids/ccid2.o ackvec.o -dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o -dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \ - ccids/lib/tfrc_equation.o \ - ccids/lib/packet_history.o \ - ccids/lib/loss_interval.o - -dccp_ipv4-y := ipv4.o - -# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module -obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o -dccp_ipv6-y := ipv6.o - -obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o - -dccp-$(CONFIG_SYSCTL) += sysctl.o - -dccp_diag-y := diag.o - -# build with local directory for trace.h -CFLAGS_proto.o := -I$(src) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c deleted file mode 100644 index 1cba001bb4c8e..0000000000000 --- a/net/dccp/ackvec.c +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/ackvec.c - * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> - */ -#include "dccp.h" -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/export.h> - -static struct kmem_cache *dccp_ackvec_slab; -static struct kmem_cache *dccp_ackvec_record_slab; - -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) -{ - struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); - - if (av != NULL) { - av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; - INIT_LIST_HEAD(&av->av_records); - } - return av; -} - -static void dccp_ackvec_purge_records(struct dccp_ackvec *av) -{ - struct dccp_ackvec_record *cur, *next; - - list_for_each_entry_safe(cur, next, &av->av_records, avr_node) - kmem_cache_free(dccp_ackvec_record_slab, cur); - INIT_LIST_HEAD(&av->av_records); -} - -void dccp_ackvec_free(struct dccp_ackvec *av) -{ - if (likely(av != NULL)) { - dccp_ackvec_purge_records(av); - kmem_cache_free(dccp_ackvec_slab, av); - } -} - -/** - * dccp_ackvec_update_records - Record information about sent Ack Vectors - * @av: Ack Vector records to update - * @seqno: Sequence number of the packet carrying the Ack Vector just sent - * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector - */ -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) -{ - struct dccp_ackvec_record *avr; - - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); - if (avr == NULL) - return -ENOBUFS; - - avr->avr_ack_seqno = seqno; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = nonce_sum; - avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); - /* - * When the buffer overflows, we keep no more than one record. This is - * the simplest way of disambiguating sender-Acks dating from before the - * overflow from sender-Acks which refer to after the overflow; a simple - * solution is preferable here since we are handling an exception. - */ - if (av->av_overflow) - dccp_ackvec_purge_records(av); - /* - * Since GSS is incremented for each packet, the list is automatically - * arranged in descending order of @ack_seqno. 
- */ - list_add(&avr->avr_node, &av->av_records); - - dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", - (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno, - avr->avr_ack_runlen); - return 0; -} - -static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, - const u64 ackno) -{ - struct dccp_ackvec_record *avr; - /* - * Exploit that records are inserted in descending order of sequence - * number, start with the oldest record first. If @ackno is `before' - * the earliest ack_ackno, the packet is too old to be considered. - */ - list_for_each_entry_reverse(avr, av_list, avr_node) { - if (avr->avr_ack_seqno == ackno) - return avr; - if (before48(ackno, avr->avr_ack_seqno)) - break; - } - return NULL; -} - -/* - * Buffer index and length computation using modulo-buffersize arithmetic. - * Note that, as pointers move from right to left, head is `before' tail. - */ -static inline u16 __ackvec_idx_add(const u16 a, const u16 b) -{ - return (a + b) % DCCPAV_MAX_ACKVEC_LEN; -} - -static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) -{ - return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); -} - -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) -{ - if (unlikely(av->av_overflow)) - return DCCPAV_MAX_ACKVEC_LEN; - return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); -} - -/** - * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 - * @av: non-empty buffer to update - * @distance: negative or zero distance of @seqno from buf_ackno downward - * @seqno: the (old) sequence number whose record is to be updated - * @state: state in which packet carrying @seqno was received - */ -static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, - u64 seqno, enum dccp_ackvec_states state) -{ - u16 ptr = av->av_buf_head; - - BUG_ON(distance > 0); - if (unlikely(dccp_ackvec_is_empty(av))) - return; - - do { - u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); - - if (distance + runlen >= 0) { - /* - * Only update the state if packet has not been received - * yet. This is OK as per the second table in RFC 4340, - * 11.4.1; i.e. here we are using the following table: - * RECEIVED - * 0 1 3 - * S +---+---+---+ - * T 0 | 0 | 0 | 0 | - * O +---+---+---+ - * R 1 | 1 | 1 | 1 | - * E +---+---+---+ - * D 3 | 0 | 1 | 3 | - * +---+---+---+ - * The "Not Received" state was set by reserve_seats(). - */ - if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) - av->av_buf[ptr] = state; - else - dccp_pr_debug("Not changing %llu state to %u\n", - (unsigned long long)seqno, state); - break; - } - - distance += runlen + 1; - ptr = __ackvec_idx_add(ptr, 1); - - } while (ptr != av->av_buf_tail); -} - -/* Mark @num entries after buf_head as "Not yet received". 
*/ -static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) -{ - u16 start = __ackvec_idx_add(av->av_buf_head, 1), - len = DCCPAV_MAX_ACKVEC_LEN - start; - - /* check for buffer wrap-around */ - if (num > len) { - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); - start = 0; - num -= len; - } - if (num) - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); -} - -/** - * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer - * @av: container of buffer to update (can be empty or non-empty) - * @num_packets: number of packets to register (must be >= 1) - * @seqno: sequence number of the first packet in @num_packets - * @state: state in which packet carrying @seqno was received - */ -static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, - u64 seqno, enum dccp_ackvec_states state) -{ - u32 num_cells = num_packets; - - if (num_packets > DCCPAV_BURST_THRESH) { - u32 lost_packets = num_packets - 1; - - DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); - /* - * We received 1 packet and have a loss of size "num_packets-1" - * which we squeeze into num_cells-1 rather than reserving an - * entire byte for each lost packet. - * The reason is that the vector grows in O(burst_length); when - * it grows too large there will no room left for the payload. - * This is a trade-off: if a few packets out of the burst show - * up later, their state will not be changed; it is simply too - * costly to reshuffle/reallocate/copy the buffer each time. - * Should such problems persist, we will need to switch to a - * different underlying data structure. - */ - for (num_packets = num_cells = 1; lost_packets; ++num_cells) { - u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN); - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); - av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; - - lost_packets -= len; - } - } - - if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries"); - av->av_overflow = true; - } - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); - if (av->av_overflow) - av->av_buf_tail = av->av_buf_head; - - av->av_buf[av->av_buf_head] = state; - av->av_buf_ackno = seqno; - - if (num_packets > 1) - dccp_ackvec_reserve_seats(av, num_packets - 1); -} - -/** - * dccp_ackvec_input - Register incoming packet in the buffer - * @av: Ack Vector to register packet to - * @skb: Packet to register - */ -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) -{ - u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; - enum dccp_ackvec_states state = DCCPAV_RECEIVED; - - if (dccp_ackvec_is_empty(av)) { - dccp_ackvec_add_new(av, 1, seqno, state); - av->av_tail_ackno = seqno; - - } else { - s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); - u8 *current_head = av->av_buf + av->av_buf_head; - - if (num_packets == 1 && - dccp_ackvec_state(current_head) == state && - dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { - - *current_head += 1; - av->av_buf_ackno = seqno; - - } else if (num_packets > 0) { - dccp_ackvec_add_new(av, num_packets, seqno, state); - } else { - dccp_ackvec_update_old(av, num_packets, seqno, state); - } - } -} - -/** - * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection - * @av: Ack Vector record to clean - * @ackno: last Ack Vector which has been acknowledged - * - * This routine is called when the peer acknowledges the receipt of Ack Vectors - * up to and including @ackno. 
While based on section A.3 of RFC 4340, here - * are additional precautions to prevent corrupted buffer state. In particular, - * we use tail_ackno to identify outdated records; it always marks the earliest - * packet of group (2) in 11.4.2. - */ -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) -{ - struct dccp_ackvec_record *avr, *next; - u8 runlen_now, eff_runlen; - s64 delta; - - avr = dccp_ackvec_lookup(&av->av_records, ackno); - if (avr == NULL) - return; - /* - * Deal with outdated acknowledgments: this arises when e.g. there are - * several old records and the acks from the peer come in slowly. In - * that case we may still have records that pre-date tail_ackno. - */ - delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); - if (delta < 0) - goto free_records; - /* - * Deal with overlapping Ack Vectors: don't subtract more than the - * number of packets between tail_ackno and ack_ackno. - */ - eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; - - runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); - /* - * The run length of Ack Vector cells does not decrease over time. If - * the run length is the same as at the time the Ack Vector was sent, we - * free the ack_ptr cell. That cell can however not be freed if the run - * length has increased: in this case we need to move the tail pointer - * backwards (towards higher indices), to its next-oldest neighbour. - */ - if (runlen_now > eff_runlen) { - - av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; - av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); - - /* This move may not have cleared the overflow flag. */ - if (av->av_overflow) - av->av_overflow = (av->av_buf_head == av->av_buf_tail); - } else { - av->av_buf_tail = avr->avr_ack_ptr; - /* - * We have made sure that avr points to a valid cell within the - * buffer. This cell is either older than head, or equals head - * (empty buffer): in both cases we no longer have any overflow. - */ - av->av_overflow = 0; - } - - /* - * The peer has acknowledged up to and including ack_ackno. Hence the - * first packet in group (2) of 11.4.2 is the successor of ack_ackno. 
- */ - av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); - -free_records: - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); - } -} - -/* - * Routines to keep track of Ack Vectors received in an skb - */ -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) -{ - struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); - - if (new == NULL) - return -ENOBUFS; - new->vec = vec; - new->len = len; - new->nonce = nonce; - - list_add_tail(&new->node, head); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); - -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) -{ - struct dccp_ackvec_parsed *cur, *next; - - list_for_each_entry_safe(cur, next, parsed_chunks, node) - kfree(cur); - INIT_LIST_HEAD(parsed_chunks); -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); - -int __init dccp_ackvec_init(void) -{ - dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN); - if (dccp_ackvec_slab == NULL) - goto out_err; - - dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN); - if (dccp_ackvec_record_slab == NULL) - goto out_destroy_slab; - - return 0; - -out_destroy_slab: - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; -out_err: - DCCP_CRIT("Unable to create Ack Vector slab cache"); - return -ENOBUFS; -} - -void dccp_ackvec_exit(void) -{ - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; -} diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h deleted file mode 100644 index d2c4220fb377b..0000000000000 --- a/net/dccp/ackvec.h +++ /dev/null @@ -1,136 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ACKVEC_H -#define _ACKVEC_H -/* - * net/dccp/ackvec.h - * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com> - */ - -#include <linux/dccp.h> -#include <linux/compiler.h> -#include <linux/list.h> -#include <linux/types.h> - -/* - * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, - * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 - * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives - * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. - * The maximum value is bounded by the u16 types for indices and functions. - */ -#define DCCPAV_NUM_ACKVECS 2 -#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) - -/* Estimated minimum average Ack Vector length - used for updating MPS */ -#define DCCPAV_MIN_OPTLEN 16 - -/* Threshold for coping with large bursts of losses */ -#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) - -enum dccp_ackvec_states { - DCCPAV_RECEIVED = 0x00, - DCCPAV_ECN_MARKED = 0x40, - DCCPAV_RESERVED = 0x80, - DCCPAV_NOT_RECEIVED = 0xC0 -}; -#define DCCPAV_MAX_RUNLEN 0x3F - -static inline u8 dccp_ackvec_runlen(const u8 *cell) -{ - return *cell & DCCPAV_MAX_RUNLEN; -} - -static inline u8 dccp_ackvec_state(const u8 *cell) -{ - return *cell & ~DCCPAV_MAX_RUNLEN; -} - -/** - * struct dccp_ackvec - Ack Vector main data structure - * - * This implements a fixed-size circular buffer within an array and is largely - * based on Appendix A of RFC 4340. 
- * - * @av_buf: circular buffer storage area - * @av_buf_head: head index; begin of live portion in @av_buf - * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf - * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf - * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf - * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to - * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf - * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound - * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) - */ -struct dccp_ackvec { - u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; - u16 av_buf_head; - u16 av_buf_tail; - u64 av_buf_ackno:48; - u64 av_tail_ackno:48; - bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; - u8 av_overflow:1; - struct list_head av_records; -}; - -/** - * struct dccp_ackvec_record - Records information about sent Ack Vectors - * - * These list entries define the additional information which the HC-Receiver - * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. - * - * @avr_node: the list node in @av_records - * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on - * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to - * @avr_ack_ptr: pointer into @av_buf where this record starts - * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending - * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent - * - * The list as a whole is sorted in descending order by @avr_ack_seqno. - */ -struct dccp_ackvec_record { - struct list_head avr_node; - u64 avr_ack_seqno:48; - u64 avr_ack_ackno:48; - u16 avr_ack_ptr; - u8 avr_ack_runlen; - u8 avr_ack_nonce:1; -}; - -int dccp_ackvec_init(void); -void dccp_ackvec_exit(void); - -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); -void dccp_ackvec_free(struct dccp_ackvec *av); - -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); - -static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) -{ - return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; -} - -/** - * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb - * @vec: start of vector (offset into skb) - * @len: length of @vec - * @nonce: whether @vec had an ECN nonce of 0 or 1 - * @node: FIFO - arranged in descending order of ack_ackno - * - * This structure is used by CCIDs to access Ack Vectors in a received skb. 
- */ -struct dccp_ackvec_parsed { - u8 *vec, - len, - nonce:1; - struct list_head node; -}; - -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce); -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); -#endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c deleted file mode 100644 index 6beac5d348e2b..0000000000000 --- a/net/dccp/ccid.c +++ /dev/null @@ -1,219 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/ccid.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * - * CCID infrastructure - */ - -#include <linux/slab.h> - -#include "ccid.h" -#include "ccids/lib/tfrc.h" - -static struct ccid_operations *ccids[] = { - &ccid2_ops, -#ifdef CONFIG_IP_DCCP_CCID3 - &ccid3_ops, -#endif -}; - -static struct ccid_operations *ccid_by_number(const u8 id) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) - if (ccids[i]->ccid_id == id) - return ccids[i]; - return NULL; -} - -/* check that up to @array_len members in @ccid_array are supported */ -bool ccid_support_check(u8 const *ccid_array, u8 array_len) -{ - while (array_len > 0) - if (ccid_by_number(ccid_array[--array_len]) == NULL) - return false; - return true; -} - -/** - * ccid_get_builtin_ccids - Populate a list of built-in CCIDs - * @ccid_array: pointer to copy into - * @array_len: value to return length into - * - * This function allocates memory - caller must see that it is freed after use. - */ -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) -{ - *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any()); - if (*ccid_array == NULL) - return -ENOBUFS; - - for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1) - (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id; - return 0; -} - -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *optval, int __user *optlen) -{ - u8 *ccid_array, array_len; - int err = 0; - - if (ccid_get_builtin_ccids(&ccid_array, &array_len)) - return -ENOBUFS; - - if (put_user(array_len, optlen)) - err = -EFAULT; - else if (len > 0 && copy_to_user(optval, ccid_array, - len > array_len ? array_len : len)) - err = -EFAULT; - - kfree(ccid_array); - return err; -} - -static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) 
-{ - struct kmem_cache *slab; - va_list args; - - va_start(args, fmt); - vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args); - va_end(args); - - slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); - return slab; -} - -static void ccid_kmem_cache_destroy(struct kmem_cache *slab) -{ - kmem_cache_destroy(slab); -} - -static int __init ccid_activate(struct ccid_operations *ccid_ops) -{ - int err = -ENOBUFS; - - ccid_ops->ccid_hc_rx_slab = - ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size, - ccid_ops->ccid_hc_rx_slab_name, - "ccid%u_hc_rx_sock", - ccid_ops->ccid_id); - if (ccid_ops->ccid_hc_rx_slab == NULL) - goto out; - - ccid_ops->ccid_hc_tx_slab = - ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size, - ccid_ops->ccid_hc_tx_slab_name, - "ccid%u_hc_tx_sock", - ccid_ops->ccid_id); - if (ccid_ops->ccid_hc_tx_slab == NULL) - goto out_free_rx_slab; - - pr_info("DCCP: Activated CCID %d (%s)\n", - ccid_ops->ccid_id, ccid_ops->ccid_name); - err = 0; -out: - return err; -out_free_rx_slab: - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); - ccid_ops->ccid_hc_rx_slab = NULL; - goto out; -} - -static void ccid_deactivate(struct ccid_operations *ccid_ops) -{ - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab); - ccid_ops->ccid_hc_tx_slab = NULL; - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); - ccid_ops->ccid_hc_rx_slab = NULL; - - pr_info("DCCP: Deactivated CCID %d (%s)\n", - ccid_ops->ccid_id, ccid_ops->ccid_name); -} - -struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx) -{ - struct ccid_operations *ccid_ops = ccid_by_number(id); - struct ccid *ccid = NULL; - - if (ccid_ops == NULL) - goto out; - - ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, gfp_any()); - if (ccid == NULL) - goto out; - ccid->ccid_ops = ccid_ops; - if (rx) { - memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size); - if (ccid->ccid_ops->ccid_hc_rx_init != NULL && - ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0) - goto out_free_ccid; - } else { - memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size); - if (ccid->ccid_ops->ccid_hc_tx_init != NULL && - ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0) - goto out_free_ccid; - } -out: - return ccid; -out_free_ccid: - kmem_cache_free(rx ? 
ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, ccid); - ccid = NULL; - goto out; -} - -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk) -{ - if (ccid != NULL) { - if (ccid->ccid_ops->ccid_hc_rx_exit != NULL) - ccid->ccid_ops->ccid_hc_rx_exit(sk); - kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid); - } -} - -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk) -{ - if (ccid != NULL) { - if (ccid->ccid_ops->ccid_hc_tx_exit != NULL) - ccid->ccid_ops->ccid_hc_tx_exit(sk); - kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid); - } -} - -int __init ccid_initialize_builtins(void) -{ - int i, err = tfrc_lib_init(); - - if (err) - return err; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) { - err = ccid_activate(ccids[i]); - if (err) - goto unwind_registrations; - } - return 0; - -unwind_registrations: - while(--i >= 0) - ccid_deactivate(ccids[i]); - tfrc_lib_exit(); - return err; -} - -void ccid_cleanup_builtins(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) - ccid_deactivate(ccids[i]); - tfrc_lib_exit(); -} diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h deleted file mode 100644 index 105f3734dadbf..0000000000000 --- a/net/dccp/ccid.h +++ /dev/null @@ -1,262 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _CCID_H -#define _CCID_H -/* - * net/dccp/ccid.h - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * - * CCID infrastructure - */ - -#include <net/sock.h> -#include <linux/compiler.h> -#include <linux/dccp.h> -#include <linux/list.h> -#include <linux/module.h> - -/* maximum value for a CCID (RFC 4340, 19.5) */ -#define CCID_MAX 255 -#define CCID_SLAB_NAME_LENGTH 32 - -struct tcp_info; - -/** - * struct ccid_operations - Interface to Congestion-Control Infrastructure - * - * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) 
- * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) - * @ccid_name: alphabetical identifier string for @ccid_id - * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection - * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket - * - * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) - * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) - * @ccid_hc_rx_packet_recv: implements the HC-receiver side - * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options - * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options - * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender - * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender - * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender - * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender - * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender - */ -struct ccid_operations { - unsigned char ccid_id; - __u32 ccid_ccmps; - const char *ccid_name; - struct kmem_cache *ccid_hc_rx_slab, - *ccid_hc_tx_slab; - char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH]; - char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH]; - __u32 ccid_hc_rx_obj_size, - ccid_hc_tx_obj_size; - /* Interface Routines */ - int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); - int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); - void (*ccid_hc_rx_exit)(struct sock *sk); - void (*ccid_hc_tx_exit)(struct sock *sk); - void (*ccid_hc_rx_packet_recv)(struct sock *sk, - struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); - int (*ccid_hc_rx_insert_options)(struct sock *sk, - struct sk_buff *skb); - void (*ccid_hc_tx_packet_recv)(struct sock *sk, - struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); - int (*ccid_hc_tx_send_packet)(struct sock *sk, - struct sk_buff *skb); - void (*ccid_hc_tx_packet_sent)(struct sock *sk, - unsigned int len); - void (*ccid_hc_rx_get_info)(struct sock *sk, - struct tcp_info *info); - void (*ccid_hc_tx_get_info)(struct sock *sk, - struct tcp_info *info); - int (*ccid_hc_rx_getsockopt)(struct sock *sk, - const int optname, int len, - u32 __user *optval, - int __user *optlen); - int (*ccid_hc_tx_getsockopt)(struct sock *sk, - const int optname, int len, - u32 __user *optval, - int __user *optlen); -}; - -extern struct ccid_operations ccid2_ops; -#ifdef CONFIG_IP_DCCP_CCID3 -extern struct ccid_operations ccid3_ops; -#endif - -int ccid_initialize_builtins(void); -void ccid_cleanup_builtins(void); - -struct ccid { - struct ccid_operations *ccid_ops; - char ccid_priv[]; -}; - -static inline void *ccid_priv(const struct ccid *ccid) -{ - return (void *)ccid->ccid_priv; -} - -bool ccid_support_check(u8 const *ccid_array, u8 array_len); -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *, int __user *); - -struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); - -static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_rx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_tx_ccid; - - if (ccid 
== NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); - -/* - * Congestion control of queued data packets via CCID decision. - * - * The TX CCID performs its congestion-control by indicating whether and when a - * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). - * The following modes are supported via the symbolic constants below: - * - timer-based pacing (CCID returns a delay value in milliseconds); - * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). - */ - -enum ccid_dequeueing_decision { - CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ - CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ - CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ - CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ - CCID_PACKET_ERR = 0xF0000, /* error condition */ -}; - -static inline int ccid_packet_dequeue_eval(const int return_code) -{ - if (return_code < 0) - return CCID_PACKET_ERR; - if (return_code == 0) - return CCID_PACKET_SEND_AT_ONCE; - if (return_code <= CCID_PACKET_DELAY_MAX) - return CCID_PACKET_DELAY; - return return_code; -} - -static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return CCID_PACKET_SEND_AT_ONCE; -} - -static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - unsigned int len) -{ - if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); -} - -static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL) - ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb); -} - -static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); -} - -/** - * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver - * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) - * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) - * @val: value of @opt - * @len: length of @val in bytes - */ -static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) -{ - if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options) - return 0; - return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); -} - -/** - * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender - * Arguments are analogous to ccid_hc_tx_parse_options() - */ -static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) -{ - if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options) - return 0; - return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); -} - -static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL) - return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb); - return 0; -} - -static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, - struct tcp_info *info) -{ - if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL) - 
ccid->ccid_ops->ccid_hc_rx_get_info(sk, info); -} - -static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, - struct tcp_info *info) -{ - if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL) - ccid->ccid_ops->ccid_hc_tx_get_info(sk, info); -} - -static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, - const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - int rc = -ENOPROTOOPT; - if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) - rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, - optval, optlen); - return rc; -} - -static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, - const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - int rc = -ENOPROTOOPT; - if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, - optval, optlen); - return rc; -} -#endif /* _CCID_H */ diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig deleted file mode 100644 index e3d388c33d256..0000000000000 --- a/net/dccp/ccids/Kconfig +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menu "DCCP CCIDs Configuration" - -config IP_DCCP_CCID2_DEBUG - bool "CCID-2 debugging messages" - help - Enable CCID-2 specific debugging messages. - - The debugging output can additionally be toggled by setting the - ccid2_debug parameter to 0 or 1. - - If in doubt, say N. - -config IP_DCCP_CCID3 - bool "CCID-3 (TCP-Friendly)" - default IP_DCCP = y || IP_DCCP = m - help - CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based - rate-controlled congestion control mechanism. TFRC is designed to - be reasonably fair when competing for bandwidth with TCP-like flows, - where a flow is "reasonably fair" if its sending rate is generally - within a factor of two of the sending rate of a TCP flow under the - same conditions. However, TFRC has a much lower variation of - throughput over time compared with TCP, which makes CCID-3 more - suitable than CCID-2 for applications such as streaming media where a - relatively smooth sending rate is of importance. - - CCID-3 is further described in RFC 4342, - https://www.ietf.org/rfc/rfc4342.txt - - The TFRC congestion control algorithms were initially described in - RFC 5348. - - This text was extracted from RFC 4340 (sec. 10.2), - https://www.ietf.org/rfc/rfc4340.txt - - If in doubt, say N. - -config IP_DCCP_CCID3_DEBUG - bool "CCID-3 debugging messages" - depends on IP_DCCP_CCID3 - help - Enable CCID-3 specific debugging messages. - - The debugging output can additionally be toggled by setting the - ccid3_debug parameter to 0 or 1. - - If in doubt, say N. - -config IP_DCCP_TFRC_LIB - def_bool y if IP_DCCP_CCID3 - -config IP_DCCP_TFRC_DEBUG - def_bool y if IP_DCCP_CCID3_DEBUG -endmenu diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c deleted file mode 100644 index d6b30700af679..0000000000000 --- a/net/dccp/ccids/ccid2.c +++ /dev/null @@ -1,794 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk> - * - * Changes to meet Linux coding standards, and DCCP infrastructure fixes. - * - * Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -/* - * This implementation should follow RFC 4341 - */ -#include <linux/slab.h> -#include "../feat.h" -#include "ccid2.h" - - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -static bool ccid2_debug; -#define ccid2_pr_debug(format, a...) 
DCCP_PR_DEBUG(ccid2_debug, format, ##a) -#else -#define ccid2_pr_debug(format, a...) -#endif - -static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) -{ - struct ccid2_seq *seqp; - int i; - - /* check if we have space to preserve the pointer to the buffer */ - if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) / - sizeof(struct ccid2_seq *))) - return -ENOMEM; - - /* allocate buffer and initialize linked list */ - seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq), - gfp_any()); - if (seqp == NULL) - return -ENOMEM; - - for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) { - seqp[i].ccid2s_next = &seqp[i + 1]; - seqp[i + 1].ccid2s_prev = &seqp[i]; - } - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp; - seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - - /* This is the first allocation. Initialize the head and tail. */ - if (hc->tx_seqbufc == 0) - hc->tx_seqh = hc->tx_seqt = seqp; - else { - /* link the existing list with the one we just created */ - hc->tx_seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hc->tx_seqh; - - hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt; - } - - /* store the original pointer to the buffer so we can free it */ - hc->tx_seqbuf[hc->tx_seqbufc] = seqp; - hc->tx_seqbufc++; - - return 0; -} - -static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) - return CCID_PACKET_WILL_DEQUEUE_LATER; - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) -{ - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2); - - /* - * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from - * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always - * acceptable since this causes starvation/deadlock whenever cwnd < 2. - * The same problem arises when Ack Ratio is 0 (i.e. Ack Ratio disabled). - */ - if (val == 0 || val > max_ratio) { - DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); - val = max_ratio; - } - dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, - min_t(u32, val, DCCPF_ACK_RATIO_MAX)); -} - -static void ccid2_check_l_ack_ratio(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - /* - * After a loss, idle period, application limited period, or RTO we - * need to check that the ack ratio is still less than the congestion - * window. Otherwise, we will send an entire congestion window of - * packets and get no response because we haven't sent ack ratio - * packets yet. - * If the ack ratio does need to be reduced, we reduce it to half of - * the congestion window (or 1 if that's zero) instead of to the - * congestion window. This prevents problems if one ack is lost. - */ - if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd) - ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? 
: 1U); -} - -static void ccid2_change_l_seq_window(struct sock *sk, u64 val) -{ - dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW, - clamp_val(val, DCCPF_SEQ_WMIN, - DCCPF_SEQ_WMAX)); -} - -static void dccp_tasklet_schedule(struct sock *sk) -{ - struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet; - - if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - sock_hold(sk); - __tasklet_schedule(t); - } -} - -static void ccid2_hc_tx_rto_expire(struct timer_list *t) -{ - struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer); - struct sock *sk = hc->sk; - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5); - goto out; - } - - ccid2_pr_debug("RTO_EXPIRE\n"); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - /* back-off timer */ - hc->tx_rto <<= 1; - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; - - /* adjust pipe, cwnd etc */ - hc->tx_ssthresh = hc->tx_cwnd / 2; - if (hc->tx_ssthresh < 2) - hc->tx_ssthresh = 2; - hc->tx_cwnd = 1; - hc->tx_pipe = 0; - - /* clear state about stuff we sent */ - hc->tx_seqt = hc->tx_seqh; - hc->tx_packets_acked = 0; - - /* clear ack ratio state. */ - hc->tx_rpseq = 0; - hc->tx_rpdupack = -1; - ccid2_change_l_ack_ratio(sk, 1); - - /* if we were blocked before, we may now send cwnd=1 packet */ - if (sender_was_blocked) - dccp_tasklet_schedule(sk); - /* restart backed-off timer */ - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/* - * Congestion window validation (RFC 2861). - */ -static bool ccid2_do_cwv = true; -module_param(ccid2_do_cwv, bool, 0644); -MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); - -/** - * ccid2_update_used_window - Track how much of cwnd is actually used - * @hc: socket to update window - * @new_wnd: new window values to add into the filter - * - * This is done in addition to CWV. The sender needs to have an idea of how many - * packets may be in flight, to set the local Sequence Window value accordingly - * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the - * maximum-used window. We use an EWMA low-pass filter to filter out noise. 
- */ -static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd) -{ - hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4; -} - -/* This borrows the code of tcp_cwnd_application_limited() */ -static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - /* don't reduce cwnd below the initial window (IW) */ - u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache), - win_used = max(hc->tx_cwnd_used, init_win); - - if (win_used < hc->tx_cwnd) { - hc->tx_ssthresh = max(hc->tx_ssthresh, - (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2)); - hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1; - } - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - - ccid2_check_l_ack_ratio(sk); -} - -/* This borrows the code of tcp_cwnd_restart() */ -static void ccid2_cwnd_restart(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - u32 cwnd = hc->tx_cwnd, restart_cwnd, - iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); - s32 delta = now - hc->tx_lsndtime; - - hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); - - /* don't reduce cwnd below the initial window (IW) */ - restart_cwnd = min(cwnd, iwnd); - - while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd) - cwnd >>= 1; - hc->tx_cwnd = max(cwnd, restart_cwnd); - hc->tx_cwnd_stamp = now; - hc->tx_cwnd_used = 0; - - ccid2_check_l_ack_ratio(sk); -} - -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const u32 now = ccid2_jiffies32; - struct ccid2_seq *next; - - /* slow-start after idle periods (RFC 2581, RFC 2861) */ - if (ccid2_do_cwv && !hc->tx_pipe && - (s32)(now - hc->tx_lsndtime) >= hc->tx_rto) - ccid2_cwnd_restart(sk, now); - - hc->tx_lsndtime = now; - hc->tx_pipe += 1; - - /* see whether cwnd was fully used (RFC 2861), update expected window */ - if (ccid2_cwnd_network_limited(hc)) { - ccid2_update_used_window(hc, hc->tx_cwnd); - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - } else { - if (hc->tx_pipe > hc->tx_cwnd_used) - hc->tx_cwnd_used = hc->tx_pipe; - - ccid2_update_used_window(hc, hc->tx_cwnd_used); - - if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto) - ccid2_cwnd_application_limited(sk, now); - } - - hc->tx_seqh->ccid2s_seq = dp->dccps_gss; - hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = now; - - next = hc->tx_seqh->ccid2s_next; - /* check if we need to alloc more space */ - if (next == hc->tx_seqt) { - if (ccid2_hc_tx_alloc_seq(hc)) { - DCCP_CRIT("packet history - out of memory!"); - /* FIXME: find a more graceful way to bail out */ - return; - } - next = hc->tx_seqh->ccid2s_next; - BUG_ON(next == hc->tx_seqt); - } - hc->tx_seqh = next; - - ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe); - - /* - * FIXME: The code below is broken and the variables have been removed - * from the socket struct. The `ackloss' variable was always set to 0, - * and with arsent there are several problems: - * (i) it doesn't just count the number of Acks, but all sent packets; - * (ii) it is expressed in # of packets, not # of windows, so the - * comparison below uses the wrong formula: Appendix A of RFC 4341 - * comes up with the number K = cwnd / (R^2 - R) of consecutive windows - * of data with no lost or marked Ack packets. 
If arsent were the # of - * consecutive Acks received without loss, then Ack Ratio needs to be - * decreased by 1 when - * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2) - * where cwnd / R is the number of Acks received per window of data - * (cf. RFC 4341, App. A). The problems are that - * - arsent counts other packets as well; - * - the comparison uses a formula different from RFC 4341; - * - computing a cubic/quadratic equation each time is too complicated. - * Hence a different algorithm is needed. - */ -#if 0 - /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hc->tx_arsent++; - /* We had an ack loss in this window... */ - if (hc->tx_ackloss) { - if (hc->tx_arsent >= hc->tx_cwnd) { - hc->tx_arsent = 0; - hc->tx_ackloss = 0; - } - } else { - /* No acks lost up to now... */ - /* decrease ack ratio if enough packets were sent */ - if (dp->dccps_l_ack_ratio > 1) { - /* XXX don't calculate denominator each time */ - int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - - dp->dccps_l_ack_ratio; - - denom = hc->tx_cwnd * hc->tx_cwnd / denom; - - if (hc->tx_arsent >= denom) { - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hc->tx_arsent = 0; - } - } else { - /* we can't increase ack ratio further [1] */ - hc->tx_arsent = 0; /* or maybe set it to cwnd*/ - } - } -#endif - - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG - do { - struct ccid2_seq *seqp = hc->tx_seqt; - - while (seqp != hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", - (unsigned long long)seqp->ccid2s_seq, - seqp->ccid2s_acked, seqp->ccid2s_sent); - seqp = seqp->ccid2s_next; - } - } while (0); - ccid2_pr_debug("=========\n"); -#endif -} - -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * @sk: socket to perform estimator on - * @mrtt: measured RTT - * - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. - */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long m = mrtt ? : 1; - - if (hc->tx_srtt == 0) { - /* First measurement m */ - hc->tx_srtt = m << 3; - hc->tx_mdev = m << 1; - - hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); - hc->tx_rttvar = hc->tx_mdev_max; - - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hc->tx_srtt >> 3); - hc->tx_srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hc->tx_mdev >> 2); - /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). 
- */ - if (m > 0) - m >>= 3; - } else { - m -= (hc->tx_mdev >> 2); - } - hc->tx_mdev += m; - - if (hc->tx_mdev > hc->tx_mdev_max) { - hc->tx_mdev_max = hc->tx_mdev; - if (hc->tx_mdev_max > hc->tx_rttvar) - hc->tx_rttvar = hc->tx_mdev_max; - } - - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe. - * AWL is probably too low here, as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { - if (hc->tx_mdev_max < hc->tx_rttvar) - hc->tx_rttvar -= (hc->tx_rttvar - - hc->tx_mdev_max) >> 2; - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - hc->tx_mdev_max = tcp_rto_min(sk); - } - } - - /* - * Set RTO from SRTT and RTTVAR - * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. - * This agrees with RFC 4341, 5: - * "Because DCCP does not retransmit data, DCCP does not require - * TCP's recommended minimum timeout of one second". - */ - hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; - - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; -} - -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio; - - if (hc->tx_cwnd < dp->dccps_l_seq_win && - r_seq_used < dp->dccps_r_seq_win) { - if (hc->tx_cwnd < hc->tx_ssthresh) { - if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) { - hc->tx_cwnd += 1; - *maxincr -= 1; - hc->tx_packets_acked = 0; - } - } else if (++hc->tx_packets_acked >= hc->tx_cwnd) { - hc->tx_cwnd += 1; - hc->tx_packets_acked = 0; - } - } - - /* - * Adjust the local sequence window and the ack ratio to allow about - * 5 times the number of packets in the network (RFC 4340 7.5.2) - */ - if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2); - else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U); - - if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2); - else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2); - - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). - * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps: requires changes in other places. - */ - ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); -} - -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; - } - - hc->tx_last_cong = ccid2_jiffies32; - - hc->tx_cwnd = hc->tx_cwnd / 2 ? 
: 1U; - hc->tx_ssthresh = max(hc->tx_cwnd, 2U); - - ccid2_check_l_ack_ratio(sk); -} - -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); - } - return 0; -} - -static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - struct dccp_ackvec_parsed *avp; - u64 ackno, seqno; - struct ccid2_seq *seqp; - int done = 0; - unsigned int maxincr = 0; - - /* check reverse path congestion */ - seqno = DCCP_SKB_CB(skb)->dccpd_seq; - - /* XXX this whole "algorithm" is broken. Need to fix it to keep track - * of the seqnos of the dupacks so that rpseq and rpdupack are correct - * -sorbo. - */ - /* need to bootstrap */ - if (hc->tx_rpdupack == -1) { - hc->tx_rpdupack = 0; - hc->tx_rpseq = seqno; - } else { - /* check if packet is consecutive */ - if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1) - hc->tx_rpseq = seqno; - /* it's a later packet */ - else if (after48(seqno, hc->tx_rpseq)) { - hc->tx_rpdupack++; - - /* check if we got enough dupacks */ - if (hc->tx_rpdupack >= NUMDUPACK) { - hc->tx_rpdupack = -1; /* XXX lame */ - hc->tx_rpseq = 0; -#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__ - /* - * FIXME: Ack Congestion Control is broken; in - * the current state instabilities occurred with - * Ack Ratios greater than 1; causing hang-ups - * and long RTO timeouts. This needs to be fixed - * before opening up dynamic changes. -- gerrit - */ - ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); -#endif - } - } - } - - /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) - return; - - /* still didn't send out new data packets */ - if (hc->tx_seqh == hc->tx_seqt) - goto done; - - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hc->tx_high_ack)) - hc->tx_high_ack = ackno; - - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, ackno)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - - /* - * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2 - * packets per acknowledgement. Rounding up avoids that cwnd is not - * advanced when Ack Ratio is 1 and gives a slight edge otherwise. - */ - if (hc->tx_cwnd < hc->tx_ssthresh) - maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); - - /* go through all ack vectors */ - list_for_each_entry(avp, &hc->tx_av_chunks, node) { - /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); - - ccid2_pr_debug("ackvec %llu |%u,%u|\n", - (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); - /* if the seqno we are analyzing is larger than the - * current ackno, then move towards the tail of our - * seqnos. 
- */ - while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - /* check all seqnos in the range of the vector - * run length - */ - while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); - - /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && - !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) - ccid2_congestion_event(sk, - seqp); - else - ccid2_new_ack(sk, seqp, - &maxincr); - - seqp->ccid2s_acked = 1; - ccid2_pr_debug("Got ack for %llu\n", - (unsigned long long)seqp->ccid2s_seq); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - ackno = SUB48(ackno_end_rl, 1); - } - if (done) - break; - } - - /* The state about what is acked should be correct now - * Check for NUMDUPACK - */ - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - done = 0; - while (1) { - if (seqp->ccid2s_acked) { - done++; - if (done == NUMDUPACK) - break; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - /* If there are at least 3 acknowledgements, anything unacknowledged - * below the last sequence number is considered lost - */ - if (done == NUMDUPACK) { - struct ccid2_seq *last_acked = seqp; - - /* check for lost packets */ - while (1) { - if (!seqp->ccid2s_acked) { - ccid2_pr_debug("Packet lost: %llu\n", - (unsigned long long)seqp->ccid2s_seq); - /* XXX need to traverse from tail -> head in - * order to detect multiple congestion events in - * one ack vector. - */ - ccid2_congestion_event(sk, seqp); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - hc->tx_seqt = last_acked; - } - - /* trim acked packets in tail */ - while (hc->tx_seqt != hc->tx_seqh) { - if (!hc->tx_seqt->ccid2s_acked) - break; - - hc->tx_seqt = hc->tx_seqt->ccid2s_next; - } - - /* restart RTO timer if not all outstanding data has been acked */ - if (hc->tx_pipe == 0) - sk_stop_timer(sk, &hc->tx_rtotimer); - else - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) - dccp_tasklet_schedule(sk); - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid_priv(ccid); - struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio; - - /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hc->tx_ssthresh = ~0U; - - /* Use larger initial windows (RFC 4341, section 5). */ - hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); - hc->tx_expected_wnd = hc->tx_cwnd; - - /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); - if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) - dp->dccps_l_ack_ratio = max_ratio; - - /* XXX init ~ to window size... 
*/ - if (ccid2_hc_tx_alloc_seq(hc)) - return -ENOMEM; - - hc->tx_rto = DCCP_TIMEOUT_INIT; - hc->tx_rpdupack = -1; - hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; - hc->tx_cwnd_used = 0; - hc->sk = sk; - timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0); - INIT_LIST_HEAD(&hc->tx_av_chunks); - return 0; -} - -static void ccid2_hc_tx_exit(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - int i; - - sk_stop_timer(sk, &hc->tx_rtotimer); - - for (i = 0; i < hc->tx_seqbufc; i++) - kfree(hc->tx_seqbuf[i]); - hc->tx_seqbufc = 0; - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk); - - if (!dccp_data_packet(skb)) - return; - - if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) { - dccp_send_ack(sk); - hc->rx_num_data_pkts = 0; - } -} - -struct ccid_operations ccid2_ops = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, -}; - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -module_param(ccid2_debug, bool, 0644); -MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h deleted file mode 100644 index 330c7b4ec0013..0000000000000 --- a/net/dccp/ccids/ccid2.h +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> - */ -#ifndef _DCCP_CCID2_H_ -#define _DCCP_CCID2_H_ - -#include <linux/timer.h> -#include <linux/types.h> -#include "../ccid.h" -#include "../dccp.h" - -/* - * CCID-2 timestamping faces the same issues as TCP timestamping. - * Hence we reuse/share as much of the code as possible. - */ -#define ccid2_jiffies32 ((u32)jiffies) - -/* NUMDUPACK parameter from RFC 4341, p. 
6 */ -#define NUMDUPACK 3 - -struct ccid2_seq { - u64 ccid2s_seq; - u32 ccid2s_sent; - int ccid2s_acked; - struct ccid2_seq *ccid2s_prev; - struct ccid2_seq *ccid2s_next; -}; - -#define CCID2_SEQBUF_LEN 1024 -#define CCID2_SEQBUF_MAX 128 - -/* - * Multiple of congestion window to keep the sequence window at - * (RFC 4340 7.5.2) - */ -#define CCID2_WIN_CHANGE_FACTOR 5 - -/** - * struct ccid2_hc_tx_sock - CCID2 TX half connection - * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_srtt: smoothed RTT estimate, scaled by 2^3 - * @tx_mdev: smoothed RTT variation, scaled by 2^2 - * @tx_mdev_max: maximum of @mdev during one flight - * @tx_rttvar: moving average/maximum of @mdev_max - * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @tx_rtt_seq: to decay RTTVAR at most once per flight - * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861 - * @tx_expected_wnd: moving average of @tx_cwnd_used - * @tx_cwnd_stamp: to track idle periods in CWV - * @tx_lsndtime: last time (in jiffies) a data packet was sent - * @tx_rpseq: last consecutive seqno - * @tx_rpdupack: dupacks since rpseq - * @tx_av_chunks: list of Ack Vectors received on current skb - */ -struct ccid2_hc_tx_sock { - u32 tx_cwnd; - u32 tx_ssthresh; - u32 tx_pipe; - u32 tx_packets_acked; - struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX]; - int tx_seqbufc; - struct ccid2_seq *tx_seqh; - struct ccid2_seq *tx_seqt; - - /* RTT measurement: variables/principles are the same as in TCP */ - u32 tx_srtt, - tx_mdev, - tx_mdev_max, - tx_rttvar, - tx_rto; - u64 tx_rtt_seq:48; - struct timer_list tx_rtotimer; - struct sock *sk; - - /* Congestion Window validation (optional, RFC 2861) */ - u32 tx_cwnd_used, - tx_expected_wnd, - tx_cwnd_stamp, - tx_lsndtime; - - u64 tx_rpseq; - int tx_rpdupack; - u32 tx_last_cong; - u64 tx_high_ack; - struct list_head tx_av_chunks; -}; - -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) -{ - return hc->tx_pipe >= hc->tx_cwnd; -} - -/* - * Convert RFC 3390 larger initial window into an equivalent number of packets. - * This is based on the numbers specified in RFC 5681, 3.1. - */ -static inline u32 rfc3390_bytes_to_packets(const u32 smss) -{ - return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3); -} - -/** - * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection - * @rx_num_data_pkts: number of data packets received since last feedback - */ -struct ccid2_hc_rx_sock { - u32 rx_num_data_pkts; -}; - -static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); -} - -static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); -} -#endif /* _DCCP_CCID2_H_ */ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c deleted file mode 100644 index f349d16dd8f65..0000000000000 --- a/net/dccp/ccids/ccid3.c +++ /dev/null @@ -1,866 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. 
For further information please see https://www.wand.net.nz/ - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ -#include "../dccp.h" -#include "ccid3.h" - -#include <linux/unaligned.h> - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static bool ccid3_debug; -#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) -#else -#define ccid3_pr_debug(format, a...) -#endif - -/* - * Transmitter Half-Connection Routines - */ -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) -{ - static const char *const ccid3_state_names[] = { - [TFRC_SSTATE_NO_SENT] = "NO_SENT", - [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", - [TFRC_SSTATE_FBACK] = "FBACK", - }; - - return ccid3_state_names[state]; -} -#endif - -static void ccid3_hc_tx_set_state(struct sock *sk, - enum ccid3_hc_tx_states state) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - enum ccid3_hc_tx_states oldstate = hc->tx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_tx_state_name(oldstate), - ccid3_tx_state_name(state)); - WARN_ON(state == oldstate); - hc->tx_state = state; -} - -/* - * Compute the initial sending rate X_init in the manner of RFC 3390: - * - * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT - * - * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis - * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. - * For consistency with other parts of the code, X_init is scaled by 2^6. - */ -static inline u64 rfc3390_initial_rate(struct sock *sk) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s); - - return scaled_div(w_init << 6, hc->tx_rtt); -} - -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst - * @hc: socket to have the send interval updated - * - * This respects the granularity of X_inst (64 * bytes/second). - */ -static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) -{ - hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); - - DCCP_BUG_ON(hc->tx_t_ipi == 0); - ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, - hc->tx_s, (unsigned int)(hc->tx_x >> 6)); -} - -static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count); - - return delta / hc->tx_rtt; -} - -/** - * ccid3_hc_tx_update_x - Update allowed sending rate X - * @sk: socket to be updated - * @stamp: most recent time if available - can be left NULL. - * - * This function tracks draft rfc3448bis, check there for latest details. - * - * Note: X and X_recv are both stored in units of 64 * bytes/second, to support - * fine-grained resolution of sending rates. This requires scaling by 2^6 - * throughout the code. Only X_calc is unscaled (in bytes/second). - * - */ -static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __u64 min_rate = 2 * hc->tx_x_recv; - const __u64 old_x = hc->tx_x; - ktime_t now = stamp ? 
*stamp : ktime_get_real(); - - /* - * Handle IDLE periods: do not reduce below RFC3390 initial sending rate - * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis: - * a sender is idle if it has not sent anything over a 2-RTT-period. - * For consistency with X and X_recv, min_rate is also scaled by 2^6. - */ - if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) { - min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hc->tx_x_recv); - } - - if (hc->tx_p > 0) { - - hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate); - hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - - } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) { - - hc->tx_x = min(2 * hc->tx_x, min_rate); - hc->tx_x = max(hc->tx_x, - scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt)); - hc->tx_t_ld = now; - } - - if (hc->tx_x != old_x) { - ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " - "X_recv=%u\n", (unsigned int)(old_x >> 6), - (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6)); - - ccid3_update_send_interval(hc); - } -} - -/** - * ccid3_hc_tx_update_s - Track the mean packet size `s' - * @hc: socket to be updated - * @len: DCCP packet payload size in bytes - * - * cf. RFC 4342, 5.3 and RFC 3448, 4.1 - */ -static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len) -{ - const u16 old_s = hc->tx_s; - - hc->tx_s = tfrc_ewma(hc->tx_s, len, 9); - - if (hc->tx_s != old_s) - ccid3_update_send_interval(hc); -} - -/* - * Update Window Counter using the algorithm from [RFC 4342, 8.1]. - * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt(). - */ -static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc, - ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count), - quarter_rtts = (4 * delta) / hc->tx_rtt; - - if (quarter_rtts > 0) { - hc->tx_t_last_win_count = now; - hc->tx_last_win_count += min(quarter_rtts, 5U); - hc->tx_last_win_count &= 0xF; /* mod 16 */ - } -} - -static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t) -{ - struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer); - struct sock *sk = hc->sk; - unsigned long t_nfb = USEC_PER_SEC / 5; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - /* XXX: set some sensible MIB */ - goto restart_timer; - } - - ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, - ccid3_tx_state_name(hc->tx_state)); - - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - goto out; - - /* Reset feedback state to "no feedback received" */ - if (hc->tx_state == TFRC_SSTATE_FBACK) - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - /* - * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. 
- */ - if (hc->tx_t_rto == 0 || hc->tx_p == 0) { - - /* halve send rate directly */ - hc->tx_x = max(hc->tx_x / 2, - (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - ccid3_update_send_interval(hc); - } else { - /* - * Modify the cached value of X_recv - * - * If (X_calc > 2 * X_recv) - * X_recv = max(X_recv / 2, s / (2 * t_mbi)); - * Else - * X_recv = X_calc / 4; - * - * Note that X_recv is scaled by 2^6 while X_calc is not - */ - if (hc->tx_x_calc > (hc->tx_x_recv >> 5)) - hc->tx_x_recv = - max(hc->tx_x_recv / 2, - (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI)); - else { - hc->tx_x_recv = hc->tx_x_calc; - hc->tx_x_recv <<= 4; - } - ccid3_hc_tx_update_x(sk, NULL); - } - ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hc->tx_x); - - /* - * Set new timeout for the nofeedback timer. - * See comments in packet_recv() regarding the value of t_RTO. - */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ - t_nfb = TFRC_INITIAL_TIMEOUT; - else - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - -restart_timer: - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @sk: socket to send packet from - * @skb: next packet candidate to send on @sk - * - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. - */ -static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - ktime_t now = ktime_get_real(); - s64 delay; - - /* - * This function is called only for Data and DataAck packets. Sending - * zero-sized Data(Ack)s is theoretically possible, but for congestion - * control this case is pathological - ignore it. - */ - if (unlikely(skb->len == 0)) - return -EBADMSG; - - if (hc->tx_state == TFRC_SSTATE_NO_SENT) { - sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + - usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hc->tx_last_win_count = 0; - hc->tx_t_last_win_count = now; - - /* Set t_0 for initial packet */ - hc->tx_t_nom = now; - - hc->tx_s = skb->len; - - /* - * Use initial RTT sample when available: recommended by erratum - * to RFC 4342. This implements the initialisation procedure of - * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6. - */ - if (dp->dccps_syn_rtt) { - ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hc->tx_rtt = dp->dccps_syn_rtt; - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - } else { - /* - * Sender does not have RTT sample: - * - set fallback RTT (RFC 4340, 3.4) since a RTT value - * is needed in several parts (e.g. window counter); - * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. - */ - hc->tx_rtt = DCCP_FALLBACK_RTT; - hc->tx_x = hc->tx_s; - hc->tx_x <<= 6; - } - ccid3_update_send_interval(hc); - - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - } else { - delay = ktime_us_delta(hc->tx_t_nom, now); - ccid3_pr_debug("delay=%ld\n", (long)delay); - /* - * Scheduling of packet transmissions (RFC 5348, 8.3) - * - * if (t_now > t_nom - delta) - * // send the packet now - * else - * // send the packet in (t_nom - t_now) milliseconds. - */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; - - ccid3_hc_tx_update_win_count(hc, now); - } - - /* prepare to send now (add options etc.) 
*/ - dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count; - - /* set the nominal send time for the next following packet */ - hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - ccid3_hc_tx_update_s(hc, len); - - if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss)) - DCCP_CRIT("packet history - out of memory!"); -} - -static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; - ktime_t now; - unsigned long t_nfb; - u32 r_sample; - - /* we are only interested in ACKs */ - if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || - DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) - return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) - return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); - - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); - - /* - * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 - */ - if (hc->tx_state == TFRC_SSTATE_NO_FBACK) { - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - - if (hc->tx_t_rto == 0) { - /* - * Initial feedback packet: Larger Initial Windows (4.2) - */ - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - - ccid3_update_send_interval(hc); - - goto done_computing_x; - } else if (hc->tx_p == 0) { - /* - * First feedback after nofeedback timer expiry (4.3) - */ - goto done_computing_x; - } - } - - /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hc->tx_p > 0) - hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p); - ccid3_hc_tx_update_x(sk, &now); - -done_computing_x: - ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " - "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hc->tx_rtt, r_sample, - hc->tx_s, hc->tx_p, hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6), - (unsigned int)(hc->tx_x >> 6)); - - /* unschedule no feedback timer */ - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - - /* - * As we have calculated new ipi, delta, t_nom it is possible - * that we now can send a packet, so wake up dccp_wait_for_ccid - */ - sk->sk_write_space(sk); - - /* - * Update timeout interval for the nofeedback timer. In order to control - * rate halving on networks with very low RTTs (<= 1 ms), use per-route - * tunable RTAX_RTO_MIN value as the lower bound. 
- */ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, - USEC_PER_SEC/HZ * tcp_rto_min(sk)); - /* - * Schedule no feedback timer to expire in - * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) - */ - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - - ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " - "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); - - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -} - -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __be32 opt_val; - - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); - - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hc->tx_x_recv = opt_val; - hc->tx_x_recv <<= 6; - - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } else { - /* Update the fixpoint Loss Event Rate fraction */ - hc->tx_p = tfrc_invert_loss_event_rate(opt_val); - - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } - } - return 0; -} - -static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid_priv(ccid); - - hc->tx_state = TFRC_SSTATE_NO_SENT; - hc->tx_hist = NULL; - hc->sk = sk; - timer_setup(&hc->tx_no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, 0); - return 0; -} - -static void ccid3_hc_tx_exit(struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - tfrc_tx_hist_purge(&hc->tx_hist); -} - -static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; -} - -static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) - return -EINVAL; - memset(&tfrc, 0, sizeof(tfrc)); - tfrc.tfrctx_x = hc->tx_x; - tfrc.tfrctx_x_recv = hc->tx_x_recv; - tfrc.tfrctx_x_calc = hc->tx_x_calc; - tfrc.tfrctx_rtt = hc->tx_rtt; - tfrc.tfrctx_p = hc->tx_p; - tfrc.tfrctx_rto = hc->tx_t_rto; - tfrc.tfrctx_ipi = hc->tx_t_ipi; - len = sizeof(tfrc); - val = &tfrc; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -/* - * Receiver Half-Connection Routines - */ - -/* CCID3 feedback types */ -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) -{ - static const char *const ccid3_rx_state_names[] = { - [TFRC_RSTATE_NO_DATA] = "NO_DATA", - [TFRC_RSTATE_DATA] = "DATA", - }; - - return ccid3_rx_state_names[state]; -} -#endif - -static void ccid3_hc_rx_set_state(struct sock *sk, - enum ccid3_hc_rx_states state) -{ - struct ccid3_hc_rx_sock *hc 
= ccid3_hc_rx_sk(sk); - enum ccid3_hc_rx_states oldstate = hc->rx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_rx_state_name(oldstate), - ccid3_rx_state_name(state)); - WARN_ON(state == oldstate); - hc->rx_state = state; -} - -static void ccid3_hc_rx_send_feedback(struct sock *sk, - const struct sk_buff *skb, - enum ccid3_fback_type fbtype) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - ktime_t now = ktime_get(); - s64 delta = 0; - - switch (fbtype) { - case CCID3_FBACK_INITIAL: - hc->rx_x_recv = 0; - hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */ - break; - case CCID3_FBACK_PARAM_CHANGE: - /* - * When parameters change (new loss or p > p_prev), we do not - * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * need to reuse the previous value of X_recv. However, when - * X_recv was 0 (due to early loss), this would kill X down to - * s/t_mbi (i.e. one packet in 64 seconds). - * To avoid such drastic reduction, we approximate X_recv as - * the number of bytes since last feedback. - * This is a safe fallback, since X is bounded above by X_calc. - */ - if (hc->rx_x_recv > 0) - break; - fallthrough; - case CCID3_FBACK_PERIODIC: - delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); - break; - default: - return; - } - - ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta, - hc->rx_x_recv, hc->rx_pinv); - - hc->rx_tstamp_last_feedback = now; - hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval; - hc->rx_bytes_recv = 0; - - dp->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); -} - -static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - __be32 x_recv, pinv; - - if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) - return 0; - - if (dccp_packet_without_ack(skb)) - return 0; - - x_recv = htonl(hc->rx_x_recv); - pinv = htonl(hc->rx_pinv); - - if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE, - &pinv, sizeof(pinv)) || - dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE, - &x_recv, sizeof(x_recv))) - return -1; - - return 0; -} - -/** - * ccid3_first_li - Implements [RFC 5348, 6.3.1] - * @sk: socket to calculate loss interval for - * - * Determine the length of the first loss interval via inverse lookup. - * Assume that X_recv can be computed by the throughput equation - * s - * X_recv = -------- - * R * fval - * Find some p such that f(p) = fval; return 1/p (scaled). - */ -static u32 ccid3_first_li(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - u32 x_recv, p; - s64 delta; - u64 fval; - - if (hc->rx_rtt == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - hc->rx_rtt = DCCP_FALLBACK_RTT; - } - - delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - x_recv = scaled_div32(hc->rx_bytes_recv, delta); - if (x_recv == 0) { /* would also trigger divide-by-zero */ - DCCP_WARN("X_recv==0\n"); - if (hc->rx_x_recv == 0) { - DCCP_BUG("stored value of X_recv is zero"); - return ~0U; - } - x_recv = hc->rx_x_recv; - } - - fval = scaled_div(hc->rx_s, hc->rx_rtt); - fval = scaled_div32(fval, x_recv); - p = tfrc_calc_x_reverse_lookup(fval); - - ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " - "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - - return p == 0 ? 
~0U : scaled_div(1, p); -} - -static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; - const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; - const bool is_data_packet = dccp_data_packet(skb); - - if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) { - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - do_feedback = CCID3_FBACK_INITIAL; - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - hc->rx_s = payload; - /* - * Not necessary to update rx_bytes_recv here, - * since X_recv = 0 for the first feedback packet (cf. - * RFC 3448, 6.3) -- gerrit - */ - } - goto update_records; - } - - if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb)) - return; /* done receiving */ - - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - /* - * Update moving-average of s and the sum of received payload bytes - */ - hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9); - hc->rx_bytes_recv += payload; - } - - /* - * Perform loss detection and handle pending losses - */ - if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist, - skb, ndp, ccid3_first_li, sk)) { - do_feedback = CCID3_FBACK_PARAM_CHANGE; - goto done_receiving; - } - - if (tfrc_rx_hist_loss_pending(&hc->rx_hist)) - return; /* done receiving */ - - /* - * Handle data packets: RTT sampling and monitoring p - */ - if (unlikely(!is_data_packet)) - goto update_records; - - if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) { - const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb); - /* - * Empty loss history: no loss so far, hence p stays 0. - * Sample RTT values, since an RTT estimate is required for the - * computation of p when the first loss occurs; RFC 3448, 6.3.1. - */ - if (sample != 0) - hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9); - - } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) { - /* - * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean - * has decreased (resp. p has increased), send feedback now. 
- */ - do_feedback = CCID3_FBACK_PARAM_CHANGE; - } - - /* - * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 - */ - if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3) - do_feedback = CCID3_FBACK_PERIODIC; - -update_records: - tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp); - -done_receiving: - if (do_feedback) - ccid3_hc_rx_send_feedback(sk, skb, do_feedback); -} - -static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid_priv(ccid); - - hc->rx_state = TFRC_RSTATE_NO_DATA; - tfrc_lh_init(&hc->rx_li_hist); - return tfrc_rx_hist_alloc(&hc->rx_hist); -} - -static void ccid3_hc_rx_exit(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - - tfrc_rx_hist_purge(&hc->rx_hist); - tfrc_lh_cleanup(&hc->rx_li_hist); -} - -static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; - info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; -} - -static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct tfrc_rx_info rx_info; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_RX_INFO: - if (len < sizeof(rx_info)) - return -EINVAL; - rx_info.tfrcrx_x_recv = hc->rx_x_recv; - rx_info.tfrcrx_rtt = hc->rx_rtt; - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); - len = sizeof(rx_info); - val = &rx_info; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -struct ccid_operations ccid3_ops = { - .ccid_id = DCCPC_CCID3, - .ccid_name = "TCP-Friendly Rate Control", - .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), - .ccid_hc_tx_init = ccid3_hc_tx_init, - .ccid_hc_tx_exit = ccid3_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, - .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, - .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, - .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock), - .ccid_hc_rx_init = ccid3_hc_rx_init, - .ccid_hc_rx_exit = ccid3_hc_rx_exit, - .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, - .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, - .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, - .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, - .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt, - .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -module_param(ccid3_debug, bool, 0644); -MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h deleted file mode 100644 index 02e0fc9f6334b..0000000000000 --- a/net/dccp/ccids/ccid3.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. 
For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ -#ifndef _DCCP_CCID3_H_ -#define _DCCP_CCID3_H_ - -#include <linux/ktime.h> -#include <linux/list.h> -#include <linux/types.h> -#include <linux/tfrc.h> -#include "lib/tfrc.h" -#include "../ccid.h" - -/* Two seconds as per RFC 5348, 4.2 */ -#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) - -/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ -#define TFRC_T_MBI 64 - -/* - * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#endif - -enum ccid3_options { - TFRC_OPT_LOSS_EVENT_RATE = 192, - TFRC_OPT_LOSS_INTERVALS = 193, - TFRC_OPT_RECEIVE_RATE = 194, -}; - -/* TFRC sender states */ -enum ccid3_hc_tx_states { - TFRC_SSTATE_NO_SENT = 1, - TFRC_SSTATE_NO_FBACK, - TFRC_SSTATE_FBACK, -}; - -/** - * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket - * @tx_x: Current sending rate in 64 * bytes per second - * @tx_x_recv: Receive rate in 64 * bytes per second - * @tx_x_calc: Calculated rate in bytes per second - * @tx_rtt: Estimate of current round trip time in usecs - * @tx_p: Current loss event rate (0-1) scaled by 1000000 - * @tx_s: Packet size in bytes - * @tx_t_rto: Nofeedback Timer setting in usecs - * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @tx_state: Sender state, one of %ccid3_hc_tx_states - * @tx_last_win_count: Last window counter sent - * @tx_t_last_win_count: Timestamp of earliest packet - * with last_win_count value sent - * @tx_no_feedback_timer: Handle to no feedback timer - * @tx_t_ld: Time last doubled during slow start - * @tx_t_nom: Nominal send time of next packet - * @tx_hist: Packet history - */ -struct ccid3_hc_tx_sock { - u64 tx_x; - u64 tx_x_recv; - u32 tx_x_calc; - u32 tx_rtt; - u32 tx_p; - u32 tx_t_rto; - u32 tx_t_ipi; - u16 tx_s; - enum ccid3_hc_tx_states tx_state:8; - u8 tx_last_win_count; - ktime_t tx_t_last_win_count; - struct timer_list tx_no_feedback_timer; - struct sock *sk; - ktime_t tx_t_ld; - ktime_t tx_t_nom; - struct tfrc_tx_hist_entry *tx_hist; -}; - -static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) -{ - struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); - BUG_ON(hctx == NULL); - return hctx; -} - -/* TFRC receiver states */ -enum ccid3_hc_rx_states { - TFRC_RSTATE_NO_DATA = 1, - TFRC_RSTATE_DATA, -}; - -/** - * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) - * @rx_state: Receiver state, one of %ccid3_hc_rx_states - * @rx_bytes_recv: Total sum of DCCP 
payload bytes - * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @rx_rtt: Receiver estimate of RTT - * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_hist: Packet history (loss detection + RTT sampling) - * @rx_li_hist: Loss Interval database - * @rx_s: Received packet size in bytes - * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5) - */ -struct ccid3_hc_rx_sock { - u8 rx_last_counter:4; - enum ccid3_hc_rx_states rx_state:8; - u32 rx_bytes_recv; - u32 rx_x_recv; - u32 rx_rtt; - ktime_t rx_tstamp_last_feedback; - struct tfrc_rx_hist rx_hist; - struct tfrc_loss_hist rx_li_hist; - u16 rx_s; -#define rx_pinv rx_li_hist.i_mean -}; - -static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) -{ - struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); - BUG_ON(hcrx == NULL); - return hcrx; -} - -#endif /* _DCCP_CCID3_H_ */ diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c deleted file mode 100644 index da95319842bbe..0000000000000 --- a/net/dccp/ccids/lib/loss_interval.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ -#include <net/sock.h> -#include "tfrc.h" - -static struct kmem_cache *tfrc_lh_slab __read_mostly; -/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ -static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; - -/* implements LIFO semantics on the array */ -static inline u8 LIH_INDEX(const u8 ctr) -{ - return LIH_SIZE - 1 - (ctr % LIH_SIZE); -} - -/* the `counter' index always points at the next entry to be populated */ -static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh) -{ - return lh->counter ? 
lh->ring[LIH_INDEX(lh->counter - 1)] : NULL; -} - -/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */ -static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i) -{ - BUG_ON(i >= lh->counter); - return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length; -} - -/* - * On-demand allocation and de-allocation of entries - */ -static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh) -{ - if (lh->ring[LIH_INDEX(lh->counter)] == NULL) - lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab, - GFP_ATOMIC); - return lh->ring[LIH_INDEX(lh->counter)]; -} - -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) -{ - if (!tfrc_lh_is_initialised(lh)) - return; - - for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++) - if (lh->ring[LIH_INDEX(lh->counter)] != NULL) { - kmem_cache_free(tfrc_lh_slab, - lh->ring[LIH_INDEX(lh->counter)]); - lh->ring[LIH_INDEX(lh->counter)] = NULL; - } -} - -static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) -{ - u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; - int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ - - if (k <= 0) - return; - - for (i = 0; i <= k; i++) { - i_i = tfrc_lh_get_interval(lh, i); - - if (i < k) { - i_tot0 += i_i * tfrc_lh_weights[i]; - w_tot += tfrc_lh_weights[i]; - } - if (i > 0) - i_tot1 += i_i * tfrc_lh_weights[i-1]; - } - - lh->i_mean = max(i_tot0, i_tot1) / w_tot; -} - -/** - * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * @lh: histogram to update - * @skb: received socket triggering loss interval update - * - * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev - */ -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); - u32 old_i_mean = lh->i_mean; - s64 len; - - if (cur == NULL) /* not initialised */ - return 0; - - len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; - - if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return 0; - - if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) - /* - * Implements RFC 4342, 10.2: - * If a packet S (skb) exists whose seqno comes `after' the one - * starting the current loss interval (cur) and if the modulo-16 - * distance from C(cur) to C(S) is greater than 4, consider all - * subsequent packets as belonging to a new loss interval. This - * test is necessary since CCVal may wrap between intervals. - */ - cur->li_is_closed = 1; - - if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return 0; - - cur->li_length = len; - tfrc_lh_calc_i_mean(lh); - - return lh->i_mean < old_i_mean; -} - -/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ -static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, - struct tfrc_rx_hist_entry *new_loss) -{ - return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 && - (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4); -} - -/** - * tfrc_lh_interval_add - Insert new record into the Loss Interval database - * @lh: Loss Interval database - * @rh: Receive history containing a fresh loss event - * @calc_first_li: Caller-dependent routine to compute length of first interval - * @sk: Used by @calc_first_li in caller-specific way (subtyping) - * - * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 
- */ -int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; - - if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return 0; - - new = tfrc_lh_demand_next(lh); - if (unlikely(new == NULL)) { - DCCP_CRIT("Cannot allocate/add loss record."); - return 0; - } - - new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; - new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval; - new->li_is_closed = 0; - - if (++lh->counter == 1) - lh->i_mean = new->li_length = (*calc_first_li)(sk); - else { - cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno); - new->li_length = dccp_delta_seqno(new->li_seqno, - tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1; - if (lh->counter > (2*LIH_SIZE)) - lh->counter -= LIH_SIZE; - - tfrc_lh_calc_i_mean(lh); - } - return 1; -} - -int __init tfrc_li_init(void) -{ - tfrc_lh_slab = kmem_cache_create("tfrc_li_hist", - sizeof(struct tfrc_loss_interval), 0, - SLAB_HWCACHE_ALIGN, NULL); - return tfrc_lh_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_li_exit(void) -{ - if (tfrc_lh_slab != NULL) { - kmem_cache_destroy(tfrc_lh_slab); - tfrc_lh_slab = NULL; - } -} diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h deleted file mode 100644 index c3d95f85e43b1..0000000000000 --- a/net/dccp/ccids/lib/loss_interval.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _DCCP_LI_HIST_ -#define _DCCP_LI_HIST_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ -#include <linux/ktime.h> -#include <linux/list.h> -#include <linux/slab.h> - -/* - * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than - * NINTERVAL, since the `open' interval I_0 is always stored as the first entry. 
- */ -#define NINTERVAL 8 -#define LIH_SIZE (NINTERVAL + 1) - -/** - * tfrc_loss_interval - Loss history record for TFRC-based protocols - * @li_seqno: Highest received seqno before the start of loss - * @li_ccval: The CCVal belonging to @li_seqno - * @li_is_closed: Whether @li_seqno is older than 1 RTT - * @li_length: Loss interval sequence length - */ -struct tfrc_loss_interval { - u64 li_seqno:48, - li_ccval:4, - li_is_closed:1; - u32 li_length; -}; - -/** - * tfrc_loss_hist - Loss record database - * @ring: Circular queue managed in LIFO manner - * @counter: Current count of entries (can be more than %LIH_SIZE) - * @i_mean: Current Average Loss Interval [RFC 3448, 5.4] - */ -struct tfrc_loss_hist { - struct tfrc_loss_interval *ring[LIH_SIZE]; - u8 counter; - u32 i_mean; -}; - -static inline void tfrc_lh_init(struct tfrc_loss_hist *lh) -{ - memset(lh, 0, sizeof(struct tfrc_loss_hist)); -} - -static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh) -{ - return lh->counter > 0; -} - -static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) -{ - return min(lh->counter, (u8)LIH_SIZE); -} - -struct tfrc_rx_hist; - -int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, - u32 (*first_li)(struct sock *), struct sock *); -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); - -#endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c deleted file mode 100644 index 0cdda3c66fb5e..0000000000000 --- a/net/dccp/ccids/lib/packet_history.c +++ /dev/null @@ -1,439 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/string.h> -#include <linux/slab.h> -#include "packet_history.h" -#include "../../dccp.h" - -/* - * Transmitter History Routines - */ -static struct kmem_cache *tfrc_tx_hist_slab; - -int __init tfrc_tx_packet_history_init(void) -{ - tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist", - sizeof(struct tfrc_tx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_tx_hist_slab == NULL ? 
-ENOBUFS : 0; -} - -void tfrc_tx_packet_history_exit(void) -{ - if (tfrc_tx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_tx_hist_slab); - tfrc_tx_hist_slab = NULL; - } -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) -{ - struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); - - if (entry == NULL) - return -ENOBUFS; - entry->seqno = seqno; - entry->stamp = ktime_get_real(); - entry->next = *headp; - *headp = entry; - return 0; -} - -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) -{ - struct tfrc_tx_hist_entry *head = *headp; - - while (head != NULL) { - struct tfrc_tx_hist_entry *next = head->next; - - kmem_cache_free(tfrc_tx_hist_slab, head); - head = next; - } - - *headp = NULL; -} - -/* - * Receiver History Routines - */ -static struct kmem_cache *tfrc_rx_hist_slab; - -int __init tfrc_rx_packet_history_init(void) -{ - tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache", - sizeof(struct tfrc_rx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_rx_packet_history_exit(void) -{ - if (tfrc_rx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_rx_hist_slab); - tfrc_rx_hist_slab = NULL; - } -} - -static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry, - const struct sk_buff *skb, - const u64 ndp) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - - entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->tfrchrx_ccval = dh->dccph_ccval; - entry->tfrchrx_type = dh->dccph_type; - entry->tfrchrx_ndp = ndp; - entry->tfrchrx_tstamp = ktime_get_real(); -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, - const struct sk_buff *skb, - const u64 ndp) -{ - struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h); - - tfrc_rx_hist_entry_from_skb(entry, skb, ndp); -} - -/* has the packet contained in skb been seen before? */ -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) -{ - const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq; - int i; - - if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0) - return 1; - - for (i = 1; i <= h->loss_count; i++) - if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq) - return 1; - - return 0; -} - -static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - const u8 idx_a = tfrc_rx_hist_index(h, a), - idx_b = tfrc_rx_hist_index(h, b); - - swap(h->ring[idx_a], h->ring[idx_b]); -} - -/* - * Private helper functions for loss detection. - * - * In the descriptions, `Si' refers to the sequence number of entry number i, - * whose NDP count is `Ni' (lower case is used for variables). - * Note: All __xxx_loss functions expect that a test against duplicates has been - * performed already: the seqno of the skb must not be less than the seqno - * of loss_prev; and it must not equal that of any valid history entry. 
- */ -static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ - h->loss_count = 1; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); - } -} - -static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */ - h->loss_count = 2; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2); - return; - } - - /* S0 < S2 < S1 */ - - if (dccp_loss_free(s0, s2, n2)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s2, s1, n1)) { - /* hole is filled: S0, S2, and S1 are consecutive */ - h->loss_count = 0; - h->loss_start = tfrc_rx_hist_index(h, 1); - } else - /* gap between S2 and S1: just update loss_prev */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); - - } else { /* gap between S0 and S2 */ - /* - * Reorder history to insert S2 between S0 and S1 - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2); - h->loss_count = 2; - } -} - -/* return 1 if a new loss event has been identified */ -static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */ - h->loss_count = 3; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3); - return 1; - } - - /* S3 < S2 */ - - if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */ - /* - * Reorder history to insert S3 between S1 and S2 - */ - tfrc_rx_hist_swap(h, 2, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3); - h->loss_count = 3; - return 1; - } - - /* S0 < S3 < S1 */ - - if (dccp_loss_free(s0, s3, n3)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s3, s1, n1)) { - /* hole between S0 and S1 filled by S3 */ - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - /* entire hole filled by S0, S3, S1, S2 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 0; - } else { - /* gap remains between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 1; - } - - } else /* gap exists between S3 and S1, loss_count stays at 2 */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3); - - return 0; - } - - /* - * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3 - * Reorder history to insert S3 between S0 and S1. - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3); - h->loss_count = 3; - - return 1; -} - -/* recycle RX history records to continue loss detection if necessary */ -static void __three_after_loss(struct tfrc_rx_hist *h) -{ - /* - * At this stage we know already that there is a gap between S0 and S1 - * (since S0 was the highest sequence number received before detecting - * the loss). To recycle the loss record, it is thus only necessary to - * check for other possible gaps between S1/S2 and between S2/S3. 
- */ - u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno; - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp, - n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - - if (dccp_loss_free(s2, s3, n3)) { - /* no gap between S2 and S3: entire hole is filled */ - h->loss_start = tfrc_rx_hist_index(h, 3); - h->loss_count = 0; - } else { - /* gap between S2 and S3 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 1; - } - - } else { /* gap between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 2; - } -} - -/** - * tfrc_rx_handle_loss - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @calc_first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) - * - * Chooses action according to pending loss, updates LI database when a new - * loss was detected, and does required post-processing. Returns 1 when caller - * should send feedback, 0 otherwise. - * Since it also takes care of reordering during loss detection and updates the - * records accordingly, the caller should not perform any more RX history - * operations when loss_count is greater than 0 after calling this function. - */ -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - int is_new_loss = 0; - - if (h->loss_count == 0) { - __do_track_loss(h, skb, ndp); - } else if (h->loss_count == 1) { - __one_after_loss(h, skb, ndp); - } else if (h->loss_count != 2) { - DCCP_BUG("invalid loss_count %d", h->loss_count); - } else if (__two_after_loss(h, skb, ndp)) { - /* - * Update Loss Interval database and recycle RX records - */ - is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); - __three_after_loss(h); - } - return is_new_loss; -} - -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) - goto out_free; - } - - h->loss_count = h->loss_start = 0; - return 0; - -out_free: - while (i-- != 0) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } - return -ENOBUFS; -} - -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; ++i) - if (h->ring[i] != NULL) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } -} - -/** - * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) -{ - return h->ring[0]; -} - -/** - * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) -{ - return h->ring[h->rtt_sample_prev]; -} - -/** - * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * @h: receive histogram - * @skb: packet containing timestamp. - * - * Based on ideas presented in RFC 4342, 8.1. 
Returns 0 if it was not able - * to compute a sample with given data - calling function should check this. - */ -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) -{ - u32 sample = 0, - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - - if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ - if (h->rtt_sample_prev == 2) { /* previous candidate stored */ - sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - if (sample) - sample = 4 / sample * - ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); - else /* - * FIXME: This condition is in principle not - * possible but occurs when CCID is used for - * two-way data traffic. I have tried to trace - * it, but the cause does not seem to be here. - */ - DCCP_BUG("please report to dccp@vger.kernel.org" - " => prev = %u, last = %u", - tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - } else if (delta_v < 1) { - h->rtt_sample_prev = 1; - goto keep_ref_for_next_time; - } - - } else if (delta_v == 4) /* optimal match */ - sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); - else { /* suboptimal match */ - h->rtt_sample_prev = 2; - goto keep_ref_for_next_time; - } - - if (unlikely(sample > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %u too large, using max\n", sample); - sample = DCCP_SANE_RTT_MAX; - } - - h->rtt_sample_prev = 0; /* use current entry as next reference */ -keep_ref_for_next_time: - - return sample; -} diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h deleted file mode 100644 index 159cc9326eabc..0000000000000 --- a/net/dccp/ccids/lib/packet_history.h +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Packet RX/TX history data structures and routines for TFRC-based protocols. - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. 
- * - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#ifndef _DCCP_PKT_HIST_ -#define _DCCP_PKT_HIST_ - -#include <linux/list.h> -#include <linux/slab.h> -#include "tfrc.h" - -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); - -/* Subtraction a-b modulo-16, respects circular wrap-around */ -#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) - -/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */ -#define TFRC_NDUPACK 3 - -/** - * tfrc_rx_hist_entry - Store information about a single received packet - * @tfrchrx_seqno: DCCP packet sequence number - * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1) - * @tfrchrx_ndp: the NDP count (if any) of the packet - * @tfrchrx_tstamp: actual receive time of packet - */ -struct tfrc_rx_hist_entry { - u64 tfrchrx_seqno:48, - tfrchrx_ccval:4, - tfrchrx_type:4; - u64 tfrchrx_ndp:48; - ktime_t tfrchrx_tstamp; -}; - -/** - * tfrc_rx_hist - RX history structure for TFRC-based protocols - * @ring: Packet history for RTT sampling and loss detection - * @loss_count: Number of entries in circular history - * @loss_start: Movable index (for loss detection) - * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - */ -struct tfrc_rx_hist { - struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; - u8 loss_count:2, - loss_start:2; -#define rtt_sample_prev loss_start -}; - -/** - * tfrc_rx_hist_index - index to reach n-th entry after loss_start - */ -static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) -{ - return (h->loss_start + n) & TFRC_NDUPACK; -} - -/** - * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) -{ - return h->ring[tfrc_rx_hist_index(h, h->loss_count)]; -} - -/** - * tfrc_rx_hist_entry - return the n-th history entry after loss_start - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) -{ - return h->ring[tfrc_rx_hist_index(h, n)]; -} - -/** - * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) -{ - return h->ring[h->loss_start]; -} - -/* indicate whether previously a packet was detected missing */ -static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) -{ - return h->loss_count > 0; -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, - const u64 ndp); - -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); - -struct tfrc_loss_hist; -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), struct sock *sk); -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb); -int 
tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); - -#endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c deleted file mode 100644 index d7f265e1f50c8..0000000000000 --- a/net/dccp/ccids/lib/tfrc.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * TFRC library initialisation - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> - */ -#include <linux/moduleparam.h> -#include "tfrc.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -bool tfrc_debug; -module_param(tfrc_debug, bool, 0644); -MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); -#endif - -int __init tfrc_lib_init(void) -{ - int rc = tfrc_li_init(); - - if (rc) - goto out; - - rc = tfrc_tx_packet_history_init(); - if (rc) - goto out_free_loss_intervals; - - rc = tfrc_rx_packet_history_init(); - if (rc) - goto out_free_tx_history; - return 0; - -out_free_tx_history: - tfrc_tx_packet_history_exit(); -out_free_loss_intervals: - tfrc_li_exit(); -out: - return rc; -} - -void tfrc_lib_exit(void) -{ - tfrc_rx_packet_history_exit(); - tfrc_tx_packet_history_exit(); - tfrc_li_exit(); -} diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h deleted file mode 100644 index 0a63e8750cc5b..0000000000000 --- a/net/dccp/ccids/lib/tfrc.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _TFRC_H_ -#define _TFRC_H_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include <linux/types.h> -#include <linux/math64.h> -#include "../../dccp.h" - -/* internal includes that this library exports: */ -#include "loss_interval.h" -#include "packet_history.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -extern bool tfrc_debug; -#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) -#else -#define tfrc_pr_debug(format, a...) -#endif - -/* integer-arithmetic divisions of type (a * 1000000)/b */ -static inline u64 scaled_div(u64 a, u64 b) -{ - BUG_ON(b == 0); - return div64_u64(a * 1000000, b); -} - -static inline u32 scaled_div32(u64 a, u64 b) -{ - u64 result = scaled_div(a, b); - - if (result > UINT_MAX) { - DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX", - (unsigned long long)a, (unsigned long long)b); - return UINT_MAX; - } - return result; -} - -/** - * tfrc_ewma - Exponentially weighted moving average - * @weight: Weight to be used as damping factor, in units of 1/10 - */ -static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) -{ - return avg ? 
(weight * avg + (10 - weight) * newval) / 10 : newval; -} - -u32 tfrc_calc_x(u16 s, u32 R, u32 p); -u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); - -int tfrc_tx_packet_history_init(void); -void tfrc_tx_packet_history_exit(void); -int tfrc_rx_packet_history_init(void); -void tfrc_rx_packet_history_exit(void); - -int tfrc_li_init(void); -void tfrc_li_exit(void); - -#ifdef CONFIG_IP_DCCP_TFRC_LIB -int tfrc_lib_init(void); -void tfrc_lib_exit(void); -#else -#define tfrc_lib_init() (0) -#define tfrc_lib_exit() -#endif -#endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c deleted file mode 100644 index 92a8c6bea3167..0000000000000 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ /dev/null @@ -1,702 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ - -#include <linux/module.h> -#include "../../dccp.h" -#include "tfrc.h" - -#define TFRC_CALC_X_ARRSIZE 500 -#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ -#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) - -/* - TFRC TCP Reno Throughput Equation Lookup Table for f(p) - - The following two-column lookup table implements a part of the TCP throughput - equation from [RFC 3448, sec. 3.1]: - - s - X_calc = -------------------------------------------------------------- - R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) - - Where: - X is the transmit rate in bytes/second - s is the packet size in bytes - R is the round trip time in seconds - p is the loss event rate, between 0 and 1.0, of the number of loss - events as a fraction of the number of packets transmitted - t_RTO is the TCP retransmission timeout value in seconds - b is the number of packets acknowledged by a single TCP ACK - - We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: - - s - X_calc = ------------------------------------------------------- - R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) - - which we can break down into: - - s - X_calc = --------- - R * f(p) - - where f(p) is given for 0 < p <= 1 by: - - f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) - - Since this is kernel code, floating-point arithmetic is avoided in favour of - integer arithmetic. This means that nearly all fractional parameters are - scaled by 1000000: - * the parameters p and R - * the return result f(p) - The lookup table therefore actually tabulates the following function g(q): - - g(q) = 1000000 * f(q/1000000) - - Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer - granularity for the practically more relevant case of small values of p (up to - 5%), the second column is used; the first one ranges up to 100%. This split - corresponds to the value of q = TFRC_CALC_X_SPLIT. 
At the same time this also - determines the smallest resolution possible with this lookup table: - - TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE - - The entire table is generated by: - for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { - lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); - lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); - } - - With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, - lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) - lookup[M][0] = g(1000000) = 1000000 * f(100%) - lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) - lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) - - In summary, the two columns represent f(p) for the following ranges: - * The first column is for 0.002 <= p <= 1.0 - * The second column is for 0.0001 <= p <= 0.05 - Where the columns overlap, the second (finer-grained) is given preference, - i.e. the first column is used only for p >= 0.05. - */ -static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { - { 37172, 8172 }, - { 53499, 11567 }, - { 66664, 14180 }, - { 78298, 16388 }, - { 89021, 18339 }, - { 99147, 20108 }, - { 108858, 21738 }, - { 118273, 23260 }, - { 127474, 24693 }, - { 136520, 26052 }, - { 145456, 27348 }, - { 154316, 28589 }, - { 163130, 29783 }, - { 171919, 30935 }, - { 180704, 32049 }, - { 189502, 33130 }, - { 198328, 34180 }, - { 207194, 35202 }, - { 216114, 36198 }, - { 225097, 37172 }, - { 234153, 38123 }, - { 243294, 39055 }, - { 252527, 39968 }, - { 261861, 40864 }, - { 271305, 41743 }, - { 280866, 42607 }, - { 290553, 43457 }, - { 300372, 44293 }, - { 310333, 45117 }, - { 320441, 45929 }, - { 330705, 46729 }, - { 341131, 47518 }, - { 351728, 48297 }, - { 362501, 49066 }, - { 373460, 49826 }, - { 384609, 50577 }, - { 395958, 51320 }, - { 407513, 52054 }, - { 419281, 52780 }, - { 431270, 53499 }, - { 443487, 54211 }, - { 455940, 54916 }, - { 468635, 55614 }, - { 481581, 56306 }, - { 494785, 56991 }, - { 508254, 57671 }, - { 521996, 58345 }, - { 536019, 59014 }, - { 550331, 59677 }, - { 564939, 60335 }, - { 579851, 60988 }, - { 595075, 61636 }, - { 610619, 62279 }, - { 626491, 62918 }, - { 642700, 63553 }, - { 659253, 64183 }, - { 676158, 64809 }, - { 693424, 65431 }, - { 711060, 66050 }, - { 729073, 66664 }, - { 747472, 67275 }, - { 766266, 67882 }, - { 785464, 68486 }, - { 805073, 69087 }, - { 825103, 69684 }, - { 845562, 70278 }, - { 866460, 70868 }, - { 887805, 71456 }, - { 909606, 72041 }, - { 931873, 72623 }, - { 954614, 73202 }, - { 977839, 73778 }, - { 1001557, 74352 }, - { 1025777, 74923 }, - { 1050508, 75492 }, - { 1075761, 76058 }, - { 1101544, 76621 }, - { 1127867, 77183 }, - { 1154739, 77741 }, - { 1182172, 78298 }, - { 1210173, 78852 }, - { 1238753, 79405 }, - { 1267922, 79955 }, - { 1297689, 80503 }, - { 1328066, 81049 }, - { 1359060, 81593 }, - { 1390684, 82135 }, - { 1422947, 82675 }, - { 1455859, 83213 }, - { 1489430, 83750 }, - { 1523671, 84284 }, - { 1558593, 84817 }, - { 1594205, 85348 }, - { 1630518, 85878 }, - { 1667543, 86406 }, - { 1705290, 86932 }, - { 1743770, 87457 }, - { 1782994, 87980 }, - { 1822973, 88501 }, - { 1863717, 89021 }, - { 1905237, 89540 }, - { 1947545, 90057 }, - { 1990650, 90573 }, - { 2034566, 91087 }, - { 2079301, 91600 }, - { 2124869, 92111 }, - { 2171279, 92622 }, - { 2218543, 93131 }, - { 2266673, 93639 }, - { 2315680, 94145 }, - { 2365575, 94650 }, - { 2416371, 95154 }, - { 2468077, 95657 }, - { 2520707, 96159 }, - { 2574271, 96660 }, - { 2628782, 97159 }, - { 2684250, 97658 }, - { 2740689, 98155 }, - { 2798110, 
98651 }, - { 2856524, 99147 }, - { 2915944, 99641 }, - { 2976382, 100134 }, - { 3037850, 100626 }, - { 3100360, 101117 }, - { 3163924, 101608 }, - { 3228554, 102097 }, - { 3294263, 102586 }, - { 3361063, 103073 }, - { 3428966, 103560 }, - { 3497984, 104045 }, - { 3568131, 104530 }, - { 3639419, 105014 }, - { 3711860, 105498 }, - { 3785467, 105980 }, - { 3860253, 106462 }, - { 3936229, 106942 }, - { 4013410, 107422 }, - { 4091808, 107902 }, - { 4171435, 108380 }, - { 4252306, 108858 }, - { 4334431, 109335 }, - { 4417825, 109811 }, - { 4502501, 110287 }, - { 4588472, 110762 }, - { 4675750, 111236 }, - { 4764349, 111709 }, - { 4854283, 112182 }, - { 4945564, 112654 }, - { 5038206, 113126 }, - { 5132223, 113597 }, - { 5227627, 114067 }, - { 5324432, 114537 }, - { 5422652, 115006 }, - { 5522299, 115474 }, - { 5623389, 115942 }, - { 5725934, 116409 }, - { 5829948, 116876 }, - { 5935446, 117342 }, - { 6042439, 117808 }, - { 6150943, 118273 }, - { 6260972, 118738 }, - { 6372538, 119202 }, - { 6485657, 119665 }, - { 6600342, 120128 }, - { 6716607, 120591 }, - { 6834467, 121053 }, - { 6953935, 121514 }, - { 7075025, 121976 }, - { 7197752, 122436 }, - { 7322131, 122896 }, - { 7448175, 123356 }, - { 7575898, 123815 }, - { 7705316, 124274 }, - { 7836442, 124733 }, - { 7969291, 125191 }, - { 8103877, 125648 }, - { 8240216, 126105 }, - { 8378321, 126562 }, - { 8518208, 127018 }, - { 8659890, 127474 }, - { 8803384, 127930 }, - { 8948702, 128385 }, - { 9095861, 128840 }, - { 9244875, 129294 }, - { 9395760, 129748 }, - { 9548529, 130202 }, - { 9703198, 130655 }, - { 9859782, 131108 }, - { 10018296, 131561 }, - { 10178755, 132014 }, - { 10341174, 132466 }, - { 10505569, 132917 }, - { 10671954, 133369 }, - { 10840345, 133820 }, - { 11010757, 134271 }, - { 11183206, 134721 }, - { 11357706, 135171 }, - { 11534274, 135621 }, - { 11712924, 136071 }, - { 11893673, 136520 }, - { 12076536, 136969 }, - { 12261527, 137418 }, - { 12448664, 137867 }, - { 12637961, 138315 }, - { 12829435, 138763 }, - { 13023101, 139211 }, - { 13218974, 139658 }, - { 13417071, 140106 }, - { 13617407, 140553 }, - { 13819999, 140999 }, - { 14024862, 141446 }, - { 14232012, 141892 }, - { 14441465, 142339 }, - { 14653238, 142785 }, - { 14867346, 143230 }, - { 15083805, 143676 }, - { 15302632, 144121 }, - { 15523842, 144566 }, - { 15747453, 145011 }, - { 15973479, 145456 }, - { 16201939, 145900 }, - { 16432847, 146345 }, - { 16666221, 146789 }, - { 16902076, 147233 }, - { 17140429, 147677 }, - { 17381297, 148121 }, - { 17624696, 148564 }, - { 17870643, 149007 }, - { 18119154, 149451 }, - { 18370247, 149894 }, - { 18623936, 150336 }, - { 18880241, 150779 }, - { 19139176, 151222 }, - { 19400759, 151664 }, - { 19665007, 152107 }, - { 19931936, 152549 }, - { 20201564, 152991 }, - { 20473907, 153433 }, - { 20748982, 153875 }, - { 21026807, 154316 }, - { 21307399, 154758 }, - { 21590773, 155199 }, - { 21876949, 155641 }, - { 22165941, 156082 }, - { 22457769, 156523 }, - { 22752449, 156964 }, - { 23049999, 157405 }, - { 23350435, 157846 }, - { 23653774, 158287 }, - { 23960036, 158727 }, - { 24269236, 159168 }, - { 24581392, 159608 }, - { 24896521, 160049 }, - { 25214642, 160489 }, - { 25535772, 160929 }, - { 25859927, 161370 }, - { 26187127, 161810 }, - { 26517388, 162250 }, - { 26850728, 162690 }, - { 27187165, 163130 }, - { 27526716, 163569 }, - { 27869400, 164009 }, - { 28215234, 164449 }, - { 28564236, 164889 }, - { 28916423, 165328 }, - { 29271815, 165768 }, - { 29630428, 166208 }, - { 29992281, 166647 }, - { 30357392, 167087 }, - { 30725779, 
167526 }, - { 31097459, 167965 }, - { 31472452, 168405 }, - { 31850774, 168844 }, - { 32232445, 169283 }, - { 32617482, 169723 }, - { 33005904, 170162 }, - { 33397730, 170601 }, - { 33792976, 171041 }, - { 34191663, 171480 }, - { 34593807, 171919 }, - { 34999428, 172358 }, - { 35408544, 172797 }, - { 35821174, 173237 }, - { 36237335, 173676 }, - { 36657047, 174115 }, - { 37080329, 174554 }, - { 37507197, 174993 }, - { 37937673, 175433 }, - { 38371773, 175872 }, - { 38809517, 176311 }, - { 39250924, 176750 }, - { 39696012, 177190 }, - { 40144800, 177629 }, - { 40597308, 178068 }, - { 41053553, 178507 }, - { 41513554, 178947 }, - { 41977332, 179386 }, - { 42444904, 179825 }, - { 42916290, 180265 }, - { 43391509, 180704 }, - { 43870579, 181144 }, - { 44353520, 181583 }, - { 44840352, 182023 }, - { 45331092, 182462 }, - { 45825761, 182902 }, - { 46324378, 183342 }, - { 46826961, 183781 }, - { 47333531, 184221 }, - { 47844106, 184661 }, - { 48358706, 185101 }, - { 48877350, 185541 }, - { 49400058, 185981 }, - { 49926849, 186421 }, - { 50457743, 186861 }, - { 50992759, 187301 }, - { 51531916, 187741 }, - { 52075235, 188181 }, - { 52622735, 188622 }, - { 53174435, 189062 }, - { 53730355, 189502 }, - { 54290515, 189943 }, - { 54854935, 190383 }, - { 55423634, 190824 }, - { 55996633, 191265 }, - { 56573950, 191706 }, - { 57155606, 192146 }, - { 57741621, 192587 }, - { 58332014, 193028 }, - { 58926806, 193470 }, - { 59526017, 193911 }, - { 60129666, 194352 }, - { 60737774, 194793 }, - { 61350361, 195235 }, - { 61967446, 195677 }, - { 62589050, 196118 }, - { 63215194, 196560 }, - { 63845897, 197002 }, - { 64481179, 197444 }, - { 65121061, 197886 }, - { 65765563, 198328 }, - { 66414705, 198770 }, - { 67068508, 199213 }, - { 67726992, 199655 }, - { 68390177, 200098 }, - { 69058085, 200540 }, - { 69730735, 200983 }, - { 70408147, 201426 }, - { 71090343, 201869 }, - { 71777343, 202312 }, - { 72469168, 202755 }, - { 73165837, 203199 }, - { 73867373, 203642 }, - { 74573795, 204086 }, - { 75285124, 204529 }, - { 76001380, 204973 }, - { 76722586, 205417 }, - { 77448761, 205861 }, - { 78179926, 206306 }, - { 78916102, 206750 }, - { 79657310, 207194 }, - { 80403571, 207639 }, - { 81154906, 208084 }, - { 81911335, 208529 }, - { 82672880, 208974 }, - { 83439562, 209419 }, - { 84211402, 209864 }, - { 84988421, 210309 }, - { 85770640, 210755 }, - { 86558080, 211201 }, - { 87350762, 211647 }, - { 88148708, 212093 }, - { 88951938, 212539 }, - { 89760475, 212985 }, - { 90574339, 213432 }, - { 91393551, 213878 }, - { 92218133, 214325 }, - { 93048107, 214772 }, - { 93883493, 215219 }, - { 94724314, 215666 }, - { 95570590, 216114 }, - { 96422343, 216561 }, - { 97279594, 217009 }, - { 98142366, 217457 }, - { 99010679, 217905 }, - { 99884556, 218353 }, - { 100764018, 218801 }, - { 101649086, 219250 }, - { 102539782, 219698 }, - { 103436128, 220147 }, - { 104338146, 220596 }, - { 105245857, 221046 }, - { 106159284, 221495 }, - { 107078448, 221945 }, - { 108003370, 222394 }, - { 108934074, 222844 }, - { 109870580, 223294 }, - { 110812910, 223745 }, - { 111761087, 224195 }, - { 112715133, 224646 }, - { 113675069, 225097 }, - { 114640918, 225548 }, - { 115612702, 225999 }, - { 116590442, 226450 }, - { 117574162, 226902 }, - { 118563882, 227353 }, - { 119559626, 227805 }, - { 120561415, 228258 }, - { 121569272, 228710 }, - { 122583219, 229162 }, - { 123603278, 229615 }, - { 124629471, 230068 }, - { 125661822, 230521 }, - { 126700352, 230974 }, - { 127745083, 231428 }, - { 128796039, 231882 }, - { 129853241, 232336 }, - { 
130916713, 232790 }, - { 131986475, 233244 }, - { 133062553, 233699 }, - { 134144966, 234153 }, - { 135233739, 234608 }, - { 136328894, 235064 }, - { 137430453, 235519 }, - { 138538440, 235975 }, - { 139652876, 236430 }, - { 140773786, 236886 }, - { 141901190, 237343 }, - { 143035113, 237799 }, - { 144175576, 238256 }, - { 145322604, 238713 }, - { 146476218, 239170 }, - { 147636442, 239627 }, - { 148803298, 240085 }, - { 149976809, 240542 }, - { 151156999, 241000 }, - { 152343890, 241459 }, - { 153537506, 241917 }, - { 154737869, 242376 }, - { 155945002, 242835 }, - { 157158929, 243294 }, - { 158379673, 243753 }, - { 159607257, 244213 }, - { 160841704, 244673 }, - { 162083037, 245133 }, - { 163331279, 245593 }, - { 164586455, 246054 }, - { 165848586, 246514 }, - { 167117696, 246975 }, - { 168393810, 247437 }, - { 169676949, 247898 }, - { 170967138, 248360 }, - { 172264399, 248822 }, - { 173568757, 249284 }, - { 174880235, 249747 }, - { 176198856, 250209 }, - { 177524643, 250672 }, - { 178857621, 251136 }, - { 180197813, 251599 }, - { 181545242, 252063 }, - { 182899933, 252527 }, - { 184261908, 252991 }, - { 185631191, 253456 }, - { 187007807, 253920 }, - { 188391778, 254385 }, - { 189783129, 254851 }, - { 191181884, 255316 }, - { 192588065, 255782 }, - { 194001698, 256248 }, - { 195422805, 256714 }, - { 196851411, 257181 }, - { 198287540, 257648 }, - { 199731215, 258115 }, - { 201182461, 258582 }, - { 202641302, 259050 }, - { 204107760, 259518 }, - { 205581862, 259986 }, - { 207063630, 260454 }, - { 208553088, 260923 }, - { 210050262, 261392 }, - { 211555174, 261861 }, - { 213067849, 262331 }, - { 214588312, 262800 }, - { 216116586, 263270 }, - { 217652696, 263741 }, - { 219196666, 264211 }, - { 220748520, 264682 }, - { 222308282, 265153 }, - { 223875978, 265625 }, - { 225451630, 266097 }, - { 227035265, 266569 }, - { 228626905, 267041 }, - { 230226576, 267514 }, - { 231834302, 267986 }, - { 233450107, 268460 }, - { 235074016, 268933 }, - { 236706054, 269407 }, - { 238346244, 269881 }, - { 239994613, 270355 }, - { 241651183, 270830 }, - { 243315981, 271305 } -}; - -/* return largest index i such that fval <= lookup[i][small] */ -static inline u32 tfrc_binsearch(u32 fval, u8 small) -{ - u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; - - while (low < high) { - try = (low + high) / 2; - if (fval <= tfrc_calc_x_lookup[try][small]) - high = try; - else - low = try + 1; - } - return high; -} - -/** - * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 - * @s: packet size in bytes - * @R: RTT scaled by 1000000 (i.e., microseconds) - * @p: loss ratio estimate scaled by 1000000 - * - * Returns X_calc in bytes per second (not scaled). - */ -u32 tfrc_calc_x(u16 s, u32 R, u32 p) -{ - u16 index; - u32 f; - u64 result; - - /* check against invalid parameters and divide-by-zero */ - BUG_ON(p > 1000000); /* p must not exceed 100% */ - BUG_ON(p == 0); /* f(0) = 0, divide by zero */ - if (R == 0) { /* possible divide by zero */ - DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); - return ~0U; - } - - if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ - if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - DCCP_WARN("Value of p (%d) below resolution. 
" - "Substituting %d\n", p, TFRC_SMALLEST_P); - index = 0; - } else /* 0.0001 <= p <= 0.05 */ - index = p/TFRC_SMALLEST_P - 1; - - f = tfrc_calc_x_lookup[index][1]; - - } else { /* 0.05 < p <= 1.00 */ - index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; - - f = tfrc_calc_x_lookup[index][0]; - } - - /* - * Compute X = s/(R*f(p)) in bytes per second. - * Since f(p) and R are both scaled by 1000000, we need to multiply by - * 1000000^2. To avoid overflow, the result is computed in two stages. - * This works under almost all reasonable operational conditions, for a - * wide range of parameters. Yet, should some strange combination of - * parameters result in overflow, the use of scaled_div32 will catch - * this and return UINT_MAX - which is a logically adequate consequence. - */ - result = scaled_div(s, R); - return scaled_div32(result, f); -} - -/** - * tfrc_calc_x_reverse_lookup - try to find p given f(p) - * @fvalue: function value to match, scaled by 1000000 - * - * Returns closest match for p, also scaled by 1000000 - */ -u32 tfrc_calc_x_reverse_lookup(u32 fvalue) -{ - int index; - - if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ - return 0; - - /* Error cases. */ - if (fvalue < tfrc_calc_x_lookup[0][1]) { - DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); - return TFRC_SMALLEST_P; - } - if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { - DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); - return 1000000; - } - - if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { - index = tfrc_binsearch(fvalue, 1); - return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; - } - - /* else ... it must be in the coarse-grained column */ - index = tfrc_binsearch(fvalue, 0); - return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; -} - -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * @loss_event_rate: loss event rate to invert - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. - */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h deleted file mode 100644 index 1f748ed1279d3..0000000000000 --- a/net/dccp/dccp.h +++ /dev/null @@ -1,483 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_H -#define _DCCP_H -/* - * net/dccp/dccp.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz> - */ - -#include <linux/dccp.h> -#include <linux/ktime.h> -#include <net/snmp.h> -#include <net/sock.h> -#include <net/tcp.h> -#include "ackvec.h" - -/* - * DCCP - specific warning and debugging macros. - */ -#define DCCP_WARN(fmt, ...) \ - net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__) -#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \ - __FILE__, __LINE__, __func__) -#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0) -#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \ - DCCP_BUG("\"%s\" holds (exception!)", \ - __stringify(cond)); \ - } while (0) - -#define DCCP_PRINTK(enable, fmt, args...) 
do { if (enable) \ - printk(fmt, ##args); \ - } while(0) -#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \ - "%s: " fmt, __func__, ##a) - -#ifdef CONFIG_IP_DCCP_DEBUG -extern bool dccp_debug; -#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) -#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) -#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) -#else -#define dccp_pr_debug(format, a...) do {} while (0) -#define dccp_pr_debug_cat(format, a...) do {} while (0) -#define dccp_debug(format, a...) do {} while (0) -#endif - -extern struct inet_hashinfo dccp_hashinfo; - -DECLARE_PER_CPU(unsigned int, dccp_orphan_count); - -void dccp_time_wait(struct sock *sk, int state, int timeo); - -/* - * Set safe upper bounds for header and option length. Since Data Offset is 8 - * bits (RFC 4340, sec. 5.1), the total header length can never be more than - * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1): - * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR - * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields - * Hence a safe upper bound for the maximum option length is 1020-28 = 992 - */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) -#define DCCP_MAX_PACKET_HDR 28 -#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) -#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) - -/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ -#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) - -#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT - * state, about 60 seconds */ - -/* RFC 1122, 4.2.3.1 initial RTO value */ -#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ)) - -/* - * The maximum back-off value for retransmissions. This is needed for - * - retransmitting client-Requests (sec. 8.1.1), - * - retransmitting Close/CloseReq when closing (sec. 8.3), - * - feature-negotiation retransmission (sec. 6.6.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5). - */ -#define DCCP_RTO_MAX ((unsigned int)(64 * HZ)) - -/* - * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 - */ -#define DCCP_SANE_RTT_MIN 100 -#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) -#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) - -/* sysctl variables for DCCP */ -extern int sysctl_dccp_request_retries; -extern int sysctl_dccp_retries1; -extern int sysctl_dccp_retries2; -extern int sysctl_dccp_tx_qlen; -extern int sysctl_dccp_sync_ratelimit; - -/* - * 48-bit sequence number arithmetic (signed and unsigned) - */ -#define INT48_MIN 0x800000000000LL /* 2^47 */ -#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ -#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ -#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) -#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x))) -#define ADD48(a, b) (((a) + (b)) & UINT48_MAX) -#define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) - -static inline void dccp_inc_seqno(u64 *seqno) -{ - *seqno = ADD48(*seqno, 1); -} - -/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ -static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) -{ - u64 delta = SUB48(seqno2, seqno1); - - return TO_SIGNED48(delta); -} - -/* is seq1 < seq2 ? */ -static inline int before48(const u64 seq1, const u64 seq2) -{ - return (s64)((seq2 << 16) - (seq1 << 16)) > 0; -} - -/* is seq1 > seq2 ? 
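The 48-bit sequence-number macros above work by shifting values into the top 48 bits of a signed 64-bit word, so that ordinary two's-complement subtraction yields the circular (mod 2^48) distance. A self-contained sketch of the before48() comparison near the wrap point; the helper name and test values are mine.

#include <stdint.h>
#include <stdio.h>

/* is seq1 < seq2 in mod-2^48 order? (same shift trick as before48() above) */
static int before48_sketch(uint64_t seq1, uint64_t seq2)
{
	return (int64_t)((seq2 << 16) - (seq1 << 16)) > 0;
}

int main(void)
{
	uint64_t near_wrap = 0xFFFFFFFFFFFFull;	/* 2^48 - 1 */

	/* 0 follows 2^48 - 1 once the sequence space wraps around */
	printf("%d\n", before48_sketch(near_wrap, 0));	/* prints 1 */
	printf("%d\n", before48_sketch(0, near_wrap));	/* prints 0 */
	return 0;
}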
*/ -#define after48(seq1, seq2) before48(seq2, seq1) - -/* is seq2 <= seq1 <= seq3 ? */ -static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) -{ - return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); -} - -/** - * dccp_loss_count - Approximate the number of lost data packets in a burst loss - * @s1: last known sequence number before the loss ('hole') - * @s2: first sequence number seen after the 'hole' - * @ndp: NDP count on packet with sequence number @s2 - */ -static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) -{ - s64 delta = dccp_delta_seqno(s1, s2); - - WARN_ON(delta < 0); - delta -= ndp + 1; - - return delta > 0 ? delta : 0; -} - -/** - * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 - */ -static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) -{ - return dccp_loss_count(s1, s2, ndp) == 0; -} - -enum { - DCCP_MIB_NUM = 0, - DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ - DCCP_MIB_ESTABRESETS, /* EstabResets */ - DCCP_MIB_CURRESTAB, /* CurrEstab */ - DCCP_MIB_OUTSEGS, /* OutSegs */ - DCCP_MIB_OUTRSTS, - DCCP_MIB_ABORTONTIMEOUT, - DCCP_MIB_TIMEOUTS, - DCCP_MIB_ABORTFAILED, - DCCP_MIB_PASSIVEOPENS, - DCCP_MIB_ATTEMPTFAILS, - DCCP_MIB_OUTDATAGRAMS, - DCCP_MIB_INERRS, - DCCP_MIB_OPTMANDATORYERROR, - DCCP_MIB_INVALIDOPT, - __DCCP_MIB_MAX -}; - -#define DCCP_MIB_MAX __DCCP_MIB_MAX -struct dccp_mib { - unsigned long mibs[DCCP_MIB_MAX]; -}; - -DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); -#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) -#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) -#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) - -/* - * Checksumming routines - */ -static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb) -{ - const struct dccp_hdr* dh = dccp_hdr(skb); - - if (dh->dccph_cscov == 0) - return skb->len; - return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32); -} - -static inline void dccp_csum_outgoing(struct sk_buff *skb) -{ - unsigned int cov = dccp_csum_coverage(skb); - - if (cov >= skb->len) - dccp_hdr(skb)->dccph_cscov = 0; - - skb->csum = skb_checksum(skb, 0, (cov > skb->len)? 
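dccp_csum_coverage() above implements the partial-checksum rule of RFC 4340, 9.2: CsCov 0 covers the whole packet, while CsCov n > 0 covers the header (dccph_doff 32-bit words) plus the first n - 1 words of application data, clamped to the packet length on the outgoing path. An illustrative stand-alone version, with plain integers standing in for the sk_buff and header fields:

#include <stdint.h>

/* checksum coverage in bytes for a packet of skb_len bytes */
static unsigned int csum_coverage_sketch(unsigned int skb_len,
					 uint8_t doff, uint8_t cscov)
{
	unsigned int cov;

	if (cscov == 0)
		return skb_len;				/* full coverage */
	cov = (doff + cscov - 1) * sizeof(uint32_t);	/* header + (n-1) words */
	return cov > skb_len ? skb_len : cov;		/* never past the packet */
}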
skb->len : cov, 0); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); - -int dccp_retransmit_skb(struct sock *sk); - -void dccp_send_ack(struct sock *sk); -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk); - -void dccp_send_sync(struct sock *sk, const u64 seq, - const enum dccp_pkt_type pkt_type); - -/* - * TX Packet Dequeueing Interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); -bool dccp_qpolicy_full(struct sock *sk); -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); -struct sk_buff *dccp_qpolicy_top(struct sock *sk); -struct sk_buff *dccp_qpolicy_pop(struct sock *sk); -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); - -/* - * TX Packet Output and TX Timers - */ -void dccp_write_xmit(struct sock *sk); -void dccp_write_space(struct sock *sk); -void dccp_flush_write_queue(struct sock *sk, long *time_budget); - -void dccp_init_xmit_timers(struct sock *sk); -static inline void dccp_clear_xmit_timers(struct sock *sk) -{ - inet_csk_clear_xmit_timers(sk); -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); - -const char *dccp_packet_name(const int type); - -void dccp_set_state(struct sock *sk, const int state); -void dccp_done(struct sock *sk); - -int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, - struct sk_buff const *skb); - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req); -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len); -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len); - -void dccp_destruct_common(struct sock *sk); -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); -void dccp_destroy_sock(struct sock *sk); - -void dccp_close(struct sock *sk, long timeout); -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req); - -int dccp_connect(struct sock *sk); -int dccp_disconnect(struct sock *sk, int flags); -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); -int dccp_ioctl(struct sock *sk, int cmd, int *karg); -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len); -void dccp_shutdown(struct sock *sk, int how); -int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait); -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -void dccp_req_err(struct sock *sk, u64 seq); - -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); -void 
dccp_send_close(struct sock *sk, const int active); -int dccp_invalid_packet(struct sk_buff *skb); -u32 dccp_sample_rtt(struct sock *sk, long delta); - -static inline bool dccp_bad_service_code(const struct sock *sk, - const __be32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return false; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - -/** - * dccp_skb_cb - DCCP per-packet control information - * @dccpd_type: one of %dccp_pkt_type (or unknown) - * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1 - * @dccpd_reset_code: one of %dccp_reset_codes - * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code) - * @dccpd_opt_len: total length of all options (5.8) in the packet - * @dccpd_seq: sequence number - * @dccpd_ack_seq: acknowledgment number subheader field value - * - * This is used for transmission as well as for reception. - */ -struct dccp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - __u8 dccpd_type:4; - __u8 dccpd_ccval:4; - __u8 dccpd_reset_code, - dccpd_reset_data[3]; - __u16 dccpd_opt_len; - __u64 dccpd_seq; - __u64 dccpd_ack_seq; -}; - -#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_non_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_ACK || - type == DCCP_PKT_CLOSE || - type == DCCP_PKT_CLOSEREQ || - type == DCCP_PKT_RESET || - type == DCCP_PKT_SYNC || - type == DCCP_PKT_SYNCACK; -} - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || - type == DCCP_PKT_DATAACK || - type == DCCP_PKT_REQUEST || - type == DCCP_PKT_RESPONSE; -} - -static inline int dccp_packet_without_ack(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; -} - -#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) - -static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) -{ - struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + - sizeof(*dh)); - dh->dccph_seq2 = 0; - dh->dccph_seq = htons((gss >> 32) & 0xfffff); - dhx->dccph_seq_low = htonl(gss & 0xffffffff); -} - -static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, - const u64 gsr) -{ - dhack->dccph_reserved1 = 0; - dhack->dccph_ack_nr_high = htons(gsr >> 32); - dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); -} - -static inline void dccp_update_gsr(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (after48(seq, dp->dccps_gsr)) - dp->dccps_gsr = seq; - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); - /* - * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, - * 7.5.1 we perform this check beyond the initial handshake: W/W' are - * always > 32, so for the first W/W' packets in the lifetime of a - * connection we always have to adjust SWL. - * A second reason why we are doing this is that the window depends on - * the feature-remote value of Sequence Window: nothing stops the peer - * from updating this value while we are busy adjusting SWL for the - * first W packets (we would have to count from scratch again then). 
- * Therefore it is safer to always make sure that the Sequence Window - * is not artificially extended by a peer who grows SWL downwards by - * continually updating the feature-remote Sequence-Window. - * If sequence numbers wrap it is bad luck. But that will take a while - * (48 bit), and this measure prevents Sequence-number attacks. - */ - if (before48(dp->dccps_swl, dp->dccps_isr)) - dp->dccps_swl = dp->dccps_isr; - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); -} - -static inline void dccp_update_gss(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_gss = seq; - /* Ack validity window depends on local Sequence Window value (7.5.1) */ - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ - if (before48(dp->dccps_awl, dp->dccps_iss)) - dp->dccps_awl = dp->dccps_iss; - dp->dccps_awh = dp->dccps_gss; -} - -static inline int dccp_ackvec_pending(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); -} - -static inline int dccp_ack_pending(const struct sock *sk) -{ - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); -} - -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); -int dccp_feat_finalise_settings(struct dccp_sock *dp); -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); -int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, - struct sk_buff *skb); -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); -void dccp_feat_list_purge(struct list_head *fn_list); - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb); -int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *); -u32 dccp_timestamp(void); -void dccp_timestamping_init(void); -int dccp_insert_option(struct sk_buff *skb, unsigned char option, - const void *value, unsigned char len); - -#ifdef CONFIG_SYSCTL -int dccp_sysctl_init(void); -void dccp_sysctl_exit(void); -#else -static inline int dccp_sysctl_init(void) -{ - return 0; -} - -static inline void dccp_sysctl_exit(void) -{ -} -#endif - -#endif /* _DCCP_H */ diff --git a/net/dccp/diag.c b/net/dccp/diag.c deleted file mode 100644 index f5019d95c3ae5..0000000000000 --- a/net/dccp/diag.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/diag.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@mandriva.com> - */ - - -#include <linux/module.h> -#include <linux/inet_diag.h> - -#include "ccid.h" -#include "dccp.h" - -static void dccp_get_info(struct sock *sk, struct tcp_info *info) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - - memset(info, 0, sizeof(*info)); - - info->tcpi_state = sk->sk_state; - info->tcpi_retransmits = icsk->icsk_retransmits; - info->tcpi_probes = icsk->icsk_probes_out; - info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - - if (dp->dccps_hc_rx_ackvec != NULL) - info->tcpi_options |= TCPI_OPT_SACK; - - if (dp->dccps_hc_rx_ccid != NULL) - ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); - - if (dp->dccps_hc_tx_ccid != NULL) - ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); -} - -static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - r->idiag_rqueue = r->idiag_wqueue = 0; - - if (_info != NULL) - dccp_get_info(sk, _info); -} 
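dccp_update_gsr()/dccp_update_gss() above derive the sequence and acknowledgment validity windows of RFC 4340, 7.5.1 from GSR/GSS and the negotiated Sequence Window values. The following stand-alone sketch shows only the arithmetic, using plain mod-2^48 helpers; it omits the ISR/ISS clamping performed above, and the names W and W_local are mine for the remote and local Sequence Window feature values.

#include <stdint.h>

#define UINT48_MAX_SK	0xFFFFFFFFFFFFull
#define ADD48_SK(a, b)	(((a) + (b)) & UINT48_MAX_SK)
#define SUB48_SK(a, b)	ADD48_SK((a), (0x1000000000000ull - (b)))

struct seq_windows { uint64_t swl, swh, awl, awh; };

static struct seq_windows update_windows_sketch(uint64_t gsr, uint64_t gss,
						uint64_t W, uint64_t W_local)
{
	struct seq_windows w;

	w.swl = SUB48_SK(ADD48_SK(gsr, 1), W / 4);	/* SWL = GSR + 1 - W/4  */
	w.swh = ADD48_SK(gsr, (3 * W) / 4);		/* SWH = GSR + 3*W/4    */
	w.awl = SUB48_SK(ADD48_SK(gss, 1), W_local);	/* AWL = GSS + 1 - W'   */
	w.awh = gss;					/* AWH = GSS            */
	return w;
}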
- -static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); -} - -static int dccp_diag_dump_one(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req); -} - -static const struct inet_diag_handler dccp_diag_handler = { - .owner = THIS_MODULE, - .dump = dccp_diag_dump, - .dump_one = dccp_diag_dump_one, - .idiag_get_info = dccp_diag_get_info, - .idiag_type = IPPROTO_DCCP, - .idiag_info_size = sizeof(struct tcp_info), -}; - -static int __init dccp_diag_init(void) -{ - return inet_diag_register(&dccp_diag_handler); -} - -static void __exit dccp_diag_fini(void) -{ - inet_diag_unregister(&dccp_diag_handler); -} - -module_init(dccp_diag_init); -module_exit(dccp_diag_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); -MODULE_DESCRIPTION("DCCP inet_diag handler"); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */); diff --git a/net/dccp/feat.c b/net/dccp/feat.c deleted file mode 100644 index f7554dcdaaba9..0000000000000 --- a/net/dccp/feat.c +++ /dev/null @@ -1,1581 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/feat.c - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * - * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> - * Rewrote from scratch, some bits from earlier code by - * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> - * - * ASSUMPTIONS - * ----------- - * o Feature negotiation is coordinated with connection setup (as in TCP), wild - * changes of parameters of an established connection are not supported. - * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN. - * o All currently known SP features have 1-byte quantities. If in the future - * extensions of RFCs 4340..42 define features with item lengths larger than - * one byte, a feature-specific extension of the code will be required. - */ -#include <linux/module.h> -#include <linux/slab.h> -#include "ccid.h" -#include "feat.h" - -/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ -unsigned long sysctl_dccp_sequence_window __read_mostly = 100; -int sysctl_dccp_rx_ccid __read_mostly = 2, - sysctl_dccp_tx_ccid __read_mostly = 2; - -/* - * Feature activation handlers. - * - * These all use an u64 argument, to provide enough room for NN/SP features. At - * this stage the negotiated values have been checked to be within their range. 
- */ -static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid *new_ccid = ccid_new(ccid, sk, rx); - - if (new_ccid == NULL) - return -ENOMEM; - - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; - } - return 0; -} - -static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - dp->dccps_r_seq_win = seq_win; - /* propagate changes to update SWL/SWH */ - dccp_update_gsr(sk, dp->dccps_gsr); - } else { - dp->dccps_l_seq_win = seq_win; - /* propagate changes to update AWL */ - dccp_update_gss(sk, dp->dccps_gss); - } - return 0; -} - -static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) -{ - if (rx) - dccp_sk(sk)->dccps_r_ack_ratio = ratio; - else - dccp_sk(sk)->dccps_l_ack_ratio = ratio; - return 0; -} - -static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - if (enable && dp->dccps_hc_rx_ackvec == NULL) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } else if (!enable) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - } - return 0; -} - -static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) -{ - if (!rx) - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); - return 0; -} - -/* - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that - * `rx' holds when the sending peer informs about his partial coverage via a - * ChangeR() option. In the other case, we are the sender and the receiver - * announces its coverage via ChangeL() options. The policy here is to honour - * such communication by enabling the corresponding partial coverage - but only - * if it has not been set manually before; the warning here means that all - * packets will be dropped. - */ -static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) - dp->dccps_pcrlen = cscov; - else { - if (dp->dccps_pcslen == 0) - dp->dccps_pcslen = cscov; - else if (cscov > dp->dccps_pcslen) - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", - dp->dccps_pcslen, (u8)cscov); - } - return 0; -} - -static const struct { - u8 feat_num; /* DCCPF_xxx */ - enum dccp_feat_type rxtx; /* RX or TX */ - enum dccp_feat_type reconciliation; /* SP or NN */ - u8 default_value; /* as in 6.4 */ - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); -/* - * Lookup table for location and type of features (from RFC 4340/4342) - * +--------------------------+----+-----+----+----+---------+-----------+ - * | Feature | Location | Reconc. 
| Initial | Section | - * | | RX | TX | SP | NN | Value | Reference | - * +--------------------------+----+-----+----+----+---------+-----------+ - * | DCCPF_CCID | | X | X | | 2 | 10 | - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | - * +--------------------------+----+-----+----+----+---------+-----------+ - */ -} dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, -}; -#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) - -/** - * dccp_feat_index - Hash function to map feature number into array position - * @feat_num: feature to hash, one of %dccp_feature_numbers - * - * Returns consecutive array index or -1 if the feature is not understood. - */ -static int dccp_feat_index(u8 feat_num) -{ - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) - return feat_num - 1; - - /* - * Other features: add cases for new feature types here after adding - * them to the above table. - */ - switch (feat_num) { - case DCCPF_SEND_LEV_RATE: - return DCCP_FEAT_SUPPORTED_MAX - 1; - } - return -1; -} - -static u8 dccp_feat_type(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - - if (idx < 0) - return FEAT_UNKNOWN; - return dccp_feat_table[idx].reconciliation; -} - -static int dccp_feat_default_value(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - /* - * There are no default values for unknown features, so encountering a - * negative index here indicates a serious problem somewhere else. - */ - DCCP_BUG_ON(idx < 0); - - return idx < 0 ? 0 : dccp_feat_table[idx].default_value; -} - -/* - * Debugging and verbose-printing section - */ -static const char *dccp_feat_fname(const u8 feat) -{ - static const char *const feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. 
Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -static const char *const dccp_feat_sname[] = { - "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE", -}; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_feat_oname(const u8 opt) -{ - switch (opt) { - case DCCPO_CHANGE_L: return "Change_L"; - case DCCPO_CONFIRM_L: return "Confirm_L"; - case DCCPO_CHANGE_R: return "Change_R"; - case DCCPO_CONFIRM_R: return "Confirm_R"; - } - return NULL; -} - -static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) -{ - u8 i, type = dccp_feat_type(feat_num); - - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) - dccp_pr_debug_cat("(NULL)"); - else if (type == FEAT_SP) - for (i = 0; i < val->sp.len; i++) - dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); - else if (type == FEAT_NN) - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); - else - dccp_pr_debug_cat("unknown type %u", type); -} - -static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) -{ - u8 type = dccp_feat_type(feat_num); - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; - - if (type == FEAT_NN) - fval.nn = dccp_decode_value_var(list, len); - dccp_feat_printval(feat_num, &fval); -} - -static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) -{ - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", - dccp_feat_fname(entry->feat_num)); - dccp_feat_printval(entry->feat_num, &entry->val); - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], - entry->needs_confirm ? "(Confirm pending)" : ""); -} - -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ - dccp_feat_printvals(feat, val, len); \ - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) - -#define dccp_feat_print_fnlist(fn_list) { \ - const struct dccp_feat_entry *___entry; \ - \ - dccp_pr_debug("List Dump:\n"); \ - list_for_each_entry(___entry, fn_list, node) \ - dccp_feat_print_entry(___entry); \ -} -#else /* ! CONFIG_IP_DCCP_DEBUG */ -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) -#define dccp_feat_print_fnlist(fn_list) -#endif - -static int __dccp_feat_activate(struct sock *sk, const int idx, - const bool is_local, dccp_feat_val const *fval) -{ - bool rx; - u64 val; - - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) - return -1; - if (dccp_feat_table[idx].activation_hdlr == NULL) - return 0; - - if (fval == NULL) { - val = dccp_feat_table[idx].default_value; - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { - if (fval->sp.vec == NULL) { - /* - * This can happen when an empty Confirm is sent - * for an SP (i.e. known) feature. In this case - * we would be using the default anyway. - */ - DCCP_CRIT("Feature #%d undefined: using default", idx); - val = dccp_feat_table[idx].default_value; - } else { - val = fval->sp.vec[0]; - } - } else { - val = fval->nn; - } - - /* Location is RX if this is a local-RX or remote-TX feature */ - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", - dccp_feat_fname(dccp_feat_table[idx].feat_num), - fval ? 
"" : "default ", (unsigned long long)val); - - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); -} - -/** - * dccp_feat_activate - Activate feature value on socket - * @sk: fully connected DCCP socket (after handshake is complete) - * @feat_num: feature to activate, one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @fval: the value (SP or NN) to activate, or NULL to use the default value - * - * For general use this function is preferable over __dccp_feat_activate(). - */ -static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, - dccp_feat_val const *fval) -{ - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); -} - -/* Test for "Req'd" feature (RFC 4340, 6.4) */ -static inline int dccp_feat_must_be_understood(u8 feat_num) -{ - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || - feat_num == DCCPF_SEQUENCE_WINDOW; -} - -/* copy constructor, fval must not already contain allocated memory */ -static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) -{ - fval->sp.len = len; - if (fval->sp.len > 0) { - fval->sp.vec = kmemdup(val, len, gfp_any()); - if (fval->sp.vec == NULL) { - fval->sp.len = 0; - return -ENOMEM; - } - } - return 0; -} - -static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) -{ - if (unlikely(val == NULL)) - return; - if (dccp_feat_type(feat_num) == FEAT_SP) - kfree(val->sp.vec); - memset(val, 0, sizeof(*val)); -} - -static struct dccp_feat_entry * - dccp_feat_clone_entry(struct dccp_feat_entry const *original) -{ - struct dccp_feat_entry *new; - u8 type = dccp_feat_type(original->feat_num); - - if (type == FEAT_UNKNOWN) - return NULL; - - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); - if (new == NULL) - return NULL; - - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, - original->val.sp.vec, - original->val.sp.len)) { - kfree(new); - return NULL; - } - return new; -} - -static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) -{ - if (entry != NULL) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - kfree(entry); - } -} - -/* - * List management functions - * - * Feature negotiation lists rely on and maintain the following invariants: - * - each feat_num in the list is known, i.e. we know its type and default value - * - each feat_num/is_local combination is unique (old entries are overwritten) - * - SP values are always freshly allocated - * - list is sorted in increasing order of feature number (faster lookup) - */ -static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, - u8 feat_num, bool is_local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, fn_list, node) { - if (entry->feat_num == feat_num && entry->is_local == is_local) - return entry; - else if (entry->feat_num > feat_num) - break; - } - return NULL; -} - -/** - * dccp_feat_entry_new - Central list update routine (called by all others) - * @head: list to add to - * @feat: feature number - * @local: whether the local (1) or remote feature with number @feat is meant - * - * This is the only constructor and serves to ensure the above invariants. 
- */ -static struct dccp_feat_entry * - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, head, node) - if (entry->feat_num == feat && entry->is_local == local) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - return entry; - } else if (entry->feat_num > feat) { - head = &entry->node; - break; - } - - entry = kmalloc(sizeof(*entry), gfp_any()); - if (entry != NULL) { - entry->feat_num = feat; - entry->is_local = local; - list_add_tail(&entry->node, head); - } - return entry; -} - -/** - * dccp_feat_push_change - Add/overwrite a Change option in the list - * @fn_list: feature-negotiation list to update - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @mandatory: whether to use Mandatory feature negotiation options - * @fval: pointer to NN/SP value to be inserted (will be copied) - */ -static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, - u8 mandatory, dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return -ENOMEM; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_INITIALISING; - new->needs_confirm = false; - new->empty_confirm = false; - new->val = *fval; - new->needs_mandatory = mandatory; - - return 0; -} - -/** - * dccp_feat_push_confirm - Add a Confirm entry to the FN list - * @fn_list: feature-negotiation list to add to - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is being confirmed - * @fval: pointer to NN/SP value to be inserted or NULL - * - * Returns 0 on success, a Reset code for further processing otherwise. - */ -static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, - dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return DCCP_RESET_CODE_TOO_BUSY; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_STABLE; /* transition in 6.6.2 */ - new->needs_confirm = true; - new->empty_confirm = (fval == NULL); - new->val.nn = 0; /* zeroes the whole structure */ - if (!new->empty_confirm) - new->val = *fval; - new->needs_mandatory = false; - - return 0; -} - -static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) -{ - return dccp_feat_push_confirm(fn_list, feat, local, NULL); -} - -static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) -{ - list_del(&entry->node); - dccp_feat_entry_destructor(entry); -} - -void dccp_feat_list_purge(struct list_head *fn_list) -{ - struct dccp_feat_entry *entry, *next; - - list_for_each_entry_safe(entry, next, fn_list, node) - dccp_feat_entry_destructor(entry); - INIT_LIST_HEAD(fn_list); -} -EXPORT_SYMBOL_GPL(dccp_feat_list_purge); - -/* generate @to as full clone of @from - @to must not contain any nodes */ -int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) -{ - struct dccp_feat_entry *entry, *new; - - INIT_LIST_HEAD(to); - list_for_each_entry(entry, from, node) { - new = dccp_feat_clone_entry(entry); - if (new == NULL) - goto cloning_failed; - list_add_tail(&new->node, to); - } - return 0; - -cloning_failed: - dccp_feat_list_purge(to); - return -ENOMEM; -} - -/** - * dccp_feat_valid_nn_length - Enforce length constraints on NN options - * @feat_num: feature to return length of, one of %dccp_feature_numbers - * - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. 
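dccp_feat_entry_new() above is what maintains the invariants listed earlier: at most one entry per (feature, is_local) pair, stored in increasing feature-number order. A simplified array-based sketch of the same upsert behaviour, with made-up struct and function names (the kernel keeps a linked list of dccp_feat_entry instead):

#include <stdint.h>
#include <string.h>

struct feat_slot { uint8_t feat_num; uint8_t is_local; uint64_t nn_val; };

/* overwrite an existing slot or insert in sorted order; returns new count
 * (the caller must ensure tbl has room for one more element) */
static int feat_upsert_sketch(struct feat_slot *tbl, int count,
			      struct feat_slot new_entry)
{
	int i;

	for (i = 0; i < count; i++) {
		if (tbl[i].feat_num == new_entry.feat_num &&
		    tbl[i].is_local == new_entry.is_local) {
			tbl[i] = new_entry;		/* old value is replaced */
			return count;
		}
		if (tbl[i].feat_num > new_entry.feat_num)
			break;				/* keep sorted order */
	}
	memmove(&tbl[i + 1], &tbl[i], (count - i) * sizeof(tbl[0]));
	tbl[i] = new_entry;
	return count + 1;
}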
Used for outgoing packets only, - * incoming options are accepted as long as their values are valid. - */ -static u8 dccp_feat_valid_nn_length(u8 feat_num) -{ - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ - return 2; - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ - return 6; - return 0; -} - -static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) -{ - switch (feat_num) { - case DCCPF_ACK_RATIO: - return val <= DCCPF_ACK_RATIO_MAX; - case DCCPF_SEQUENCE_WINDOW: - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; - } - return 0; /* feature unknown - so we can't tell */ -} - -/* check that SP values are within the ranges defined in RFC 4340 */ -static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) -{ - switch (feat_num) { - case DCCPF_CCID: - return val == DCCPC_CCID2 || val == DCCPC_CCID3; - /* Type-check Boolean feature values: */ - case DCCPF_SHORT_SEQNOS: - case DCCPF_ECN_INCAPABLE: - case DCCPF_SEND_ACK_VECTOR: - case DCCPF_SEND_NDP_COUNT: - case DCCPF_DATA_CHECKSUM: - case DCCPF_SEND_LEV_RATE: - return val < 2; - case DCCPF_MIN_CSUM_COVER: - return val < 16; - } - return 0; /* feature unknown */ -} - -static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) -{ - if (sp_list == NULL || sp_len < 1) - return 0; - while (sp_len--) - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) - return 0; - return 1; -} - -/** - * dccp_feat_insert_opts - Generate FN options from current list state - * @skb: next sk_buff to be sent to the peer - * @dp: for client during handshake and general negotiation - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) - */ -int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - struct dccp_feat_entry *pos, *next; - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; - bool rpt; - - /* put entries into @skb in the order they appear in the list */ - list_for_each_entry_safe_reverse(pos, next, fn, node) { - opt = dccp_feat_genopt(pos); - type = dccp_feat_type(pos->feat_num); - rpt = false; - - if (pos->empty_confirm) { - len = 0; - ptr = NULL; - } else { - if (type == FEAT_SP) { - len = pos->val.sp.len; - ptr = pos->val.sp.vec; - rpt = pos->needs_confirm; - } else if (type == FEAT_NN) { - len = dccp_feat_valid_nn_length(pos->feat_num); - ptr = nn_in_nbo; - dccp_encode_value_var(pos->val.nn, ptr, len); - } else { - DCCP_BUG("unknown feature %u", pos->feat_num); - return -1; - } - } - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); - - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) - return -1; - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) - return -1; - - if (skb->sk->sk_state == DCCP_OPEN && - (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) { - /* - * Confirms don't get retransmitted (6.6.3) once the - * connection is in state OPEN - */ - dccp_feat_list_pop(pos); - } else { - /* - * Enter CHANGING after transmitting the Change - * option (6.6.2). - */ - if (pos->state == FEAT_INITIALISING) - pos->state = FEAT_CHANGING; - } - } - return 0; -} - -/** - * __feat_register_nn - Register new NN value on socket - * @fn: feature-negotiation list to register with - * @feat: an NN feature from %dccp_feature_numbers - * @mandatory: use Mandatory option if 1 - * @nn_val: value to register (restricted to 4 bytes) - * - * Note that NN features are local by definition (RFC 4340, 6.3.2). 
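NN feature values travel as variable-length, most-significant-byte-first integers: dccp_feat_valid_nn_length() above fixes the outgoing length at 2 bytes for Ack Ratio and 6 bytes for Sequence Window, and dccp_feat_insert_opts() encodes the value into that many bytes. A stand-alone sketch of such an encode/decode pair, assuming the same big-endian layout as the kernel's dccp_encode_value_var()/dccp_decode_value_var() helpers (defined outside this hunk):

#include <stdint.h>
#include <stddef.h>

/* write the low 8*len bits of value, most-significant byte first */
static void encode_value_var_sketch(uint64_t value, uint8_t *to, size_t len)
{
	while (len--)
		*to++ = (value >> (8 * len)) & 0xff;
}

/* read len bytes back into an integer, most-significant byte first */
static uint64_t decode_value_var_sketch(const uint8_t *from, size_t len)
{
	uint64_t value = 0;

	while (len--)
		value = (value << 8) | *from++;
	return value;
}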
- */ -static int __feat_register_nn(struct list_head *fn, u8 feat, - u8 mandatory, u64 nn_val) -{ - dccp_feat_val fval = { .nn = nn_val }; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - /* Don't bother with default values, they will be activated anyway. */ - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) - return 0; - - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); -} - -/** - * __feat_register_sp - Register new SP value/list on socket - * @fn: feature-negotiation list to register with - * @feat: an SP feature from %dccp_feature_numbers - * @is_local: whether the local (1) or the remote (0) @feat is meant - * @mandatory: use Mandatory option if 1 - * @sp_val: SP value followed by optional preference list - * @sp_len: length of @sp_val in bytes - */ -static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, - u8 mandatory, u8 const *sp_val, u8 sp_len) -{ - dccp_feat_val fval; - - if (dccp_feat_type(feat) != FEAT_SP || - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) - return -EINVAL; - - /* Avoid negotiating alien CCIDs by only advertising supported ones */ - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) - return -EOPNOTSUPP; - - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) - return -ENOMEM; - - if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) { - kfree(fval.sp.vec); - return -ENOMEM; - } - - return 0; -} - -/** - * dccp_feat_register_sp - Register requests to change SP feature values - * @sk: client or listening socket - * @feat: one of %dccp_feature_numbers - * @is_local: whether the local (1) or remote (0) @feat is meant - * @list: array of preferred values, in descending order of preference - * @len: length of @list in bytes - */ -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len) -{ /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_SP) - return -EINVAL; - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, - 0, list, len); -} - -/** - * dccp_feat_nn_get - Query current/pending value of NN feature - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * - * For a known NN feature, returns value currently being negotiated, or - * current (confirmed) value if no negotiation is going on. - */ -u64 dccp_feat_nn_get(struct sock *sk, u8 feat) -{ - if (dccp_feat_type(feat) == FEAT_NN) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *entry; - - entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1); - if (entry != NULL) - return entry->val.nn; - - switch (feat) { - case DCCPF_ACK_RATIO: - return dp->dccps_l_ack_ratio; - case DCCPF_SEQUENCE_WINDOW: - return dp->dccps_l_seq_win; - } - } - DCCP_BUG("attempt to look up unsupported feature %u", feat); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_feat_nn_get); - -/** - * dccp_feat_signal_nn_change - Update NN values for an established connection - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * @nn_val: the new value to use - * - * This function is used to communicate NN updates out-of-band. 
- */ -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - dccp_feat_val fval = { .nn = nn_val }; - struct dccp_feat_entry *entry; - - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) - return 0; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - if (nn_val == dccp_feat_nn_get(sk, feat)) - return 0; /* already set or negotiation under way */ - - entry = dccp_feat_list_lookup(fn, feat, 1); - if (entry != NULL) { - dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n", - (unsigned long long)entry->val.nn, - (unsigned long long)nn_val); - dccp_feat_list_pop(entry); - } - - inet_csk_schedule_ack(sk); - return dccp_feat_push_change(fn, feat, 1, 0, &fval); -} -EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); - -/* - * Tracking features whose value depend on the choice of CCID - * - * This is designed with an extension in mind so that a list walk could be done - * before activating any features. However, the existing framework was found to - * work satisfactorily up until now, the automatic verification is left open. - * When adding new CCIDs, add a corresponding dependency table here. - */ -static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) -{ - static const struct ccid_dependency ccid2_dependencies[2][2] = { - /* - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX - * feature and Send Ack Vector is an RX feature, `is_local' - * needs to be reversed. - */ - { /* Dependencies of the receiver-side (remote) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - }, - { /* Dependencies of the sender-side (local) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - static const struct ccid_dependency ccid3_dependencies[2][5] = { - { /* - * Dependencies of the receiver-side CCID3 - */ - { /* locally disable Ack Vectors */ - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* see below why Send Loss Event Rate is on */ - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 }, - }, - { /* - * CCID3 at the TX side: we request that the HC-receiver - * will not send Ack Vectors (they will be ignored, so - * Mandatory is not set); we enable Send Loss Event Rate - * (Mandatory since the implementation does not support - * the Loss Intervals option of RFC 4342, 8.6). - * The last two options are for peer's information only. 
- */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = false, - .val = 0 - }, - { - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { /* this CCID does not support Ack Ratio */ - .dependent_feat = DCCPF_ACK_RATIO, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* tell receiver we are sending NDP counts */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = true, - .is_mandatory = false, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - switch (ccid) { - case DCCPC_CCID2: - return ccid2_dependencies[is_local]; - case DCCPC_CCID3: - return ccid3_dependencies[is_local]; - default: - return NULL; - } -} - -/** - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID - * @fn: feature-negotiation list to update - * @id: CCID number to track - * @is_local: whether TX CCID (1) or RX CCID (0) is meant - * - * This function needs to be called after registering all other features. - */ -static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) -{ - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); - int i, rc = (table == NULL); - - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) - rc = __feat_register_sp(fn, table[i].dependent_feat, - table[i].is_local, - table[i].is_mandatory, - &table[i].val, 1); - else - rc = __feat_register_nn(fn, table[i].dependent_feat, - table[i].is_mandatory, - table[i].val); - return rc; -} - -/** - * dccp_feat_finalise_settings - Finalise settings before starting negotiation - * @dp: client or listening socket (settings will be inherited) - * - * This is called after all registrations (socket initialisation, sysctls, and - * sockopt calls), and before sending the first packet containing Change options - * (ie. client-Request or server-Response), to ensure internal consistency. - */ -int dccp_feat_finalise_settings(struct dccp_sock *dp) -{ - struct list_head *fn = &dp->dccps_featneg; - struct dccp_feat_entry *entry; - int i = 2, ccids[2] = { -1, -1 }; - - /* - * Propagating CCIDs: - * 1) not useful to propagate CCID settings if this host advertises more - * than one CCID: the choice of CCID may still change - if this is - * the client, or if this is the server and the client sends - * singleton CCID values. - * 2) since is that propagate_ccid changes the list, we defer changing - * the sorted list until after the traversal. - */ - list_for_each_entry(entry, fn, node) - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) - ccids[entry->is_local] = entry->val.sp.vec[0]; - while (i--) - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) - return -1; - dccp_feat_print_fnlist(fn); - return 0; -} - -/** - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features - * @dreq: server socket to resolve - * - * It is the server which resolves the dependencies once the CCID has been - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. 
- */ -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) -{ - struct list_head *fn = &dreq->dreq_featneg; - struct dccp_feat_entry *entry; - u8 is_local, ccid; - - for (is_local = 0; is_local <= 1; is_local++) { - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - - if (entry != NULL && !entry->empty_confirm) - ccid = entry->val.sp.vec[0]; - else - ccid = dccp_feat_default_value(DCCPF_CCID); - - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) - return -1; - } - return 0; -} - -/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ -static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) -{ - u8 c, s; - - for (s = 0; s < slen; s++) - for (c = 0; c < clen; c++) - if (servlist[s] == clilist[c]) - return servlist[s]; - return -1; -} - -/** - * dccp_feat_prefer - Move preferred entry to the start of array - * @preferred_value: entry to move to start of array - * @array: array of preferred entries - * @array_len: size of the array - * - * Reorder the @array_len elements in @array so that @preferred_value comes - * first. Returns >0 to indicate that @preferred_value does occur in @array. - */ -static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) -{ - u8 i, does_occur = 0; - - if (array != NULL) { - for (i = 0; i < array_len; i++) - if (array[i] == preferred_value) { - array[i] = array[0]; - does_occur++; - } - if (does_occur) - array[0] = preferred_value; - } - return does_occur; -} - -/** - * dccp_feat_reconcile - Reconcile SP preference lists - * @fv: SP list to reconcile into - * @arr: received SP preference list - * @len: length of @arr in bytes - * @is_server: whether this side is the server (and @fv is the server's list) - * @reorder: whether to reorder the list in @fv after reconciling with @arr - * When successful, > 0 is returned and the reconciled list is in @fval. - * A value of 0 means that negotiation failed (no shared entry). - */ -static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, - bool is_server, bool reorder) -{ - int rc; - - if (!fv->sp.vec || !arr) { - DCCP_CRIT("NULL feature value or array"); - return 0; - } - - if (is_server) - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); - else - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - - if (!reorder) - return rc; - if (rc < 0) - return 0; - - /* - * Reorder list: used for activating features and in dccp_insert_fn_opt. - */ - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); -} - -/** - * dccp_feat_change_recv - Process incoming ChangeL/R options - * @fn: feature-negotiation list to update - * @is_mandatory: whether the Change was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is the server (1) or the client (0) - */ -static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 defval, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CHANGE_R); - struct dccp_feat_entry *entry; - dccp_feat_val fval; - - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ - goto unknown_feature_or_value; - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - /* - * Negotiation of NN features: Change R is invalid, so there is no - * simultaneous negotiation; hence we do not look up in the list. 
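dccp_feat_preflist_match() above captures the server-priority rule of RFC 4340, 6.3.1: the server walks its own preference list in order and picks the first value that the client also offers; no overlap means the negotiation fails. A runnable sketch with made-up CCID preference lists:

#include <stdint.h>
#include <stdio.h>

/* first entry of servlist that also appears in clilist, or -1 if none */
static int preflist_match_sketch(const uint8_t *servlist, uint8_t slen,
				 const uint8_t *clilist, uint8_t clen)
{
	uint8_t s, c;

	for (s = 0; s < slen; s++)
		for (c = 0; c < clen; c++)
			if (servlist[s] == clilist[c])
				return servlist[s];
	return -1;
}

int main(void)
{
	uint8_t server[] = { 3, 2 };	/* prefers CCID3, accepts CCID2 */
	uint8_t client[] = { 2 };	/* only offers CCID2 */

	printf("agreed CCID: %d\n",
	       preflist_match_sketch(server, 2, client, 1));	/* -> 2 */
	return 0;
}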
- */ - if (type == FEAT_NN) { - if (local || len > sizeof(fval.nn)) - goto unknown_feature_or_value; - - /* 6.3.2: "The feature remote MUST accept any valid value..." */ - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto unknown_feature_or_value; - - return dccp_feat_push_confirm(fn, feat, local, &fval); - } - - /* - * Unidirectional/simultaneous negotiation of SP features (6.3.1) - */ - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL) { - /* - * No particular preferences have been registered. We deal with - * this situation by assuming that all valid values are equally - * acceptable, and apply the following checks: - * - if the peer's list is a singleton, we accept a valid value; - * - if we are the server, we first try to see if the peer (the - * client) advertises the default value. If yes, we use it, - * otherwise we accept the preferred value; - * - else if we are the client, we use the first list element. - */ - if (dccp_feat_clone_sp_val(&fval, val, 1)) - return DCCP_RESET_CODE_TOO_BUSY; - - if (len > 1 && server) { - defval = dccp_feat_default_value(feat); - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) - fval.sp.vec[0] = defval; - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { - kfree(fval.sp.vec); - goto unknown_feature_or_value; - } - - /* Treat unsupported CCIDs like invalid values */ - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { - kfree(fval.sp.vec); - goto not_valid_or_not_known; - } - - if (dccp_feat_push_confirm(fn, feat, local, &fval)) { - kfree(fval.sp.vec); - return DCCP_RESET_CODE_TOO_BUSY; - } - - return 0; - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ - return 0; - } - - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { - entry->empty_confirm = false; - } else if (is_mandatory) { - return DCCP_RESET_CODE_MANDATORY_ERROR; - } else if (entry->state == FEAT_INITIALISING) { - /* - * Failed simultaneous negotiation (server only): try to `save' - * the connection by checking whether entry contains the default - * value for @feat. If yes, send an empty Confirm to signal that - * the received Change was not understood - which implies using - * the default value. - * If this also fails, we use Reset as the last resort. - */ - WARN_ON(!server); - defval = dccp_feat_default_value(feat); - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) - return DCCP_RESET_CODE_OPTION_ERROR; - entry->empty_confirm = true; - } - entry->needs_confirm = true; - entry->needs_mandatory = false; - entry->state = FEAT_STABLE; - return 0; - -unknown_feature_or_value: - if (!is_mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -not_valid_or_not_known: - return is_mandatory ? 
DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_confirm_recv - Process received Confirm options - * @fn: feature-negotiation list to update - * @is_mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is server (1) or client (0) - */ -static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 *plist, plen, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - if (entry == NULL) { /* nothing queued: ignore or handle error */ - if (is_mandatory && type == FEAT_UNKNOWN) - return DCCP_RESET_CODE_MANDATORY_ERROR; - - if (!local && type == FEAT_NN) /* 6.3.2 */ - goto confirmation_failed; - return 0; - } - - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ - return 0; - - if (len == 0) { - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ - goto confirmation_failed; - /* - * Empty Confirm during connection setup: this means reverting - * to the `old' value, which in this case is the default. Since - * we handle default values automatically when no other values - * have been set, we revert to the old value by removing this - * entry from the list. - */ - dccp_feat_list_pop(entry); - return 0; - } - - if (type == FEAT_NN) { - if (len > sizeof(entry->val.nn)) - goto confirmation_failed; - - if (entry->val.nn == dccp_decode_value_var(val, len)) - goto confirmation_succeeded; - - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto confirmation_failed; - } - - /* - * Parsing SP Confirms: the first element of @val is the preferred - * SP value which the peer confirms, the remainder depends on @len. - * Note that only the confirmed value need to be a valid SP value. - */ - if (!dccp_feat_is_valid_sp_val(feat, *val)) - goto confirmation_failed; - - if (len == 1) { /* peer didn't supply a preference list */ - plist = val; - plen = len; - } else { /* preferred value + preference list */ - plist = val + 1; - plen = len - 1; - } - - /* Check whether the peer got the reconciliation right (6.6.8) */ - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { - DCCP_WARN("Confirm selected the wrong value %u\n", *val); - return DCCP_RESET_CODE_OPTION_ERROR; - } - entry->val.sp.vec[0] = *val; - -confirmation_succeeded: - entry->state = FEAT_STABLE; - return 0; - -confirmation_failed: - DCCP_WARN("Confirmation failed\n"); - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_handle_nn_established - Fast-path reception of NN options - * @sk: socket of an established DCCP connection - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) - * @feat: NN number, one of %dccp_feature_numbers - * @val: NN value - * @len: length of @val in bytes - * - * This function combines the functionality of change_recv/confirm_recv, with - * the following differences (reset codes are the same): - * - cleanup after receiving the Confirm; - * - values are directly activated after successful parsing; - * - deliberately restricted to NN features. 
- * The restriction to NN features is essential since SP features can have non- - * predictable outcomes (depending on the remote configuration), and are inter- - * dependent (CCIDs for instance cause further dependencies). - */ -static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, - u8 feat, u8 *val, u8 len) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry; - u8 type = dccp_feat_type(feat); - dccp_feat_val fval; - - dccp_feat_print_opt(opt, feat, val, len, mandatory); - - /* Ignore non-mandatory unknown and non-NN features */ - if (type == FEAT_UNKNOWN) { - if (local && !mandatory) - return 0; - goto fast_path_unknown; - } else if (type != FEAT_NN) { - return 0; - } - - /* - * We don't accept empty Confirms, since in fast-path feature - * negotiation the values are enabled immediately after sending - * the Change option. - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). - */ - if (len == 0 || len > sizeof(fval.nn)) - goto fast_path_unknown; - - if (opt == DCCPO_CHANGE_L) { - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto fast_path_unknown; - - if (dccp_feat_push_confirm(fn, feat, local, &fval) || - dccp_feat_activate(sk, feat, local, &fval)) - return DCCP_RESET_CODE_TOO_BUSY; - - /* set the `Ack Pending' flag to piggyback a Confirm */ - inet_csk_schedule_ack(sk); - - } else if (opt == DCCPO_CONFIRM_R) { - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL || entry->state != FEAT_CHANGING) - return 0; - - fval.nn = dccp_decode_value_var(val, len); - /* - * Just ignore a value that doesn't match our current value. - * If the option changes twice within two RTTs, then at least - * one CONFIRM will be received for the old value after a - * new CHANGE was sent. - */ - if (fval.nn != entry->val.nn) - return 0; - - /* Only activate after receiving the Confirm option (6.6.1). */ - dccp_feat_activate(sk, feat, local, &fval); - - /* It has been confirmed - so remove the entry */ - dccp_feat_list_pop(entry); - - } else { - DCCP_WARN("Received illegal option %u\n", opt); - goto fast_path_failed; - } - return 0; - -fast_path_unknown: - if (!mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -fast_path_failed: - return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_parse_options - Process Feature-Negotiation Options - * @sk: for general use and used by the client during connection setup - * @dreq: used by the server during connection setup - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: value contents of @opt - * @len: length of @val in bytes - * - * Returns 0 on success, a Reset code for ending the connection otherwise. - */ -int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct list_head *fn = dreq ? 
&dreq->dreq_featneg : &dp->dccps_featneg; - bool server = false; - - switch (sk->sk_state) { - /* - * Negotiation during connection setup - */ - case DCCP_LISTEN: - server = true; - fallthrough; - case DCCP_REQUESTING: - switch (opt) { - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - return dccp_feat_change_recv(fn, mandatory, opt, feat, - val, len, server); - case DCCPO_CONFIRM_R: - case DCCPO_CONFIRM_L: - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, - val, len, server); - } - break; - /* - * Support for exchanging NN options on an established connection. - */ - case DCCP_OPEN: - case DCCP_PARTOPEN: - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, - val, len); - } - return 0; /* ignore FN options in all other states */ -} - -/** - * dccp_feat_init - Seed feature negotiation with host-specific defaults - * @sk: Socket to initialize. - * - * This initialises global defaults, depending on the value of the sysctls. - * These can later be overridden by registering changes via setsockopt calls. - * The last link in the chain is finalise_settings, to make sure that between - * here and the start of actual feature negotiation no inconsistencies enter. - * - * All features not appearing below use either defaults or are otherwise - * later adjusted through dccp_feat_finalise_settings(). - */ -int dccp_feat_init(struct sock *sk) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - u8 on = 1, off = 0; - int rc; - struct { - u8 *val; - u8 len; - } tx, rx; - - /* Non-negotiable (NN) features */ - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_sequence_window); - if (rc) - return rc; - - /* Server-priority (SP) features */ - - /* Advertise that short seqnos are not supported (7.6.1) */ - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); - if (rc) - return rc; - - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); - if (rc) - return rc; - - /* - * We advertise the available list of CCIDs and reorder according to - * preferences, to avoid failure resulting from negotiating different - * singleton values (which always leads to failure). - * These settings can still (later) be overridden via sockopts. - */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len)) - return -ENOBUFS; - if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { - kfree(tx.val); - return -ENOBUFS; - } - - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); - if (rc) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - -free_ccid_lists: - kfree(tx.val); - kfree(rx.val); - return rc; -} - -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *cur, *next; - int idx; - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } - }; - - list_for_each_entry(cur, fn_list, node) { - /* - * An empty Confirm means that either an unknown feature type - * or an invalid value was present. In the first case there is - * nothing to activate, in the other the default value is used. 
- */ - if (cur->empty_confirm) - continue; - - idx = dccp_feat_index(cur->feat_num); - if (idx < 0) { - DCCP_BUG("Unknown feature %u", cur->feat_num); - goto activation_failed; - } - if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %s failed in state %s", - cur->is_local ? "local" : "remote", - dccp_feat_fname(cur->feat_num), - dccp_feat_sname[cur->state]); - goto activation_failed; - } - fvals[idx][cur->is_local] = &cur->val; - } - - /* - * Activate in decreasing order of index, so that the CCIDs are always - * activated as the last feature. This avoids the case where a CCID - * relies on the initialisation of one or more features that it depends - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). - */ - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { - DCCP_CRIT("Could not activate %d", idx); - goto activation_failed; - } - - /* Clean up Change options which have been confirmed already */ - list_for_each_entry_safe(cur, next, fn_list, node) - if (!cur->needs_confirm) - dccp_feat_list_pop(cur); - - dccp_pr_debug("Activation OK\n"); - return 0; - -activation_failed: - /* - * We clean up everything that may have been allocated, since - * it is difficult to track at which stage negotiation failed. - * This is ok, since all allocation functions below are robust - * against NULL arguments. - */ - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - return -1; -} diff --git a/net/dccp/feat.h b/net/dccp/feat.h deleted file mode 100644 index 57d9c026aa3f5..0000000000000 --- a/net/dccp/feat.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_FEAT_H -#define _DCCP_FEAT_H -/* - * net/dccp/feat.h - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> - * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> - */ -#include <linux/types.h> -#include "dccp.h" - -/* - * Known limit values - */ -/* Ack Ratio takes 2-byte integer values (11.3) */ -#define DCCPF_ACK_RATIO_MAX 0xFFFF -/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ -#define DCCPF_SEQ_WMIN 32 -#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull -/* Maximum number of SP values that fit in a single (Confirm) option */ -#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) - -enum dccp_feat_type { - FEAT_AT_RX = 1, /* located at RX side of half-connection */ - FEAT_AT_TX = 2, /* located at TX side of half-connection */ - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ -}; - -enum dccp_feat_state { - FEAT_DEFAULT = 0, /* using default values from 6.4 */ - FEAT_INITIALISING, /* feature is being initialised */ - FEAT_CHANGING, /* Change sent but not confirmed yet */ - FEAT_UNSTABLE, /* local modification in state CHANGING */ - FEAT_STABLE /* both ends (think they) agree */ -}; - -/** - * dccp_feat_val - Container for SP or NN feature values - * @nn: single NN value - * @sp.vec: single SP value plus optional preference list - * @sp.len: length of @sp.vec in bytes - */ -typedef union { - u64 nn; - struct { - u8 *vec; - u8 len; - } sp; -} dccp_feat_val; - -/** - * struct feat_entry - Data structure to 
perform feature negotiation - * @val: feature's current value (SP features may have preference list) - * @state: feature's current state - * @feat_num: one of %dccp_feature_numbers - * @needs_mandatory: whether Mandatory options should be sent - * @needs_confirm: whether to send a Confirm instead of a Change - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) - * @is_local: feature location (1) or feature-remote (0) - * @node: list pointers, entries arranged in FIFO order - */ -struct dccp_feat_entry { - dccp_feat_val val; - enum dccp_feat_state state:8; - u8 feat_num; - - bool needs_mandatory, - needs_confirm, - empty_confirm, - is_local; - - struct list_head node; -}; - -static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) -{ - if (entry->needs_confirm) - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; - return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; -} - -/** - * struct ccid_dependency - Track changes resulting from choosing a CCID - * @dependent_feat: one of %dccp_feature_numbers - * @is_local: local (1) or remote (0) @dependent_feat - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) - */ -struct ccid_dependency { - u8 dependent_feat; - bool is_local:1, - is_mandatory:1; - u8 val; -}; - -/* - * Sysctls to seed defaults for feature negotiation - */ -extern unsigned long sysctl_dccp_sequence_window; -extern int sysctl_dccp_rx_ccid; -extern int sysctl_dccp_tx_ccid; - -int dccp_feat_init(struct sock *sk); -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len); -int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -int dccp_feat_clone_list(struct list_head const *, struct list_head *); - -/* - * Encoding variable-length options and their maximum length. - * - * This affects NN options (SP options are all u8) and other variable-length - * options (see table 3 in RFC 4340). The limit is currently given the Sequence - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other - * options consume less than 6 bytes (timestamps are 4 bytes). - * When updating this constant (e.g. due to new internet drafts / RFCs), make - * sure that you also update all code which refers to it. 
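The comment above caps variable-length option values at 6 bytes, enough for the 48-bit Sequence Window; the encode/decode helpers declared next pack such values big-endian, most significant byte first. A hedged userspace sketch of that packing (assuming len <= 6 and ignoring option framing):

#include <stdint.h>
#include <stdio.h>

/* Write the 'len' least-significant bytes of 'value', most significant first. */
static void encode_value_var(uint64_t value, uint8_t *to, uint8_t len)
{
    while (len--)
        *to++ = (uint8_t)(value >> (8 * len));
}

/* Inverse: rebuild the value from 'len' big-endian bytes. */
static uint64_t decode_value_var(const uint8_t *from, uint8_t len)
{
    uint64_t value = 0;

    while (len--)
        value = (value << 8) | *from++;
    return value;
}

int main(void)
{
    uint8_t buf[6];
    uint64_t seq_win = 0x3FFFFFFFFFFFULL;   /* Wmax from RFC 4340, 7.5.2 */

    encode_value_var(seq_win, buf, 6);
    printf("roundtrip ok: %d\n", decode_value_var(buf, 6) == seq_win);
    return 0;
}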
- */ -#define DCCP_OPTVAL_MAXLEN 6 - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); -u64 dccp_decode_value_var(const u8 *bf, const u8 len); -u64 dccp_feat_nn_get(struct sock *sk, u8 feat); - -int dccp_insert_option_mandatory(struct sk_buff *skb); -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len, - bool repeat_first); -#endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c deleted file mode 100644 index 2cbb757a894f8..0000000000000 --- a/net/dccp/input.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/input.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/skbuff.h> -#include <linux/slab.h> - -#include <net/sock.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ -int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; - -static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) -{ - __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - sk->sk_data_ready(sk); -} - -static void dccp_fin(struct sock *sk, struct sk_buff *skb) -{ - /* - * On receiving Close/CloseReq, both RD/WR shutdown are performed. - * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after - * receiving the closing segment, but there is no guarantee that such - * data will be processed at all. - */ - sk->sk_shutdown = SHUTDOWN_MASK; - sock_set_flag(sk, SOCK_DONE); - dccp_enqueue_skb(sk, skb); -} - -static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - switch (sk->sk_state) { - /* - * We ignore Close when received in one of the following states: - * - CLOSED (may be a late or duplicate packet) - * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) - * - RESPOND (already handled by dccp_check_req) - */ - case DCCP_CLOSING: - /* - * Simultaneous-close: receiving a Close after sending one. This - * can happen if both client and server perform active-close and - * will result in an endless ping-pong of crossing and retrans- - * mitted Close packets, which only terminates when one of the - * nodes times out (min. 64 seconds). Quicker convergence can be - * achieved when one of the nodes acts as tie-breaker. - * This is ok as both ends are done with data transfer and each - * end is just waiting for the other to acknowledge termination. - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) - break; - fallthrough; - case DCCP_REQUESTING: - case DCCP_ACTIVE_CLOSEREQ: - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_done(sk); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSE); - fallthrough; - case DCCP_PASSIVE_CLOSE: - /* - * Retransmitted Close: we have already enqueued the first one. 
- */ - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == CloseReq) - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); - return queued; - } - - /* Step 13: process relevant Client states < CLOSEREQ */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - dccp_send_close(sk, 0); - dccp_set_state(sk, DCCP_CLOSING); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); - fallthrough; - case DCCP_PASSIVE_CLOSEREQ: - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static u16 dccp_reset_code_convert(const u8 code) -{ - static const u16 error_code[] = { - [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ - [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ - [DCCP_RESET_CODE_ABORTED] = ECONNRESET, - - [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED, - [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED, - [DCCP_RESET_CODE_TOO_BUSY] = EUSERS, - [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT, - - [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG, - [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR, - [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC, - [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ, - [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP, - }; - - return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code]; -} - -static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) -{ - u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code); - - sk->sk_err = err; - - /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ - dccp_fin(sk, skb); - - if (err && !sock_flag(sk, SOCK_DEAD)) - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); - dccp_time_wait(sk, DCCP_TIME_WAIT, 0); -} - -static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; - - if (av == NULL) - return; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); - dccp_ackvec_input(av, skb); -} - -static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - /* Don't deliver to RX CCID when node has shut down read end. */ - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - /* - * Until the TX queue has been drained, we can not honour SHUT_WR, since - * we need received feedback as input to adjust congestion control. - */ - if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); -} - -static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - struct dccp_sock *dp = dccp_sk(sk); - u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq, - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - - /* - * Step 5: Prepare sequence numbers for Sync - * If P.type == Sync or P.type == SyncAck, - * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, - * / * P is valid, so update sequence number variables - * accordingly. 
After this update, P will pass the tests - * in Step 6. A SyncAck is generated if necessary in - * Step 15 * / - * Update S.GSR, S.SWL, S.SWH - * Otherwise, - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_SYNC || - dh->dccph_type == DCCP_PKT_SYNCACK) { - if (between48(ackno, dp->dccps_awl, dp->dccps_awh) && - dccp_delta_seqno(dp->dccps_swl, seqno) >= 0) - dccp_update_gsr(sk, seqno); - else - return -1; - } - - /* - * Step 6: Check sequence numbers - * Let LSWL = S.SWL and LAWL = S.AWL - * If P.type == CloseReq or P.type == Close or P.type == Reset, - * LSWL := S.GSR + 1, LAWL := S.GAR - * If LSWL <= P.seqno <= S.SWH - * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), - * Update S.GSR, S.SWL, S.SWH - * If P.type != Sync, - * Update S.GAR - */ - lswl = dp->dccps_swl; - lawl = dp->dccps_awl; - - if (dh->dccph_type == DCCP_PKT_CLOSEREQ || - dh->dccph_type == DCCP_PKT_CLOSE || - dh->dccph_type == DCCP_PKT_RESET) { - lswl = ADD48(dp->dccps_gsr, 1); - lawl = dp->dccps_gar; - } - - if (between48(seqno, lswl, dp->dccps_swh) && - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ || - between48(ackno, lawl, dp->dccps_awh))) { - dccp_update_gsr(sk, seqno); - - if (dh->dccph_type != DCCP_PKT_SYNC && - ackno != DCCP_PKT_WITHOUT_ACK_SEQ && - after48(ackno, dp->dccps_gar)) - dp->dccps_gar = ackno; - } else { - unsigned long now = jiffies; - /* - * Step 6: Check sequence numbers - * Otherwise, - * If P.type == Reset, - * Send Sync packet acknowledging S.GSR - * Otherwise, - * Send Sync packet acknowledging P.seqno - * Drop packet and return - * - * These Syncs are rate-limited as per RFC 4340, 7.5.4: - * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second. - */ - if (time_before(now, (dp->dccps_rate_last + - sysctl_dccp_sync_ratelimit))) - return -1; - - DCCP_WARN("Step 6 failed for %s packet, " - "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " - "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " - "sending SYNC...\n", dccp_packet_name(dh->dccph_type), - (unsigned long long) lswl, (unsigned long long) seqno, - (unsigned long long) dp->dccps_swh, - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? 
"doesn't exist" - : "exists", - (unsigned long long) lawl, (unsigned long long) ackno, - (unsigned long long) dp->dccps_awh); - - dp->dccps_rate_last = now; - - if (dh->dccph_type == DCCP_PKT_RESET) - seqno = dp->dccps_gsr; - dccp_send_sync(sk, seqno, DCCP_PKT_SYNC); - return -1; - } - - return 0; -} - -static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - - switch (dccp_hdr(skb)->dccph_type) { - case DCCP_PKT_DATAACK: - case DCCP_PKT_DATA: - /* - * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when - * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" - * - sk_receive_queue is full, use Code 2, "Receive Buffer" - */ - dccp_enqueue_skb(sk, skb); - return 0; - case DCCP_PKT_ACK: - goto discard; - case DCCP_PKT_RESET: - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - dccp_rcv_reset(sk, skb); - return 0; - case DCCP_PKT_CLOSEREQ: - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_CLOSE: - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_REQUEST: - /* Step 7 - * or (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state >= OPEN and P.type == Request - * and P.seqno >= S.OSR) - * or (S.state >= OPEN and P.type == Response - * and P.seqno >= S.OSR) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dp->dccps_role != DCCP_ROLE_LISTEN) - goto send_sync; - goto check_seq; - case DCCP_PKT_RESPONSE: - if (dp->dccps_role != DCCP_ROLE_CLIENT) - goto send_sync; -check_seq: - if (dccp_delta_seqno(dp->dccps_osr, - DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { -send_sync: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNC); - } - break; - case DCCP_PKT_SYNC: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNCACK); - /* - * From RFC 4340, sec. 5.7 - * - * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets - * MAY have non-zero-length application data areas, whose - * contents receivers MUST ignore. 
- */ - goto discard; - } - - DCCP_INC_STATS(DCCP_MIB_INERRS); -discard: - __kfree_skb(skb); - return 0; -} - -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - - return __dccp_rcv_established(sk, skb, dh, len); -discard: - __kfree_skb(skb); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_established); - -static int dccp_rcv_request_sent_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - /* - * Step 4: Prepare sequence numbers in REQUEST - * If S.state == REQUEST, - * If (P.type == Response or P.type == Reset) - * and S.AWL <= P.ackno <= S.AWH, - * / * Set sequence number variables corresponding to the - * other endpoint, so P will pass the tests in Step 6 * / - * Set S.GSR, S.ISR, S.SWL, S.SWH - * / * Response processing continues in Step 10; Reset - * processing continues in Step 9 * / - */ - if (dh->dccph_type == DCCP_PKT_RESPONSE) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - long tstamp = dccp_timestamp(); - - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dp->dccps_awl, dp->dccps_awh)) { - dccp_pr_debug("invalid ackno: S.AWL=%llu, " - "P.ackno=%llu, S.AWH=%llu\n", - (unsigned long long)dp->dccps_awl, - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long)dp->dccps_awh); - goto out_invalid_packet; - } - - /* - * If option processing (Step 8) failed, return 1 here so that - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on - * the option type and is set in dccp_parse_options(). - */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - - dp->dccps_options_received.dccpor_timestamp_echo)); - - /* Stop the REQUEST timer */ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); - WARN_ON(sk->sk_send_head == NULL); - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - - /* - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH - * is done as part of activating the feature values below, since - * these settings depend on the local/remote Sequence Window - * features, which were undefined or not confirmed until now. - */ - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - - dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); - - /* - * Step 10: Process REQUEST state (second part) - * If S.state == REQUEST, - * / * If we get here, P is a valid Response from the - * server (see Step 4), and we should move to - * PARTOPEN state. PARTOPEN means send an Ack, - * don't send Data packets, retransmit Acks - * periodically, and always include any Init Cookie - * from the Response * / - * S.state := PARTOPEN - * Set PARTOPEN timer - * Continue with S.state == PARTOPEN - * / * Step 12 will send the Ack completing the - * three-way handshake * / - */ - dccp_set_state(sk, DCCP_PARTOPEN); - - /* - * If feature negotiation was successful, activate features now; - * an activation failure means that this host could not activate - * one ore more features (e.g. insufficient memory), which would - * leave at least one feature in an undefined state. 
- */ - if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) - goto unable_to_proceed; - - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } - - if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) || - icsk->icsk_accept_queue.rskq_defer_accept) { - /* Save one ACK. Data will be ready after - * several ticks, if write_pending is set. - * - * It may be deleted, but with this feature tcpdumps - * look so _wonderfully_ clever, that I was not able - * to stand against the temptation 8) --ANK - */ - /* - * OK, in DCCP we can as well do a similar trick, its - * even in the draft, but there is no need for us to - * schedule an ack here, as dccp_sendmsg does this for - * us, also stated in the draft. -acme - */ - __kfree_skb(skb); - return 0; - } - dccp_send_ack(sk); - return -1; - } - -out_invalid_packet: - /* dccp_v4_do_rcv will send a reset */ - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - return 1; - -unable_to_proceed: - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; - /* - * We mark this socket as no longer usable, so that the loop in - * dccp_sendmsg() terminates and the application gets notified. - */ - dccp_set_state(sk, DCCP_CLOSED); - sk->sk_err = ECOMM; - return 1; -} - -static int dccp_rcv_respond_partopen_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - u32 sample = dp->dccps_options_received.dccpor_timestamp_echo; - int queued = 0; - - switch (dh->dccph_type) { - case DCCP_PKT_RESET: - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - break; - case DCCP_PKT_DATA: - if (sk->sk_state == DCCP_RESPOND) - break; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_ACK: - /* - * FIXME: we should be resetting the PARTOPEN (DELACK) timer - * here but only if we haven't used the DELACK timer for - * something else, like sending a delayed ack for a TIMESTAMP - * echo, etc, for now were not clearing it, sending an extra - * ACK when there is nothing else to do in DELACK is not a big - * deal after all. - */ - - /* Stop the PARTOPEN timer */ - if (sk->sk_state == DCCP_PARTOPEN) - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(sample)) { - long delta = dccp_timestamp() - sample; - - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta); - } - - dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_set_state(sk, DCCP_OPEN); - - if (dh->dccph_type == DCCP_PKT_DATAACK || - dh->dccph_type == DCCP_PKT_DATA) { - __dccp_rcv_established(sk, skb, dh, len); - queued = 1; /* packet was queued - (by __dccp_rcv_established) */ - } - break; - } - - return queued; -} - -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int old_state = sk->sk_state; - bool acceptable; - int queued = 0; - - /* - * Step 3: Process LISTEN state - * - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. 
This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init - * Cookies Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_LISTEN) { - if (dh->dccph_type == DCCP_PKT_REQUEST) { - /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH and RCU right there. - */ - rcu_read_lock(); - local_bh_disable(); - acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; - local_bh_enable(); - rcu_read_unlock(); - if (!acceptable) - return 1; - consume_skb(skb); - return 0; - } - if (dh->dccph_type == DCCP_PKT_RESET) - goto discard; - - /* Caller (dccp_v4_do_rcv) will send Reset */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - } else if (sk->sk_state == DCCP_CLOSED) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - } - - /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ - if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) - goto discard; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } - - /* Step 8: Process options */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_RESET) { - dccp_rcv_reset(sk, skb); - return 0; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - } - - switch (sk->sk_state) { - case DCCP_REQUESTING: - queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); - if (queued >= 0) - return queued; - - __kfree_skb(skb); - return 0; - - case DCCP_PARTOPEN: - /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - fallthrough; - case DCCP_RESPOND: - queued = dccp_rcv_respond_partopen_state_process(sk, skb, - dh, len); - break; - } - - if (dh->dccph_type == DCCP_PKT_ACK || - dh->dccph_type == DCCP_PKT_DATAACK) { - switch (old_state) { - case DCCP_PARTOPEN: - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - break; - } - } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK); - goto discard; - } - - if (!queued) { -discard: - __kfree_skb(skb); - } - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_state_process); - -/** - * 
dccp_sample_rtt - Validate and finalise computation of RTT sample - * @sk: socket structure - * @delta: number of microseconds between packet and acknowledgment - * - * The routine is kept generic to work in different contexts. It should be - * called immediately when the ACK used for the RTT sample arrives. - */ -u32 dccp_sample_rtt(struct sock *sk, long delta) -{ - /* dccpor_elapsed_time is either zeroed out or set and > 0 */ - delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - - if (unlikely(delta <= 0)) { - DCCP_WARN("unusable RTT sample %ld, using min\n", delta); - return DCCP_SANE_RTT_MIN; - } - if (unlikely(delta > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %ld too large, using max\n", delta); - return DCCP_SANE_RTT_MAX; - } - - return delta; -} diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c deleted file mode 100644 index 2045ddac0fe9f..0000000000000 --- a/net/dccp/ipv4.c +++ /dev/null @@ -1,1101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/ipv4.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/icmp.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/random.h> - -#include <net/icmp.h> -#include <net/inet_common.h> -#include <net/inet_dscp.h> -#include <net/inet_hashtables.h> -#include <net/inet_sock.h> -#include <net/protocol.h> -#include <net/sock.h> -#include <net/timewait_sock.h> -#include <net/tcp_states.h> -#include <net/xfrm.h> -#include <net/secure_seq.h> -#include <net/netns/generic.h> -#include <net/rstreason.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -struct dccp_v4_pernet { - struct sock *v4_ctl_sk; -}; - -static unsigned int dccp_v4_pernet_id __read_mostly; - -/* - * The per-net v4_ctl_sk socket is used for responding to - * the Out-of-the-blue (OOTB) packets. A control sock will be created - * for this socket at the initialization time. 
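dccp_sample_rtt above subtracts the peer-reported Elapsed Time (kept in units of 10 microseconds) from the measured delta and clamps the result to sane bounds before the sample reaches the congestion-control code. A sketch of the same adjustment; the MIN/MAX constants here are illustrative placeholders, not the kernel's DCCP_SANE_RTT values:

#include <stdio.h>

/* Illustrative bounds; the kernel's DCCP_SANE_RTT_{MIN,MAX} may differ. */
#define SANE_RTT_MIN_US      100L          /* 100 microseconds */
#define SANE_RTT_MAX_US  3000000L          /* 3 seconds */

/*
 * delta_us:      microseconds between sending a packet and its acknowledgment
 * elapsed_10us:  peer-reported Elapsed Time option, in units of 10 microseconds
 */
static long sample_rtt(long delta_us, long elapsed_10us)
{
    delta_us -= elapsed_10us * 10;         /* remove the peer's hold time */

    if (delta_us <= 0) {
        fprintf(stderr, "unusable RTT sample %ld, using min\n", delta_us);
        return SANE_RTT_MIN_US;
    }
    if (delta_us > SANE_RTT_MAX_US) {
        fprintf(stderr, "RTT sample %ld too large, using max\n", delta_us);
        return SANE_RTT_MAX_US;
    }
    return delta_us;
}

int main(void)
{
    printf("%ld\n", sample_rtt(25000, 500));  /* 25 ms minus 5 ms hold = 20000 us */
    printf("%ld\n", sample_rtt(4000, 500));   /* over-corrected -> clamped to min */
    return 0;
}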
- */ - -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) -{ - const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - __be16 orig_sport, orig_dport; - __be32 daddr, nexthop; - struct flowi4 *fl4; - struct rtable *rt; - int err; - struct ip_options_rcu *inet_opt; - - dp->dccps_role = DCCP_ROLE_CLIENT; - - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - if (usin->sin_family != AF_INET) - return -EAFNOSUPPORT; - - nexthop = daddr = usin->sin_addr.s_addr; - - inet_opt = rcu_dereference_protected(inet->inet_opt, - lockdep_sock_is_held(sk)); - if (inet_opt != NULL && inet_opt->opt.srr) { - if (daddr == 0) - return -EINVAL; - nexthop = inet_opt->opt.faddr; - } - - orig_sport = inet->inet_sport; - orig_dport = usin->sin_port; - fl4 = &inet->cork.fl.u.ip4; - rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, - sk->sk_bound_dev_if, IPPROTO_DCCP, orig_sport, - orig_dport, sk); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { - ip_rt_put(rt); - return -ENETUNREACH; - } - - if (inet_opt == NULL || !inet_opt->opt.srr) - daddr = fl4->daddr; - - if (inet->inet_saddr == 0) { - err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); - if (err) { - ip_rt_put(rt); - return err; - } - } else { - sk_rcv_saddr_set(sk, inet->inet_saddr); - } - - inet->inet_dport = usin->sin_port; - sk_daddr_set(sk, daddr); - - inet_csk(sk)->icsk_ext_hdr_len = 0; - if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - /* - * Socket identity is still unknown (sport may be zero). - * However we set state to DCCP_REQUESTING and not releasing socket - * lock select source port, enter ourselves into the hash tables and - * complete initialization after this. - */ - dccp_set_state(sk, DCCP_REQUESTING); - err = inet_hash_connect(&dccp_death_row, sk); - if (err != 0) - goto failure; - - rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, - inet->inet_sport, inet->inet_dport, sk); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto failure; - } - /* OK, now commit destination to socket. */ - sk_setup_caps(sk, &rt->dst); - - dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - inet->inet_dport); - atomic_set(&inet->inet_id, get_random_u16()); - - err = dccp_connect(sk); - rt = NULL; - if (err != 0) - goto failure; -out: - return err; -failure: - /* - * This unhashes the socket and releases the local port, if necessary. - */ - dccp_set_state(sk, DCCP_CLOSED); - inet_bhash2_reset_saddr(sk); - ip_rt_put(rt); - sk->sk_route_caps = 0; - inet->inet_dport = 0; - goto out; -} -EXPORT_SYMBOL_GPL(dccp_v4_connect); - -/* - * This routine does path mtu discovery as defined in RFC1191. - */ -static inline void dccp_do_pmtu_discovery(struct sock *sk, - const struct iphdr *iph, - u32 mtu) -{ - struct dst_entry *dst; - const struct inet_sock *inet = inet_sk(sk); - const struct dccp_sock *dp = dccp_sk(sk); - - /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs - * send out by Linux are always < 576bytes so they should go through - * unfragmented). - */ - if (sk->sk_state == DCCP_LISTEN) - return; - - dst = inet_csk_update_pmtu(sk, mtu); - if (!dst) - return; - - /* Something is about to be wrong... Remember soft error - * for the case, if this connection will not able to recover. 
- */ - if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) - WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); - - mtu = dst_mtu(dst); - - if (inet->pmtudisc != IP_PMTUDISC_DONT && - ip_sk_accept_pmtu(sk) && - inet_csk(sk)->icsk_pmtu_cookie > mtu) { - dccp_sync_mss(sk, mtu); - - /* - * From RFC 4340, sec. 14.1: - * - * DCCP-Sync packets are the best choice for upward - * probing, since DCCP-Sync probes do not risk application - * data loss. - */ - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); - } /* else let the usual retransmit timer handle it */ -} - -static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_check(sk, 0); - - if (dst) - dst->ops->redirect(dst, sk, skb); -} - -void dccp_req_err(struct sock *sk, u64 seq) - { - struct request_sock *req = inet_reqsk(sk); - struct net *net = sock_net(sk); - - /* - * ICMPs are not backlogged, hence we cannot get an established - * socket here. - */ - if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - } else { - /* - * Still in RESPOND, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(req->rsk_listener, req); - } - reqsk_put(req); -} -EXPORT_SYMBOL(dccp_req_err); - -/* - * This routine is called by the ICMP module when it gets some sort of error - * condition. If err < 0 then the socket should be closed and the error - * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. - * After adjustment header points to the first 8 bytes of the tcp header. We - * need to find the appropriate port. - * - * The locking strategy used here is very "optimistic". When someone else - * accesses the socket the ICMP is just dropped and for some paths there is no - * check at all. A more general error queue to queue errors for later handling - * is probably better. - */ -static int dccp_v4_err(struct sk_buff *skb, u32 info) -{ - const struct iphdr *iph = (struct iphdr *)skb->data; - const u8 offset = iph->ihl << 2; - const struct dccp_hdr *dh; - struct dccp_sock *dp; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - struct sock *sk; - __u64 seq; - int err; - struct net *net = dev_net(skb->dev); - - if (!pskb_may_pull(skb, offset + sizeof(*dh))) - return -EINVAL; - dh = (struct dccp_hdr *)(skb->data + offset); - if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) - return -EINVAL; - iph = (struct iphdr *)skb->data; - dh = (struct dccp_hdr *)(skb->data + offset); - - sk = __inet_lookup_established(net, &dccp_hashinfo, - iph->daddr, dh->dccph_dport, - iph->saddr, ntohs(dh->dccph_sport), - inet_iif(skb), 0); - if (!sk) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return -ENOENT; - } - - if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return 0; - } - seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - dccp_req_err(sk, seq); - return 0; - } - - bh_lock_sock(sk); - /* If too many ICMPs get dropped on busy - * servers this needs to be solved differently. 
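dccp_do_pmtu_discovery reacts to an ICMP Fragmentation Needed by shrinking the cached path MTU and then probing upward with DCCP-Sync packets, which per RFC 4340, 14.1 risk no application data. A simplified sketch of that decision, with hypothetical state and names:

#include <stdbool.h>
#include <stdio.h>

struct pmtu_state {
    unsigned int pmtu_cookie;     /* MTU currently used for segmentation */
    bool         accept_pmtu;     /* path-MTU discovery enabled on this socket */
};

/* Returns true when the caller should re-sync the MSS and send a Sync probe. */
static bool pmtu_shrink(struct pmtu_state *st, unsigned int reported_mtu)
{
    if (!st->accept_pmtu || st->pmtu_cookie <= reported_mtu)
        return false;             /* nothing to do, or PMTUD disabled */

    st->pmtu_cookie = reported_mtu;
    return true;                  /* caller: sync MSS, then send a DCCP-Sync */
}

int main(void)
{
    struct pmtu_state st = { .pmtu_cookie = 1460, .accept_pmtu = true };

    if (pmtu_shrink(&st, 1200))
        printf("shrunk to %u, send Sync probe\n", st.pmtu_cookie);
    return 0;
}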
- */ - if (sock_owned_by_user(sk)) - __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - dp = dccp_sk(sk); - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && - !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - switch (type) { - case ICMP_REDIRECT: - if (!sock_owned_by_user(sk)) - dccp_do_redirect(skb, sk); - goto out; - case ICMP_SOURCE_QUENCH: - /* Just silently ignore these. */ - goto out; - case ICMP_PARAMETERPROB: - err = EPROTO; - break; - case ICMP_DEST_UNREACH: - if (code > NR_ICMP_UNREACH) - goto out; - - if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ - if (!sock_owned_by_user(sk)) - dccp_do_pmtu_discovery(sk, iph, info); - goto out; - } - - err = icmp_err_convert[code].errno; - break; - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - default: - goto out; - } - - switch (sk->sk_state) { - case DCCP_REQUESTING: - case DCCP_RESPOND: - if (!sock_owned_by_user(sk)) { - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - sk->sk_err = err; - - sk_error_report(sk); - - dccp_done(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } - goto out; - } - - /* If we've already connected we will keep trying - * until we time out, or the user gives up. - * - * rfc1122 4.2.3.9 allows to consider as hard errors - * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, - * but it is obsoleted by pmtu discovery). - * - * Note, that in modern internet, where routing is unreliable - * and in each dark corner broken firewalls sit, sending random - * errors ordered by their masters even this two messages finally lose - * their original sense (even Linux sends invalid PORT_UNREACHs) - * - * Now we are in compliance with RFCs. - * --ANK (980905) - */ - - if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { - sk->sk_err = err; - sk_error_report(sk); - } else { /* Only an error on timeout */ - WRITE_ONCE(sk->sk_err_soft, err); - } -out: - bh_unlock_sock(sk); - sock_put(sk); - return 0; -} - -static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, - __be32 src, __be32 dst) -{ - return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb) -{ - const struct inet_sock *inet = inet_sk(sk); - struct dccp_hdr *dh = dccp_hdr(skb); - - dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v4_csum_finish(skb, - inet->inet_saddr, - inet->inet_daddr); -} -EXPORT_SYMBOL_GPL(dccp_v4_send_check); - -static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) -{ - return secure_dccp_sequence_number(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - dccp_hdr(skb)->dccph_dport, - dccp_hdr(skb)->dccph_sport); -} - -/* - * The three way handshake has completed - we got a valid ACK or DATAACK - - * now create the new socket. 
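dccp_v4_csum_finish folds the IPv4 pseudo-header (source, destination, protocol 33, length) into the checksum accumulated over the covered bytes, the same construction TCP and UDP use. A standalone sketch of that one's-complement sum (addresses taken as plain 32-bit values for illustration):

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit accumulator into a 16-bit one's-complement checksum. */
static uint16_t csum_fold(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum & 0xFFFF) + (sum >> 16);
    return (uint16_t)~sum;
}

/* One's-complement sum over the IPv4 pseudo-header plus 'len' covered bytes. */
static uint16_t dccp_csum(uint32_t saddr, uint32_t daddr,
                          const uint8_t *covered, uint16_t len,
                          uint16_t pkt_len)
{
    uint32_t sum = 0;
    uint16_t i;

    /* pseudo-header: src, dst, zero:protocol (33 = DCCP), total length */
    sum += (saddr >> 16) + (saddr & 0xFFFF);
    sum += (daddr >> 16) + (daddr & 0xFFFF);
    sum += 33;
    sum += pkt_len;

    /* covered bytes, 16 bits at a time, odd trailing byte padded with zero */
    for (i = 0; i + 1 < len; i += 2)
        sum += (covered[i] << 8) | covered[i + 1];
    if (len & 1)
        sum += covered[len - 1] << 8;

    return csum_fold(sum);
}

int main(void)
{
    uint8_t hdr[12] = { 0 };      /* pretend generic header, checksum zeroed */

    printf("checksum = 0x%04x\n",
           dccp_csum(0xC0A80001, 0xC0A80002, hdr, sizeof(hdr), sizeof(hdr)));
    return 0;
}

With partial checksum coverage (CsCov), 'len' would shrink to the covered prefix while 'pkt_len' still reflects the whole packet.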
- * - * This is the equivalent of TCP's tcp_v4_syn_recv_sock - */ -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct inet_request_sock *ireq; - struct inet_sock *newinet; - struct sock *newsk; - - if (sk_acceptq_is_full(sk)) - goto exit_overflow; - - newsk = dccp_create_openreq_child(sk, req, skb); - if (newsk == NULL) - goto exit_nonewsk; - - newinet = inet_sk(newsk); - ireq = inet_rsk(req); - RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); - newinet->mc_index = inet_iif(skb); - newinet->mc_ttl = ip_hdr(skb)->ttl; - atomic_set(&newinet->inet_id, get_random_u16()); - - if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) - goto put_and_exit; - - sk_setup_caps(newsk, dst); - - dccp_sync_mss(newsk, dst_mtu(dst)); - - if (__inet_inherit_port(sk, newsk) < 0) - goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); - if (*own_req) - ireq->ireq_opt = NULL; - else - newinet->inet_opt = NULL; - return newsk; - -exit_overflow: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -exit_nonewsk: - dst_release(dst); -exit: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); - return NULL; -put_and_exit: - newinet->inet_opt = NULL; - inet_csk_prepare_forced_close(newsk); - dccp_done(newsk); - goto exit; -} -EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); - -static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, - struct sk_buff *skb) -{ - struct rtable *rt; - const struct iphdr *iph = ip_hdr(skb); - struct flowi4 fl4 = { - .flowi4_oif = inet_iif(skb), - .daddr = iph->saddr, - .saddr = iph->daddr, - .flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))), - .flowi4_scope = ip_sock_rt_scope(sk), - .flowi4_proto = sk->sk_protocol, - .fl4_sport = dccp_hdr(skb)->dccph_dport, - .fl4_dport = dccp_hdr(skb)->dccph_sport, - }; - - security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); - rt = ip_route_output_flow(net, &fl4, sk); - if (IS_ERR(rt)) { - IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - - return &rt->dst; -} - -static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req) -{ - int err = -1; - struct sk_buff *skb; - struct dst_entry *dst; - struct flowi4 fl4; - - dst = inet_csk_route_req(sk, &fl4, req); - if (dst == NULL) - goto out; - - skb = dccp_make_response(sk, dst, req); - if (skb != NULL) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct dccp_hdr *dh = dccp_hdr(skb); - - dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, - ireq->ir_rmt_addr); - rcu_read_lock(); - err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt), - READ_ONCE(inet_sk(sk)->tos)); - rcu_read_unlock(); - err = net_xmit_eval(err); - } - -out: - dst_release(dst); - return err; -} - -static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, - enum sk_rst_reason reason) -{ - int err; - const struct iphdr *rxiph; - struct sk_buff *skb; - struct dst_entry *dst; - struct net *net = dev_net(skb_dst(rxskb)->dev); - struct dccp_v4_pernet *pn; - struct sock *ctl_sk; - - /* Never send a reset in response to a reset. 
*/ - if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) - return; - - if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) - return; - - pn = net_generic(net, dccp_v4_pernet_id); - ctl_sk = pn->v4_ctl_sk; - dst = dccp_v4_route_skb(net, ctl_sk, rxskb); - if (dst == NULL) - return; - - skb = dccp_ctl_make_reset(ctl_sk, rxskb); - if (skb == NULL) - goto out; - - rxiph = ip_hdr(rxskb); - dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr, - rxiph->daddr); - skb_dst_set(skb, dst_clone(dst)); - - local_bh_disable(); - bh_lock_sock(ctl_sk); - err = ip_build_and_send_pkt(skb, ctl_sk, - rxiph->daddr, rxiph->saddr, NULL, - inet_sk(ctl_sk)->tos); - bh_unlock_sock(ctl_sk); - - if (net_xmit_eval(err) == 0) { - __DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - __DCCP_INC_STATS(DCCP_MIB_OUTRSTS); - } - local_bh_enable(); -out: - dst_release(dst); -} - -static void dccp_v4_reqsk_destructor(struct request_sock *req) -{ - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); -} - -void dccp_syn_ack_timeout(const struct request_sock *req) -{ -} -EXPORT_SYMBOL(dccp_syn_ack_timeout); - -static struct request_sock_ops dccp_request_sock_ops __read_mostly = { - .family = PF_INET, - .obj_size = sizeof(struct dccp_request_sock), - .rtx_syn_ack = dccp_v4_send_response, - .send_ack = dccp_reqsk_send_ack, - .destructor = dccp_v4_reqsk_destructor, - .send_reset = dccp_v4_ctl_send_reset, - .syn_ack_timeout = dccp_syn_ack_timeout, -}; - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) -{ - struct inet_request_sock *ireq; - struct request_sock *req; - struct dccp_request_sock *dreq; - const __be32 service = dccp_hdr_request(skb)->dccph_req_service; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - - /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ - if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return 0; /* discard, don't send a reset here */ - - if (dccp_bad_service_code(sk, service)) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; - goto drop; - } - /* - * TW buckets are converted to open requests without - * limitations, they conserve resources and peer is - * evidently real one. - */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; - if (inet_csk_reqsk_queue_is_full(sk)) - goto drop; - - if (sk_acceptq_is_full(sk)) - goto drop; - - req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); - if (req == NULL) - goto drop; - - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; - - dreq = dccp_rsk(req); - if (dccp_parse_options(sk, dreq, skb)) - goto drop_and_free; - - ireq = inet_rsk(req); - sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); - sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->ir_mark = inet_request_mark(sk, skb); - ireq->ireq_family = AF_INET; - ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - /* - * Step 3: Process LISTEN state - * - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie - * - * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child(). 
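At the end of conn_request, the Step 3 state is seeded from the incoming Request: ISR and GSR mirror the packet's sequence number, ISS and GSS take a freshly generated secure initial sequence number, and SWL/SWH are filled in later by dccp_create_openreq_child. A tiny sketch of that seeding with hypothetical types:

#include <stdint.h>
#include <stdio.h>

struct dccp_req_seq {
    uint64_t isr, gsr;    /* initial / greatest seqno received */
    uint64_t iss, gss;    /* initial / greatest seqno sent */
};

static void seed_request(struct dccp_req_seq *req,
                         uint64_t pkt_seqno, uint64_t secure_iss)
{
    req->isr = req->gsr = pkt_seqno;     /* mirror the peer's Request seqno */
    req->iss = req->gss = secure_iss;    /* our side starts from a secure ISS */
}

int main(void)
{
    struct dccp_req_seq req;

    seed_request(&req, 0x123456ULL, 0xABCDEFULL);
    printf("ISR=GSR=%llx ISS=GSS=%llx\n",
           (unsigned long long)req.isr, (unsigned long long)req.iss);
    return 0;
}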
- */ - dreq->dreq_isr = dcb->dccpd_seq; - dreq->dreq_gsr = dreq->dreq_isr; - dreq->dreq_iss = dccp_v4_init_sequence(skb); - dreq->dreq_gss = dreq->dreq_iss; - dreq->dreq_service = service; - - if (dccp_v4_send_response(sk, req)) - goto drop_and_free; - - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) - reqsk_free(req); - else - reqsk_put(req); - - return 0; - -drop_and_free: - reqsk_free(req); -drop: - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - return -1; -} -EXPORT_SYMBOL_GPL(dccp_v4_conn_request); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_hdr *dh = dccp_hdr(skb); - - if (sk->sk_state == DCCP_OPEN) { /* Fast path */ - if (dccp_rcv_established(sk, skb, dh, skb->len)) - goto reset; - return 0; - } - - /* - * Step 3: Process LISTEN state - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - * Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - * - * NOTE: the check for the packet types is done in - * dccp_rcv_state_process - */ - - if (dccp_rcv_state_process(sk, skb, dh, skb->len)) - goto reset; - return 0; - -reset: - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - kfree_skb(skb); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); - -/** - * dccp_invalid_packet - check for malformed packets - * @skb: Packet to validate - * - * Implements RFC 4340, 8.5: Step 1: Check header basics - * Packets that fail these checks are ignored and do not receive Resets. 
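dccp_invalid_packet below implements RFC 4340, 8.5 Step 1: drop packets that are shorter than the 12-byte generic header, have an unknown type, carry an out-of-range Data Offset, use short sequence numbers for anything other than Data/Ack/DataAck, or claim checksum coverage beyond the packet. A compact sketch of the same checks over already-parsed header fields (constants defined locally from the RFC, not taken from the kernel headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct dccp_hdr_fields {          /* already-parsed generic header fields */
    uint16_t pkt_len;             /* total packet length in bytes */
    uint8_t  type;                /* packet type, 0..9 defined by RFC 4340 */
    uint8_t  doff;                /* Data Offset, in 32-bit words */
    uint8_t  x;                   /* Extended Sequence Numbers bit */
    uint16_t cscov_bytes;         /* checksum coverage translated to bytes */
};

#define GENERIC_HDR_LEN  12       /* bytes, short-seqno generic header */
#define PKT_DATA          2       /* RFC 4340, 5.1: Data .. DataAck */
#define PKT_DATAACK       4
#define PKT_MAX_TYPE      9       /* SyncAck is the last defined type */

static bool header_basics_ok(const struct dccp_hdr_fields *h)
{
    if (h->pkt_len < GENERIC_HDR_LEN)
        return false;                             /* shorter than 12 bytes */
    if (h->type > PKT_MAX_TYPE)
        return false;                             /* P.type not understood */
    if (h->doff * 4u < GENERIC_HDR_LEN || h->doff * 4u > h->pkt_len)
        return false;                             /* Data Offset out of range */
    if (h->x == 0 && (h->type < PKT_DATA || h->type > PKT_DATAACK))
        return false;                             /* short seqnos only for Data/Ack */
    if (h->cscov_bytes > h->pkt_len)
        return false;                             /* CsCov exceeds packet length */
    return true;
}

int main(void)
{
    struct dccp_hdr_fields req = { .pkt_len = 32, .type = 0 /* Request */,
                                   .doff = 8, .x = 1, .cscov_bytes = 32 };

    printf("request header ok: %d\n", header_basics_ok(&req));
    return 0;
}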
- */ -int dccp_invalid_packet(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - unsigned int cscov; - u8 dccph_doff; - - if (skb->pkt_type != PACKET_HOST) - return 1; - - /* If the packet is shorter than 12 bytes, drop packet and return */ - if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { - DCCP_WARN("pskb_may_pull failed\n"); - return 1; - } - - dh = dccp_hdr(skb); - - /* If P.type is not understood, drop packet and return */ - if (dh->dccph_type >= DCCP_PKT_INVALID) { - DCCP_WARN("invalid packet type\n"); - return 1; - } - - /* - * If P.Data Offset is too small for packet type, drop packet and return - */ - dccph_doff = dh->dccph_doff; - if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { - DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff); - return 1; - } - /* - * If P.Data Offset is too large for packet, drop packet and return - */ - if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) { - DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff); - return 1; - } - dh = dccp_hdr(skb); - /* - * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet - * has short sequence numbers), drop packet and return - */ - if ((dh->dccph_type < DCCP_PKT_DATA || - dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) { - DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n", - dccp_packet_name(dh->dccph_type)); - return 1; - } - - /* - * If P.CsCov is too large for the packet size, drop packet and return. - * This must come _before_ checksumming (not as RFC 4340 suggests). - */ - cscov = dccp_csum_coverage(skb); - if (cscov > skb->len) { - DCCP_WARN("P.CsCov %u exceeds packet length %d\n", - dh->dccph_cscov, skb->len); - return 1; - } - - /* If header checksum is incorrect, drop packet and return. - * (This step is completed in the AF-dependent functions.) */ - skb->csum = skb_checksum(skb, 0, cscov, 0); - - return 0; -} -EXPORT_SYMBOL_GPL(dccp_invalid_packet); - -/* this is called when real data arrives */ -static int dccp_v4_rcv(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - const struct iphdr *iph; - bool refcounted; - struct sock *sk; - int min_cov; - - /* Step 1: Check header basics */ - - if (dccp_invalid_packet(skb)) - goto discard_it; - - iph = ip_hdr(skb); - /* Step 1: If header checksum is incorrect, drop packet and return */ - if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) { - DCCP_WARN("dropped packet with invalid checksum\n"); - goto discard_it; - } - - dh = dccp_hdr(skb); - - DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); - DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - - dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu", - dccp_packet_name(dh->dccph_type), - &iph->saddr, ntohs(dh->dccph_sport), - &iph->daddr, ntohs(dh->dccph_dport), - (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); - - if (dccp_packet_without_ack(skb)) { - DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; - dccp_pr_debug_cat("\n"); - } else { - DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - } - -lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, 0, &refcounted); - if (!sk) { - dccp_pr_debug("failed to look up flow ID in table and " - "get corresponding socket\n"); - goto no_dccp_socket; - } - - /* - * Step 2: - * ... 
or S.state == TIMEWAIT, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); - inet_twsk_put(inet_twsk(sk)); - goto no_dccp_socket; - } - - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); - struct sock *nsk; - - sk = req->rsk_listener; - if (unlikely(sk->sk_state != DCCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; - } - sock_hold(sk); - refcounted = true; - nsk = dccp_check_req(sk, skb, req); - if (!nsk) { - reqsk_put(req); - goto discard_and_relse; - } - if (nsk == sk) { - reqsk_put(req); - } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - goto discard_and_relse; - } else { - sock_put(sk); - return 0; - } - } - /* - * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage - * o if MinCsCov = 0, only packets with CsCov = 0 are accepted - * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov - */ - min_cov = dccp_sk(sk)->dccps_pcrlen; - if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { - dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", - dh->dccph_cscov, min_cov); - /* FIXME: "Such packets SHOULD be reported using Data Dropped - * options (Section 11.7) with Drop Code 0, Protocol - * Constraints." */ - goto discard_and_relse; - } - - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) - goto discard_and_relse; - nf_reset_ct(skb); - - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); - -no_dccp_socket: - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_it; - /* - * Step 2: - * If no socket ... - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = - DCCP_RESET_CODE_NO_CONNECTION; - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - } - -discard_it: - kfree_skb(skb); - return 0; - -discard_and_relse: - if (refcounted) - sock_put(sk); - goto discard_it; -} - -static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { - .queue_xmit = ip_queue_xmit, - .send_check = dccp_v4_send_check, - .rebuild_header = inet_sk_rebuild_header, - .conn_request = dccp_v4_conn_request, - .syn_recv_sock = dccp_v4_request_recv_sock, - .net_header_len = sizeof(struct iphdr), - .setsockopt = ip_setsockopt, - .getsockopt = ip_getsockopt, -}; - -static int dccp_v4_init_sock(struct sock *sk) -{ - static __u8 dccp_v4_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v4_ctl_sock_initialized)) - dccp_v4_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; - } - - return err; -} - -static struct timewait_sock_ops dccp_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct inet_timewait_sock), -}; - -static struct proto dccp_v4_prot = { - .name = "DCCP", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v4_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v4_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v4_do_rcv, - .hash = inet_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, - .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = 
&dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp_request_sock_ops, - .twsk_prot = &dccp_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct net_protocol dccp_v4_protocol = { - .handler = dccp_v4_rcv, - .err_handler = dccp_v4_err, - .no_policy = 1, - .icmp_strict_tag_validation = 1, -}; - -static const struct proto_ops inet_dccp_ops = { - .family = PF_INET, - .owner = THIS_MODULE, - .release = inet_release, - .bind = inet_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet_getname, - /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll = dccp_poll, - .ioctl = inet_ioctl, - .gettstamp = sock_gettstamp, - /* FIXME: work on inet_listen to rename it to sock_common_listen */ - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -}; - -static struct inet_protosw dccp_v4_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v4_prot, - .ops = &inet_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v4_init_net(struct net *net) -{ - struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v4_exit_net(struct net *net) -{ - struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); - - inet_ctl_sock_destroy(pn->v4_ctl_sk); -} - -static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo); -} - -static struct pernet_operations dccp_v4_ops = { - .init = dccp_v4_init_net, - .exit = dccp_v4_exit_net, - .exit_batch = dccp_v4_exit_batch, - .id = &dccp_v4_pernet_id, - .size = sizeof(struct dccp_v4_pernet), -}; - -static int __init dccp_v4_init(void) -{ - int err = proto_register(&dccp_v4_prot, 1); - - if (err) - goto out; - - inet_register_protosw(&dccp_v4_protosw); - - err = register_pernet_subsys(&dccp_v4_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP); - if (err) - goto out_proto_unregister; - -out: - return err; -out_proto_unregister: - unregister_pernet_subsys(&dccp_v4_ops); -out_destroy_ctl_sock: - inet_unregister_protosw(&dccp_v4_protosw); - proto_unregister(&dccp_v4_prot); - goto out; -} - -static void __exit dccp_v4_exit(void) -{ - inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP); - unregister_pernet_subsys(&dccp_v4_ops); - inet_unregister_protosw(&dccp_v4_protosw); - proto_unregister(&dccp_v4_prot); -} - -module_init(dccp_v4_init); -module_exit(dccp_v4_exit); - -/* - * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) - * values directly, Also cover the case where the protocol is not specified, - * i.e. 
net-pf-PF_INET-proto-0-type-SOCK_DCCP - */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c deleted file mode 100644 index e24dbffabfc19..0000000000000 --- a/net/dccp/ipv6.c +++ /dev/null @@ -1,1174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DCCP over IPv6 - * Linux INET6 implementation - * - * Based on net/dccp6/ipv6.c - * - * Arnaldo Carvalho de Melo <acme@ghostprotocols.net> - */ - -#include <linux/module.h> -#include <linux/random.h> -#include <linux/slab.h> -#include <linux/xfrm.h> -#include <linux/string.h> - -#include <net/addrconf.h> -#include <net/inet_common.h> -#include <net/inet_hashtables.h> -#include <net/inet_sock.h> -#include <net/inet6_connection_sock.h> -#include <net/inet6_hashtables.h> -#include <net/ip6_route.h> -#include <net/ipv6.h> -#include <net/protocol.h> -#include <net/transp_v6.h> -#include <net/ip6_checksum.h> -#include <net/xfrm.h> -#include <net/secure_seq.h> -#include <net/netns/generic.h> -#include <net/sock.h> -#include <net/rstreason.h> - -#include "dccp.h" -#include "ipv6.h" -#include "feat.h" - -struct dccp_v6_pernet { - struct sock *v6_ctl_sk; -}; - -static unsigned int dccp_v6_pernet_id __read_mostly; - -/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */ - -static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; -static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; - -/* add pseudo-header to DCCP checksum stored in skb->csum */ -static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum); -} - -static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct dccp_hdr *dh = dccp_hdr(skb); - - dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr); -} - -static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) -{ - return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, - dccp_hdr(skb)->dccph_dport, - dccp_hdr(skb)->dccph_sport ); - -} - -static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - const struct ipv6hdr *hdr; - const struct dccp_hdr *dh; - struct dccp_sock *dp; - struct ipv6_pinfo *np; - struct sock *sk; - int err; - __u64 seq; - struct net *net = dev_net(skb->dev); - - if (!pskb_may_pull(skb, offset + sizeof(*dh))) - return -EINVAL; - dh = (struct dccp_hdr *)(skb->data + offset); - if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) - return -EINVAL; - hdr = (const struct ipv6hdr *)skb->data; - dh = (struct dccp_hdr *)(skb->data + offset); - - sk = __inet6_lookup_established(net, &dccp_hashinfo, - &hdr->daddr, dh->dccph_dport, - &hdr->saddr, ntohs(dh->dccph_sport), - inet6_iif(skb), 0); - - if (!sk) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return -ENOENT; - } - - if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return 0; - } - seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - dccp_req_err(sk, seq); - return 0; - } - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - __NET_INC_STATS(net, 
LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - dp = dccp_sk(sk); - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && - !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - np = inet6_sk(sk); - - if (type == NDISC_REDIRECT) { - if (!sock_owned_by_user(sk)) { - struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst) - dst->ops->redirect(dst, sk, skb); - } - goto out; - } - - if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst = NULL; - - if (!ip6_sk_accept_pmtu(sk)) - goto out; - - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) - goto out; - - dst = inet6_csk_update_pmtu(sk, ntohl(info)); - if (!dst) - goto out; - - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) - dccp_sync_mss(sk, dst_mtu(dst)); - goto out; - } - - icmpv6_err_convert(type, code, &err); - - /* Might be for an request_sock */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - case DCCP_RESPOND: /* Cannot happen. - It can, it SYNs are crossed. --ANK */ - if (!sock_owned_by_user(sk)) { - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - sk->sk_err = err; - /* - * Wake people up to see the error - * (see connect in sock.c) - */ - sk_error_report(sk); - dccp_done(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } - goto out; - } - - if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { - sk->sk_err = err; - sk_error_report(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } -out: - bh_unlock_sock(sk); - sock_put(sk); - return 0; -} - - -static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req) -{ - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *skb; - struct in6_addr *final_p, final; - struct flowi6 fl6; - int err = -1; - struct dst_entry *dst; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = ireq->ir_v6_rmt_addr; - fl6.saddr = ireq->ir_v6_loc_addr; - fl6.flowlabel = 0; - fl6.flowi6_oif = ireq->ir_iif; - fl6.fl6_dport = ireq->ir_rmt_port; - fl6.fl6_sport = htons(ireq->ir_num); - security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); - - - rcu_read_lock(); - final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); - rcu_read_unlock(); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto done; - } - - skb = dccp_make_response(sk, dst, req); - if (skb != NULL) { - struct dccp_hdr *dh = dccp_hdr(skb); - struct ipv6_txoptions *opt; - - dh->dccph_checksum = dccp_v6_csum_finish(skb, - &ireq->ir_v6_loc_addr, - &ireq->ir_v6_rmt_addr); - fl6.daddr = ireq->ir_v6_rmt_addr; - rcu_read_lock(); - opt = ireq->ipv6_opt; - if (!opt) - opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt, - np->tclass, READ_ONCE(sk->sk_priority)); - rcu_read_unlock(); - err = net_xmit_eval(err); - } - -done: - dst_release(dst); - return err; -} - -static void dccp_v6_reqsk_destructor(struct request_sock *req) -{ - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(inet_rsk(req)->ipv6_opt); - kfree_skb(inet_rsk(req)->pktopts); -} - -static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, - enum sk_rst_reason reason) -{ - const struct ipv6hdr *rxip6h; - struct sk_buff *skb; - struct flowi6 fl6; - struct net *net = dev_net(skb_dst(rxskb)->dev); - struct dccp_v6_pernet *pn; - struct sock 
*ctl_sk; - struct dst_entry *dst; - - if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) - return; - - if (!ipv6_unicast_destination(rxskb)) - return; - - pn = net_generic(net, dccp_v6_pernet_id); - ctl_sk = pn->v6_ctl_sk; - skb = dccp_ctl_make_reset(ctl_sk, rxskb); - if (skb == NULL) - return; - - rxip6h = ipv6_hdr(rxskb); - dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr, - &rxip6h->daddr); - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = rxip6h->saddr; - fl6.saddr = rxip6h->daddr; - - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.flowi6_oif = inet6_iif(rxskb); - fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; - fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; - security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); - - /* sk = NULL, but it is safe for now. RST socket required. */ - dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); - if (!IS_ERR(dst)) { - skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS(DCCP_MIB_OUTRSTS); - return; - } - - kfree_skb(skb); -} - -static struct request_sock_ops dccp6_request_sock_ops = { - .family = AF_INET6, - .obj_size = sizeof(struct dccp6_request_sock), - .rtx_syn_ack = dccp_v6_send_response, - .send_ack = dccp_reqsk_send_ack, - .destructor = dccp_v6_reqsk_destructor, - .send_reset = dccp_v6_ctl_send_reset, - .syn_ack_timeout = dccp_syn_ack_timeout, -}; - -static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) -{ - struct request_sock *req; - struct dccp_request_sock *dreq; - struct inet_request_sock *ireq; - struct ipv6_pinfo *np = inet6_sk(sk); - const __be32 service = dccp_hdr_request(skb)->dccph_req_service; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - - if (skb->protocol == htons(ETH_P_IP)) - return dccp_v4_conn_request(sk, skb); - - if (!ipv6_unicast_destination(skb)) - return 0; /* discard, don't send a reset here */ - - if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { - __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); - return 0; - } - - if (dccp_bad_service_code(sk, service)) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; - goto drop; - } - /* - * There are no SYN attacks on IPv6, yet... 
- */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; - if (inet_csk_reqsk_queue_is_full(sk)) - goto drop; - - if (sk_acceptq_is_full(sk)) - goto drop; - - req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); - if (req == NULL) - goto drop; - - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; - - dreq = dccp_rsk(req); - if (dccp_parse_options(sk, dreq, skb)) - goto drop_and_free; - - ireq = inet_rsk(req); - ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; - ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - ireq->ir_rmt_addr = LOOPBACK4_IPV6; - ireq->ir_loc_addr = LOOPBACK4_IPV6; - - ireq->ireq_family = AF_INET6; - ireq->ir_mark = inet_request_mark(sk, skb); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || - np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { - refcount_inc(&skb->users); - ireq->pktopts = skb; - } - ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); - - /* So that link locals have meaning */ - if (!ireq->ir_iif && - ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); - - /* - * Step 3: Process LISTEN state - * - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie - * - * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child(). - */ - dreq->dreq_isr = dcb->dccpd_seq; - dreq->dreq_gsr = dreq->dreq_isr; - dreq->dreq_iss = dccp_v6_init_sequence(skb); - dreq->dreq_gss = dreq->dreq_iss; - dreq->dreq_service = service; - - if (dccp_v6_send_response(sk, req)) - goto drop_and_free; - - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) - reqsk_free(req); - else - reqsk_put(req); - - return 0; - -drop_and_free: - reqsk_free(req); -drop: - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - return -1; -} - -static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *newnp; - const struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_txoptions *opt; - struct inet_sock *newinet; - struct dccp6_sock *newdp6; - struct sock *newsk; - - if (skb->protocol == htons(ETH_P_IP)) { - /* - * v6 mapped - */ - newsk = dccp_v4_request_recv_sock(sk, skb, req, dst, - req_unhash, own_req); - if (newsk == NULL) - return NULL; - - newdp6 = (struct dccp6_sock *)newsk; - newinet = inet_sk(newsk); - newinet->pinet6 = &newdp6->inet6; - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newnp->saddr = newsk->sk_v6_rcv_saddr; - - inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; - newsk->sk_backlog_rcv = dccp_v4_do_rcv; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->mcast_oif = inet_iif(skb); - newnp->mcast_hops = ip_hdr(skb)->ttl; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks count - * here, dccp_create_openreq_child now does this for us, see the comment in - * that function for the gory details. -acme - */ - - /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 icsk.icsk_af_ops. - Sync it now. 
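[Editor's note] Both dccp_v6_conn_request() and dccp_v6_request_recv_sock() above special-case IPv4-mapped IPv6 addresses (::ffff:a.b.c.d): such connections are handed to the IPv4 code, and the embedded IPv4 address lives in the last four bytes of the in6_addr (the kernel reads s6_addr32[3]). A small userspace sketch of that test and extraction, assuming only standard socket headers:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct in6_addr a6;
        struct in_addr a4;
        char buf[INET_ADDRSTRLEN];

        /* ::ffff:192.0.2.1 is the IPv4-mapped form of 192.0.2.1 */
        if (inet_pton(AF_INET6, "::ffff:192.0.2.1", &a6) != 1)
                return 1;

        if (IN6_IS_ADDR_V4MAPPED(&a6)) {
                /* the IPv4 address occupies the last 32 bits, already
                 * in network byte order */
                memcpy(&a4, &a6.s6_addr[12], sizeof(a4));
                printf("v4-mapped, embedded address: %s\n",
                       inet_ntop(AF_INET, &a4, buf, sizeof(buf)));
        } else {
                printf("native IPv6 address\n");
        }
        return 0;
}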
- */ - dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); - - return newsk; - } - - - if (sk_acceptq_is_full(sk)) - goto out_overflow; - - if (!dst) { - struct flowi6 fl6; - - dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP); - if (!dst) - goto out; - } - - newsk = dccp_create_openreq_child(sk, req, skb); - if (newsk == NULL) - goto out_nonewsk; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks - * count here, dccp_create_openreq_child now does this for us, see the - * comment in that function for the gory details. -acme - */ - - ip6_dst_store(newsk, dst, NULL, NULL); - newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | - NETIF_F_TSO); - newdp6 = (struct dccp6_sock *)newsk; - newinet = inet_sk(newsk); - newinet->pinet6 = &newdp6->inet6; - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newnp->saddr = ireq->ir_v6_loc_addr; - - /* Now IPv6 options... - - First: no IPv4 options. - */ - newinet->inet_opt = NULL; - - /* Clone RX bits */ - newnp->rxopt.all = np->rxopt.all; - - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - - /* - * Clone native IPv6 options from listening socket (if any) - * - * Yes, keeping reference count would be much more clever, but we make - * one more one thing there: reattach optmem to newsk. - */ - opt = ireq->ipv6_opt; - if (!opt) - opt = rcu_dereference(np->opt); - if (opt) { - opt = ipv6_dup_options(newsk, opt); - RCU_INIT_POINTER(newnp->opt, opt); - } - inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (opt) - inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + - opt->opt_flen; - - dccp_sync_mss(newsk, dst_mtu(dst)); - - if (__inet_inherit_port(sk, newsk) < 0) { - inet_csk_prepare_forced_close(newsk); - dccp_done(newsk); - goto out; - } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); - /* Clone pktoptions received with SYN, if we own the req */ - if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); - consume_skb(ireq->pktopts); - ireq->pktopts = NULL; - } - - return newsk; - -out_overflow: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out_nonewsk: - dst_release(dst); -out: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); - return NULL; -} - -/* The socket must have it's spinlock held when we get - * here. - * - * We have a potential double-lock case here, so even when - * doing backlog processing we use the BH locking scheme. - * This is because we cannot sleep with the original spinlock - * held. - */ -static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *opt_skb = NULL; - - /* Imagine: socket is IPv6. IPv4 packet arrives, - goes to IPv4 receive handler and backlogged. - From backlog it always goes here. Kerboom... - Fortunately, dccp_rcv_established and rcv_established - handle them correctly, but it is not case with - dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK - */ - - if (skb->protocol == htons(ETH_P_IP)) - return dccp_v4_do_rcv(sk, skb); - - if (sk_filter(sk, skb)) - goto discard; - - /* - * socket locking is here for SMP purposes as backlog rcv is currently - * called with bh processing disabled. - */ - - /* Do Stevens' IPV6_PKTOPTIONS. - - Yes, guys, it is the only place in our code, where we - may make it not affecting IPv4. 
- The rest of code is protocol independent, - and I do not like idea to uglify IPv4. - - Actually, all the idea behind IPV6_PKTOPTIONS - looks not very well thought. For now we latch - options, received in the last packet, enqueued - by tcp. Feel free to propose better solution. - --ANK (980728) - */ - if (np->rxopt.all && sk->sk_state != DCCP_LISTEN) - opt_skb = skb_clone_and_charge_r(skb, sk); - - if (sk->sk_state == DCCP_OPEN) { /* Fast path */ - if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) - goto reset; - if (opt_skb) - goto ipv6_pktoptions; - return 0; - } - - /* - * Step 3: Process LISTEN state - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - * Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - * - * NOTE: the check for the packet types is done in - * dccp_rcv_state_process - */ - - if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) - goto reset; - if (opt_skb) - goto ipv6_pktoptions; - return 0; - -reset: - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); -discard: - if (opt_skb != NULL) - __kfree_skb(opt_skb); - kfree_skb(skb); - return 0; - -/* Handling IPV6_PKTOPTIONS skb the similar - * way it's done for net/ipv6/tcp_ipv6.c - */ -ipv6_pktoptions: - if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) { - if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - WRITE_ONCE(np->mcast_oif, inet6_iif(opt_skb)); - if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) - WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); - if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) - np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); - if (inet6_test_bit(REPFLOW, sk)) - np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb, - &DCCP_SKB_CB(opt_skb)->header.h6)) { - memmove(IP6CB(opt_skb), - &DCCP_SKB_CB(opt_skb)->header.h6, - sizeof(struct inet6_skb_parm)); - opt_skb = xchg(&np->pktoptions, opt_skb); - } else { - __kfree_skb(opt_skb); - opt_skb = xchg(&np->pktoptions, NULL); - } - } - - kfree_skb(opt_skb); - return 0; -} - -static int dccp_v6_rcv(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - bool refcounted; - struct sock *sk; - int min_cov; - - /* Step 1: Check header basics */ - - if (dccp_invalid_packet(skb)) - goto discard_it; - - /* Step 1: If header checksum is incorrect, drop packet and return. 
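[Editor's note] dccp_v4_rcv() earlier and dccp_v6_rcv() just below both enforce RFC 4340's partial-checksum rules: CsCov selects how much of the packet the checksum covers, and a receiver with a MinCsCov setting only accepts packets whose coverage is at least that large. A minimal sketch of both computations, assuming the RFC 4340 section 9.2 coverage formula (CsCov == 0 covers the whole packet; CsCov > 0 covers the header plus the first (CsCov - 1) * 4 bytes of application data):

#include <assert.h>
#include <stddef.h>

/* Bytes covered by the DCCP checksum for a packet of pkt_len bytes
 * whose header (including options) is hdr_len bytes long. */
static size_t dccp_coverage(size_t pkt_len, size_t hdr_len, unsigned int cscov)
{
        if (cscov == 0)
                return pkt_len;                   /* whole packet */
        return hdr_len + (cscov - 1) * 4;         /* header + first (CsCov-1)*4 data bytes */
}

/* RFC 4340, 9.2.1: Minimum Checksum Coverage, as enforced in dccp_*_rcv():
 *  o if MinCsCov = 0, only packets with CsCov = 0 are accepted
 *  o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov */
static int cscov_acceptable(unsigned int cscov, unsigned int min_cscov)
{
        if (cscov == 0)
                return 1;
        return min_cscov > 0 && cscov >= min_cscov;
}

int main(void)
{
        assert(dccp_coverage(1500, 20, 0) == 1500);   /* full coverage */
        assert(dccp_coverage(1500, 20, 1) == 20);     /* header only */
        assert(dccp_coverage(1500, 20, 4) == 32);     /* header + 12 data bytes */

        assert(cscov_acceptable(0, 0));     /* always fine */
        assert(!cscov_acceptable(3, 0));    /* MinCsCov 0 demands full coverage */
        assert(cscov_acceptable(5, 4));     /* 5 >= MinCsCov 4 */
        assert(!cscov_acceptable(2, 4));    /* too little coverage */
        return 0;
}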
*/ - if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr)) { - DCCP_WARN("dropped packet with invalid checksum\n"); - goto discard_it; - } - - dh = dccp_hdr(skb); - - DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); - DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - - if (dccp_packet_without_ack(skb)) - DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; - else - DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - -lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, - inet6_iif(skb), 0, &refcounted); - if (!sk) { - dccp_pr_debug("failed to look up flow ID in table and " - "get corresponding socket\n"); - goto no_dccp_socket; - } - - /* - * Step 2: - * ... or S.state == TIMEWAIT, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); - inet_twsk_put(inet_twsk(sk)); - goto no_dccp_socket; - } - - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); - struct sock *nsk; - - sk = req->rsk_listener; - if (unlikely(sk->sk_state != DCCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; - } - sock_hold(sk); - refcounted = true; - nsk = dccp_check_req(sk, skb, req); - if (!nsk) { - reqsk_put(req); - goto discard_and_relse; - } - if (nsk == sk) { - reqsk_put(req); - } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - goto discard_and_relse; - } else { - sock_put(sk); - return 0; - } - } - /* - * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage - * o if MinCsCov = 0, only packets with CsCov = 0 are accepted - * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov - */ - min_cov = dccp_sk(sk)->dccps_pcrlen; - if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { - dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", - dh->dccph_cscov, min_cov); - /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */ - goto discard_and_relse; - } - - if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) - goto discard_and_relse; - nf_reset_ct(skb); - - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, - refcounted) ? -1 : 0; - -no_dccp_socket: - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_it; - /* - * Step 2: - * If no socket ... 
- * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = - DCCP_RESET_CODE_NO_CONNECTION; - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - } - -discard_it: - kfree_skb(skb); - return 0; - -discard_and_relse: - if (refcounted) - sock_put(sk); - goto discard_it; -} - -static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) -{ - struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct in6_addr *saddr = NULL, *final_p, final; - struct ipv6_txoptions *opt; - struct flowi6 fl6; - struct dst_entry *dst; - int addr_type; - int err; - - dp->dccps_role = DCCP_ROLE_CLIENT; - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - if (usin->sin6_family != AF_INET6) - return -EAFNOSUPPORT; - - memset(&fl6, 0, sizeof(fl6)); - - if (inet6_test_bit(SNDFLOW, sk)) { - fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; - IP6_ECN_flow_init(fl6.flowlabel); - if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { - struct ip6_flowlabel *flowlabel; - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (IS_ERR(flowlabel)) - return -EINVAL; - fl6_sock_release(flowlabel); - } - } - /* - * connect() to INADDR_ANY means loopback (BSD'ism). - */ - if (ipv6_addr_any(&usin->sin6_addr)) - usin->sin6_addr.s6_addr[15] = 1; - - addr_type = ipv6_addr_type(&usin->sin6_addr); - - if (addr_type & IPV6_ADDR_MULTICAST) - return -ENETUNREACH; - - if (addr_type & IPV6_ADDR_LINKLOCAL) { - if (addr_len >= sizeof(struct sockaddr_in6) && - usin->sin6_scope_id) { - /* If interface is set while binding, indices - * must coincide. - */ - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) - return -EINVAL; - - sk->sk_bound_dev_if = usin->sin6_scope_id; - } - - /* Connect to link-local address requires an interface */ - if (!sk->sk_bound_dev_if) - return -EINVAL; - } - - sk->sk_v6_daddr = usin->sin6_addr; - np->flow_label = fl6.flowlabel; - - /* - * DCCP over IPv4 - */ - if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = icsk->icsk_ext_hdr_len; - struct sockaddr_in sin; - - net_dbg_ratelimited("connect: ipv4 mapped\n"); - - if (ipv6_only_sock(sk)) - return -ENETUNREACH; - - sin.sin_family = AF_INET; - sin.sin_port = usin->sin6_port; - sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - - icsk->icsk_af_ops = &dccp_ipv6_mapped; - sk->sk_backlog_rcv = dccp_v4_do_rcv; - - err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); - if (err) { - icsk->icsk_ext_hdr_len = exthdrlen; - icsk->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_backlog_rcv = dccp_v6_do_rcv; - goto failure; - } - np->saddr = sk->sk_v6_rcv_saddr; - return err; - } - - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) - saddr = &sk->sk_v6_rcv_saddr; - - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = saddr ? 
*saddr : np->saddr; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.fl6_dport = usin->sin6_port; - fl6.fl6_sport = inet->inet_sport; - security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - - opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - final_p = fl6_update_dst(&fl6, opt, &final); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - goto failure; - } - - if (saddr == NULL) { - saddr = &fl6.saddr; - - err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); - if (err) - goto failure; - } - - /* set the source address */ - np->saddr = *saddr; - inet->inet_rcv_saddr = LOOPBACK4_IPV6; - - ip6_dst_store(sk, dst, NULL, NULL); - - icsk->icsk_ext_hdr_len = 0; - if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; - - inet->inet_dport = usin->sin6_port; - - dccp_set_state(sk, DCCP_REQUESTING); - err = inet6_hash_connect(&dccp_death_row, sk); - if (err) - goto late_failure; - - dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport); - err = dccp_connect(sk); - if (err) - goto late_failure; - - return 0; - -late_failure: - dccp_set_state(sk, DCCP_CLOSED); - inet_bhash2_reset_saddr(sk); - __sk_dst_reset(sk); -failure: - inet->inet_dport = 0; - sk->sk_route_caps = 0; - return err; -} - -static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { - .queue_xmit = inet6_csk_xmit, - .send_check = dccp_v6_send_check, - .rebuild_header = inet6_sk_rebuild_header, - .conn_request = dccp_v6_conn_request, - .syn_recv_sock = dccp_v6_request_recv_sock, - .net_header_len = sizeof(struct ipv6hdr), - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, -}; - -/* - * DCCP over IPv4 via INET6 API - */ -static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { - .queue_xmit = ip_queue_xmit, - .send_check = dccp_v4_send_check, - .rebuild_header = inet_sk_rebuild_header, - .conn_request = dccp_v6_conn_request, - .syn_recv_sock = dccp_v6_request_recv_sock, - .net_header_len = sizeof(struct iphdr), - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, -}; - -static void dccp_v6_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet6_sock_destruct(sk); -} - -/* NOTE: A lot of things set to zero explicitly by call to - * sk_alloc() so need not be done here. 
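[Editor's note] dccp_v6_connect() above implements the BSD convention that a connect() to the unspecified address means loopback, by patching the last byte of the all-zeroes address so that :: becomes ::1. A tiny userspace demonstration of that transformation, using only standard headers:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct in6_addr dst;
        char buf[INET6_ADDRSTRLEN];

        /* caller asked to connect() to :: (the unspecified address) */
        memset(&dst, 0, sizeof(dst));

        /* the convention applied in dccp_v6_connect():
         * treat it as loopback by setting the last byte to 1 */
        if (IN6_IS_ADDR_UNSPECIFIED(&dst))
                dst.s6_addr[15] = 1;

        printf("effective destination: %s\n",
               inet_ntop(AF_INET6, &dst, buf, sizeof(buf)));  /* prints ::1 */
        return 0;
}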
- */ -static int dccp_v6_init_sock(struct sock *sk) -{ - static __u8 dccp_v6_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v6_ctl_sock_initialized)) - dccp_v6_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_destruct = dccp_v6_sk_destruct; - } - - return err; -} - -static struct timewait_sock_ops dccp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct dccp6_timewait_sock), -}; - -static struct proto dccp_v6_prot = { - .name = "DCCPv6", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v6_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v6_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v6_do_rcv, - .hash = inet6_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, - .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = &dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp6_sock), - .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp6_request_sock_ops, - .twsk_prot = &dccp6_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct inet6_protocol dccp_v6_protocol = { - .handler = dccp_v6_rcv, - .err_handler = dccp_v6_err, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, -}; - -static const struct proto_ops inet6_dccp_ops = { - .family = PF_INET6, - .owner = THIS_MODULE, - .release = inet6_release, - .bind = inet6_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet6_getname, - .poll = dccp_poll, - .ioctl = inet6_ioctl, - .gettstamp = sock_gettstamp, - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -#ifdef CONFIG_COMPAT - .compat_ioctl = inet6_compat_ioctl, -#endif -}; - -static struct inet_protosw dccp_v6_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v6_prot, - .ops = &inet6_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v6_init_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v6_exit_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - inet_ctl_sock_destroy(pn->v6_ctl_sk); -} - -static struct pernet_operations dccp_v6_ops = { - .init = dccp_v6_init_net, - .exit = dccp_v6_exit_net, - .id = &dccp_v6_pernet_id, - .size = sizeof(struct dccp_v6_pernet), -}; - -static int __init dccp_v6_init(void) -{ - int err = proto_register(&dccp_v6_prot, 1); - - if (err) - goto out; - - inet6_register_protosw(&dccp_v6_protosw); - - err = register_pernet_subsys(&dccp_v6_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); - if (err) - goto out_unregister_proto; - -out: - return err; -out_unregister_proto: - unregister_pernet_subsys(&dccp_v6_ops); -out_destroy_ctl_sock: - inet6_unregister_protosw(&dccp_v6_protosw); - proto_unregister(&dccp_v6_prot); - 
goto out; -} - -static void __exit dccp_v6_exit(void) -{ - inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); - unregister_pernet_subsys(&dccp_v6_ops); - inet6_unregister_protosw(&dccp_v6_protosw); - proto_unregister(&dccp_v6_prot); -} - -module_init(dccp_v6_init); -module_exit(dccp_v6_exit); - -/* - * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) - * values directly, Also cover the case where the protocol is not specified, - * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP - */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); -MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h deleted file mode 100644 index c5d14c48def17..0000000000000 --- a/net/dccp/ipv6.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_IPV6_H -#define _DCCP_IPV6_H -/* - * net/dccp/ipv6.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> - */ - -#include <linux/dccp.h> -#include <linux/ipv6.h> - -struct dccp6_sock { - struct dccp_sock dccp; - struct ipv6_pinfo inet6; -}; - -struct dccp6_request_sock { - struct dccp_request_sock dccp; -}; - -struct dccp6_timewait_sock { - struct inet_timewait_sock inet; -}; - -#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c deleted file mode 100644 index fecc8190064f8..0000000000000 --- a/net/dccp/minisocks.c +++ /dev/null @@ -1,266 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/minisocks.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/timer.h> - -#include <net/sock.h> -#include <net/xfrm.h> -#include <net/inet_timewait_sock.h> -#include <net/rstreason.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -struct inet_timewait_death_row dccp_death_row = { - .tw_refcount = REFCOUNT_INIT(1), - .sysctl_max_tw_buckets = NR_FILE * 2, - .hashinfo = &dccp_hashinfo, -}; - -EXPORT_SYMBOL_GPL(dccp_death_row); - -void dccp_time_wait(struct sock *sk, int state, int timeo) -{ - struct inet_timewait_sock *tw; - - tw = inet_twsk_alloc(sk, &dccp_death_row, state); - - if (tw != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); -#if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == PF_INET6) { - tw->tw_v6_daddr = sk->sk_v6_daddr; - tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - tw->tw_ipv6only = sk->sk_ipv6only; - } -#endif - - /* Get the TIME_WAIT timeout firing. */ - if (timeo < rto) - timeo = rto; - - if (state == DCCP_TIME_WAIT) - timeo = DCCP_TIMEWAIT_LEN; - - /* Linkage updates. - * Note that access to tw after this point is illegal. - */ - inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo); - } else { - /* Sorry, if we're out of memory, just CLOSE this - * socket up. We've got bigger problems than - * non-graceful socket closings. 
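[Editor's note] The timeout chosen in dccp_time_wait() above, (icsk_rto << 2) - (icsk_rto >> 1), is integer shorthand for 3.5 * RTO (four times the RTO minus half of it). A quick check with hypothetical RTO values:

#include <assert.h>
#include <stdio.h>

static unsigned int timewait_timeo(unsigned int rto)
{
        /* (rto << 2) - (rto >> 1) == 4*rto - rto/2 == 3.5 * rto
         * (integer division, so odd values round down by half a tick) */
        return (rto << 2) - (rto >> 1);
}

int main(void)
{
        assert(timewait_timeo(200) == 700);     /* 3.5 * 200 */
        assert(timewait_timeo(1000) == 3500);   /* 3.5 * 1000 */
        printf("3.5 * RTO for RTO=200: %u\n", timewait_timeo(200));
        return 0;
}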
- */ - DCCP_WARN("time wait bucket table overflow\n"); - } - - dccp_done(sk); -} - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb) -{ - /* - * Step 3: Process LISTEN state - * - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - */ - struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); - - if (newsk != NULL) { - struct dccp_request_sock *dreq = dccp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct dccp_sock *newdp = dccp_sk(newsk); - - newdp->dccps_role = DCCP_ROLE_SERVER; - newdp->dccps_hc_rx_ackvec = NULL; - newdp->dccps_service_list = NULL; - newdp->dccps_hc_rx_ccid = NULL; - newdp->dccps_hc_tx_ccid = NULL; - newdp->dccps_service = dreq->dreq_service; - newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; - newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; - newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - - INIT_LIST_HEAD(&newdp->dccps_featneg); - /* - * Step 3: Process LISTEN state - * - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR from packet (or Init Cookies) - * - * Setting AWL/AWH and SWL/SWH happens as part of the feature - * activation below, as these windows all depend on the local - * and remote Sequence Window feature values (7.5.2). - */ - newdp->dccps_iss = dreq->dreq_iss; - newdp->dccps_gss = dreq->dreq_gss; - newdp->dccps_gar = newdp->dccps_iss; - newdp->dccps_isr = dreq->dreq_isr; - newdp->dccps_gsr = dreq->dreq_gsr; - - /* - * Activate features: initialise CCIDs, sequence windows etc. - */ - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - sk_free_unlock_clone(newsk); - return NULL; - } - dccp_init_xmit_timers(newsk); - - __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS); - } - return newsk; -} - -EXPORT_SYMBOL_GPL(dccp_create_openreq_child); - -/* - * Process an incoming packet for RESPOND sockets represented - * as an request_sock. - */ -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req) -{ - struct sock *child = NULL; - struct dccp_request_sock *dreq = dccp_rsk(req); - bool own_req; - - /* TCP/DCCP listeners became lockless. - * DCCP stores complex state in its request_sock, so we need - * a protection for them, now this code runs without being protected - * by the parent (listener) lock. - */ - spin_lock_bh(&dreq->dreq_lock); - - /* Check for retransmitted REQUEST */ - if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { - - if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) { - dccp_pr_debug("Retransmitted REQUEST\n"); - dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq; - /* - * Send another RESPONSE packet - * To protect against Request floods, increment retrans - * counter (backoff, monitored by dccp_response_timer). 
- */ - inet_rtx_syn_ack(sk, req); - } - /* Network Duplicate, discard packet */ - goto out; - } - - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - - if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK && - dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK) - goto drop; - - /* Invalid ACK */ - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dreq->dreq_iss, dreq->dreq_gss)) { - dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " - "dreq_iss=%llu, dreq_gss=%llu\n", - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long) dreq->dreq_iss, - (unsigned long long) dreq->dreq_gss); - goto drop; - } - - if (dccp_parse_options(sk, dreq, skb)) - goto drop; - - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, - req, &own_req); - if (child) { - child = inet_csk_complete_hashdance(sk, child, req, own_req); - goto out; - } - - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; -drop: - if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) - req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - - inet_csk_reqsk_queue_drop(sk, req); -out: - spin_unlock_bh(&dreq->dreq_lock); - return child; -} - -EXPORT_SYMBOL_GPL(dccp_check_req); - -/* - * Queue segment on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket. - */ -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb) - __releases(child) -{ - int ret = 0; - const int state = child->sk_state; - - if (!sock_owned_by_user(child)) { - ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), - skb->len); - - /* Wakeup parent, send SIGIO */ - if (state == DCCP_RESPOND && child->sk_state != state) - parent->sk_data_ready(parent); - } else { - /* Alas, it is possible again, because we do lookup - * in main socket hash table and lock on listening - * socket does not protect us more. 
- */ - __sk_add_backlog(child, skb); - } - - bh_unlock_sock(child); - sock_put(child); - return ret; -} - -EXPORT_SYMBOL_GPL(dccp_child_process); - -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk) -{ - DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); - -int dccp_reqsk_init(struct request_sock *req, - struct dccp_sock const *dp, struct sk_buff const *skb) -{ - struct dccp_request_sock *dreq = dccp_rsk(req); - - spin_lock_init(&dreq->dreq_lock); - inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; - inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); - inet_rsk(req)->acked = 0; - dreq->dreq_timestamp_echo = 0; - - /* inherit feature negotiation options from listening socket */ - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c deleted file mode 100644 index db62d47670249..0000000000000 --- a/net/dccp/options.c +++ /dev/null @@ -1,609 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/options.c - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> - * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> - */ -#include <linux/dccp.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/unaligned.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -u64 dccp_decode_value_var(const u8 *bf, const u8 len) -{ - u64 value = 0; - - if (len >= DCCP_OPTVAL_MAXLEN) - value += ((u64)*bf++) << 40; - if (len > 4) - value += ((u64)*bf++) << 32; - if (len > 3) - value += ((u64)*bf++) << 24; - if (len > 2) - value += ((u64)*bf++) << 16; - if (len > 1) - value += ((u64)*bf++) << 8; - if (len > 0) - value += *bf; - - return value; -} - -/** - * dccp_parse_options - Parse DCCP options present in @skb - * @sk: client|server|listening dccp socket (when @dreq != NULL) - * @dreq: request socket to use during connection setup, or NULL - * @skb: frame to parse - */ -int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_hdr *dh = dccp_hdr(skb); - const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); - unsigned char *opt_ptr = options; - const unsigned char *opt_end = (unsigned char *)dh + - (dh->dccph_doff * 4); - struct dccp_options_received *opt_recv = &dp->dccps_options_received; - unsigned char opt, len; - unsigned char *value; - u32 elapsed_time; - __be32 opt_val; - int rc; - int mandatory = 0; - - memset(opt_recv, 0, sizeof(*opt_recv)); - - opt = len = 0; - while (opt_ptr != opt_end) { - opt = *opt_ptr++; - len = 0; - value = NULL; - - /* Check if this isn't a single byte option */ - if (opt > DCCPO_MAX_RESERVED) { - if (opt_ptr == opt_end) - goto out_nonsensical_length; - - len = *opt_ptr++; - if (len < 2) - goto out_nonsensical_length; - /* - * Remove the type and len fields, leaving - * just the value size - */ - len -= 2; - value = opt_ptr; - opt_ptr += len; - - if (opt_ptr > opt_end) - goto out_nonsensical_length; - } - - /* - * CCID-specific options are ignored during connection setup, as - * negotiation may still be in progress (see 
RFC 4340, 10.3). - * The same applies to Ack Vectors, as these depend on the CCID. - */ - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || - opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) - goto ignore_option; - - switch (opt) { - case DCCPO_PADDING: - break; - case DCCPO_MANDATORY: - if (mandatory) - goto out_invalid_option; - if (pkt_type != DCCP_PKT_DATA) - mandatory = 1; - break; - case DCCPO_NDP_COUNT: - if (len > 6) - goto out_invalid_option; - - opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); - dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), - (unsigned long long)opt_recv->dccpor_ndp); - break; - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ - break; - if (len == 0) - goto out_invalid_option; - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, - *value, value + 1, len - 1); - if (rc) - goto out_featneg_failed; - break; - case DCCPO_TIMESTAMP: - if (len != 4) - goto out_invalid_option; - /* - * RFC 4340 13.1: "The precise time corresponding to - * Timestamp Value zero is not specified". We use - * zero to indicate absence of a meaningful timestamp. - */ - opt_val = get_unaligned((__be32 *)value); - if (unlikely(opt_val == 0)) { - DCCP_WARN("Timestamp with zero value\n"); - break; - } - - if (dreq != NULL) { - dreq->dreq_timestamp_echo = ntohl(opt_val); - dreq->dreq_timestamp_time = dccp_timestamp(); - } else { - opt_recv->dccpor_timestamp = - dp->dccps_timestamp_echo = ntohl(opt_val); - dp->dccps_timestamp_time = dccp_timestamp(); - } - dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", - dccp_role(sk), ntohl(opt_val), - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - /* schedule an Ack in case this sender is quiescent */ - inet_csk_schedule_ack(sk); - break; - case DCCPO_TIMESTAMP_ECHO: - if (len != 4 && len != 6 && len != 8) - goto out_invalid_option; - - opt_val = get_unaligned((__be32 *)value); - opt_recv->dccpor_timestamp_echo = ntohl(opt_val); - - dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, " - "ackno=%llu", dccp_role(sk), - opt_recv->dccpor_timestamp_echo, - len + 2, - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - - value += 4; - - if (len == 4) { /* no elapsed time included */ - dccp_pr_debug_cat("\n"); - break; - } - - if (len == 6) { /* 2-byte elapsed time */ - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else { /* 4-byte elapsed time */ - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } - - dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time); - - /* Give precedence to the biggest ELAPSED_TIME */ - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - break; - case DCCPO_ELAPSED_TIME: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */ - break; - - if (len == 2) { - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else if (len == 4) { - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } else { - goto out_invalid_option; - } - - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - - dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", - dccp_role(sk), elapsed_time); - break; - case DCCPO_MIN_RX_CCID_SPECIFIC ... 
DCCPO_MAX_RX_CCID_SPECIFIC: - if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - /* - * Ack vectors are processed by the TX CCID if it is - * interested. The RX CCID need not parse Ack Vectors, - * since it is only interested in clearing old state. - */ - fallthrough; - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: - if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - default: - DCCP_CRIT("DCCP(%p): option %d(len=%d) not " - "implemented, ignoring", sk, opt, len); - break; - } -ignore_option: - if (opt != DCCPO_MANDATORY) - mandatory = 0; - } - - /* mandatory was the last byte in option list -> reset connection */ - if (mandatory) - goto out_invalid_option; - -out_nonsensical_length: - /* RFC 4340, 5.8: ignore option and all remaining option space */ - return 0; - -out_invalid_option: - DCCP_INC_STATS(DCCP_MIB_INVALIDOPT); - rc = DCCP_RESET_CODE_OPTION_ERROR; -out_featneg_failed: - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; - DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; - DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; - DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; - return -1; -} - -EXPORT_SYMBOL_GPL(dccp_parse_options); - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) -{ - if (len >= DCCP_OPTVAL_MAXLEN) - *to++ = (value & 0xFF0000000000ull) >> 40; - if (len > 4) - *to++ = (value & 0xFF00000000ull) >> 32; - if (len > 3) - *to++ = (value & 0xFF000000) >> 24; - if (len > 2) - *to++ = (value & 0xFF0000) >> 16; - if (len > 1) - *to++ = (value & 0xFF00) >> 8; - if (len > 0) - *to++ = (value & 0xFF); -} - -static inline u8 dccp_ndp_len(const u64 ndp) -{ - if (likely(ndp <= 0xFF)) - return 1; - return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6); -} - -int dccp_insert_option(struct sk_buff *skb, const unsigned char option, - const void *value, const unsigned char len) -{ - unsigned char *to; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; - - to = skb_push(skb, len + 2); - *to++ = option; - *to++ = len + 2; - - memcpy(to, value, len); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_insert_option); - -static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - u64 ndp = dp->dccps_ndp_count; - - if (dccp_non_data_packet(skb)) - ++dp->dccps_ndp_count; - else - dp->dccps_ndp_count = 0; - - if (ndp > 0) { - unsigned char *ptr; - const int ndp_len = dccp_ndp_len(ndp); - const int len = ndp_len + 2; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - ptr = skb_push(skb, len); - *ptr++ = DCCPO_NDP_COUNT; - *ptr++ = len; - dccp_encode_value_var(ndp, ptr, ndp_len); - } - - return 0; -} - -static inline int dccp_elapsed_time_len(const u32 elapsed_time) -{ - return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 
2 : 4; -} - -static int dccp_insert_option_timestamp(struct sk_buff *skb) -{ - __be32 now = htonl(dccp_timestamp()); - /* yes this will overflow but that is the point as we want a - * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ - - return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); -} - -static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, - struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - __be32 tstamp_echo; - unsigned char *to; - u32 elapsed_time, elapsed_time_len, len; - - if (dreq != NULL) { - elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time; - tstamp_echo = htonl(dreq->dreq_timestamp_echo); - dreq->dreq_timestamp_echo = 0; - } else { - elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time; - tstamp_echo = htonl(dp->dccps_timestamp_echo); - dp->dccps_timestamp_echo = 0; - } - - elapsed_time_len = dccp_elapsed_time_len(elapsed_time); - len = 6 + elapsed_time_len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - to = skb_push(skb, len); - *to++ = DCCPO_TIMESTAMP_ECHO; - *to++ = len; - - memcpy(to, &tstamp_echo, 4); - to += 4; - - if (elapsed_time_len == 2) { - const __be16 var16 = htons((u16)elapsed_time); - memcpy(to, &var16, 2); - } else if (elapsed_time_len == 4) { - const __be32 var32 = htonl(elapsed_time); - memcpy(to, &var32, 4); - } - - return 0; -} - -static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const u16 buflen = dccp_ackvec_buflen(av); - /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); - u16 len = buflen + 2 * nr_opts; - u8 i, nonce = 0; - const unsigned char *tail, *from; - unsigned char *to; - - if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, - dccp_packet_name(dcb->dccpd_type)); - return -1; - } - /* - * Since Ack Vectors are variable-length, we can not always predict - * their size. To catch exception cases where the space is running out - * on the skb, a separate Sync is scheduled to carry the Ack Vector. - */ - if (len > DCCPAV_MIN_OPTLEN && - len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { - DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " - "MPS=%u ==> reduce payload size?\n", len, skb->len, - dcb->dccpd_opt_len, dp->dccps_mss_cache); - dp->dccps_sync_scheduled = 1; - return 0; - } - dcb->dccpd_opt_len += len; - - to = skb_push(skb, len); - len = buflen; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; - - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_SINGLE_OPT_MAXLEN) - copylen = DCCP_SINGLE_OPT_MAXLEN; - - /* - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via - * its type; ack_nonce is the sum of all individual buf_nonce's. 
- */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) - return -ENOBUFS; - return 0; -} - -/** - * dccp_insert_option_mandatory - Mandatory option (5.8.2) - * @skb: frame into which to insert option - * - * Note that since we are using skb_push, this function needs to be called - * _after_ inserting the option it is supposed to influence (stack order). - */ -int dccp_insert_option_mandatory(struct sk_buff *skb) -{ - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len++; - *(u8 *)skb_push(skb, 1) = DCCPO_MANDATORY; - return 0; -} - -/** - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb - * @skb: frame to insert feature negotiation option into - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R - * @feat: one out of %dccp_feature_numbers - * @val: NN value or SP array (preferred element first) to copy - * @len: true length of @val in bytes (excluding first element repetition) - * @repeat_first: whether to copy the first element of @val twice - * - * The last argument is used to construct Confirm options, where the preferred - * value and the preference list appear separately (RFC 4340, 6.3.1). Preference - * lists are kept such that the preferred entry is always first, so we only need - * to copy twice, and avoid the overhead of cloning into a bigger array. - */ -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first) -{ - u8 tot_len, *to; - - /* take the `Feature' field and possible repetition into account */ - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { - DCCP_WARN("length %u for feature %u too large\n", len, feat); - return -1; - } - - if (unlikely(val == NULL || len == 0)) - len = repeat_first = false; - tot_len = 3 + repeat_first + len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); - return -1; - } - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - - to = skb_push(skb, tot_len); - *to++ = type; - *to++ = tot_len; - *to++ = feat; - - if (repeat_first) - *to++ = *val; - if (len) - memcpy(to, val, len); - return 0; -} - -/* The length of all options needs to be a multiple of 4 (5.8) */ -static void dccp_insert_option_padding(struct sk_buff *skb) -{ - int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; - - if (padding != 0) { - padding = 4 - padding; - memset(skb_push(skb, padding), 0, padding); - DCCP_SKB_CB(skb)->dccpd_opt_len += padding; - } -} - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - - /* Feature Negotiation */ - if (dccp_feat_insert_opts(dp, NULL, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { - /* - * Obtain RTT sample from Request/Response exchange. 
- * This is currently used for TFRC initialisation. - */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - } else if (dccp_ackvec_pending(sk) && - dccp_insert_option_ackvec(sk, skb)) { - return -1; - } - } - - if (dp->dccps_hc_rx_insert_options) { - if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb)) - return -1; - dp->dccps_hc_rx_insert_options = 0; - } - - if (dp->dccps_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(dp, NULL, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} - -int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) -{ - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dccp_feat_insert_opts(NULL, dreq, skb)) - return -1; - - /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - if (dreq->dreq_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(NULL, dreq, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} diff --git a/net/dccp/output.c b/net/dccp/output.c deleted file mode 100644 index 39cf3430177ac..0000000000000 --- a/net/dccp/output.c +++ /dev/null @@ -1,708 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/output.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/sched/signal.h> - -#include <net/inet_sock.h> -#include <net/sock.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -static inline void dccp_event_ack_sent(struct sock *sk) -{ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); -} - -/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ -static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) -{ - skb_set_owner_w(skb, sk); - WARN_ON(sk->sk_send_head); - sk->sk_send_head = skb; - return skb_clone(sk->sk_send_head, gfp_any()); -} - -/* - * All SKB's seen here are completely headerless. It is our - * job to build the DCCP header, and pass the packet down to - * IP so it can do the same plus pass the packet off to the - * device. - */ -static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) -{ - if (likely(skb != NULL)) { - struct inet_sock *inet = inet_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - struct dccp_hdr *dh; - /* XXX For now we're using only 48 bits sequence numbers */ - const u32 dccp_header_size = sizeof(*dh) + - sizeof(struct dccp_hdr_ext) + - dccp_packet_hdr_len(dcb->dccpd_type); - int err, set_ack = 1; - u64 ackno = dp->dccps_gsr; - /* - * Increment GSS here already in case the option code needs it. - * Update GSS for real only if option processing below succeeds. - */ - dcb->dccpd_seq = ADD48(dp->dccps_gss, 1); - - switch (dcb->dccpd_type) { - case DCCP_PKT_DATA: - set_ack = 0; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_RESET: - break; - - case DCCP_PKT_REQUEST: - set_ack = 0; - /* Use ISS on the first (non-retransmitted) Request. */ - if (icsk->icsk_retransmits == 0) - dcb->dccpd_seq = dp->dccps_iss; - fallthrough; - - case DCCP_PKT_SYNC: - case DCCP_PKT_SYNCACK: - ackno = dcb->dccpd_ack_seq; - fallthrough; - default: - /* - * Set owner/destructor: some skbs are allocated via - * alloc_skb (e.g. when retransmission may happen). 
- * Only Data, DataAck, and Reset packets should come - * through here with skb->sk set. - */ - WARN_ON(skb->sk); - skb_set_owner_w(skb, sk); - break; - } - - if (dccp_insert_options(sk, skb)) { - kfree_skb(skb); - return -EPROTO; - } - - - /* Build DCCP header and checksum it. */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - dh->dccph_type = dcb->dccpd_type; - dh->dccph_sport = inet->inet_sport; - dh->dccph_dport = inet->inet_dport; - dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; - dh->dccph_ccval = dcb->dccpd_ccval; - dh->dccph_cscov = dp->dccps_pcslen; - /* XXX For now we're using only 48 bits sequence numbers */ - dh->dccph_x = 1; - - dccp_update_gss(sk, dcb->dccpd_seq); - dccp_hdr_set_seq(dh, dp->dccps_gss); - if (set_ack) - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); - - switch (dcb->dccpd_type) { - case DCCP_PKT_REQUEST: - dccp_hdr_request(skb)->dccph_req_service = - dp->dccps_service; - /* - * Limit Ack window to ISS <= P.ackno <= GSS, so that - * only Responses to Requests we sent are considered. - */ - dp->dccps_awl = dp->dccps_iss; - break; - case DCCP_PKT_RESET: - dccp_hdr_reset(skb)->dccph_reset_code = - dcb->dccpd_reset_code; - break; - } - - icsk->icsk_af_ops->send_check(sk, skb); - - if (set_ack) - dccp_event_ack_sent(sk); - - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); - return net_xmit_eval(err); - } - return -ENOBUFS; -} - -/** - * dccp_determine_ccmps - Find out about CCID-specific packet-size limits - * @dp: socket to find packet size limits of - * - * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), - * since the RX CCID is restricted to feedback packets (Acks), which are small - * in comparison with the data traffic. A value of 0 means "no current CCMPS". - */ -static u32 dccp_determine_ccmps(const struct dccp_sock *dp) -{ - const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid; - - if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL) - return 0; - return tx_ccid->ccid_ops->ccid_ccmps; -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - u32 ccmps = dccp_determine_ccmps(dp); - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; - - /* Account for header lengths and IPv4/v6 option overhead */ - cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + - sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); - - /* - * Leave enough headroom for common DCCP header options. - * This only considers options which may appear on DCCP-Data packets, as - * per table 3 in RFC 4340, 5.8. When running out of space for other - * options (eg. Ack Vector which can take up to 255 bytes), it is better - * to schedule a separate Ack. Thus we leave headroom for the following: - * - 1 byte for Slow Receiver (11.6) - * - 6 bytes for Timestamp (13.1) - * - 10 bytes for Timestamp Echo (13.3) - * - 8 bytes for NDP count (7.7, when activated) - * - 6 bytes for Data Checksum (9.3) - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) - */ - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + - (dp->dccps_hc_rx_ackvec ? 
DCCPAV_MIN_OPTLEN : 0), 4); - - /* And store cached results */ - icsk->icsk_pmtu_cookie = pmtu; - WRITE_ONCE(dp->dccps_mss_cache, cur_mps); - - return cur_mps; -} - -EXPORT_SYMBOL_GPL(dccp_sync_mss); - -void dccp_write_space(struct sock *sk) -{ - struct socket_wq *wq; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible(&wq->wait); - /* Should agree with poll, otherwise some programs break */ - if (sock_writeable(sk)) - sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); - - rcu_read_unlock(); -} - -/** - * dccp_wait_for_ccid - Await CCID send permission - * @sk: socket to wait for - * @delay: timeout in jiffies - * - * This is used by CCIDs which need to delay the send time in process context. - */ -static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) -{ - DEFINE_WAIT(wait); - long remaining; - - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - sk->sk_write_pending++; - release_sock(sk); - - remaining = schedule_timeout(delay); - - lock_sock(sk); - sk->sk_write_pending--; - finish_wait(sk_sleep(sk), &wait); - - if (signal_pending(current) || sk->sk_err) - return -1; - return remaining; -} - -/** - * dccp_xmit_packet - Send data packet under control of CCID - * @sk: socket to send data packet on - * - * Transmits next-queued payload and informs CCID to account for the packet. - */ -static void dccp_xmit_packet(struct sock *sk) -{ - int err, len; - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = dccp_qpolicy_pop(sk); - - if (unlikely(skb == NULL)) - return; - len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; - } - - err = dccp_transmit_skb(sk, skb); - if (err) - dccp_pr_debug("transmit_skb() returned err=%d\n", err); - /* - * Register this one as sent even if an error occurred. To the remote - * end a local packet drop is indistinguishable from network loss, i.e. - * any local drop will eventually be reported via receiver feedback. - */ - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - - /* - * If the CCID needs to transfer additional header options out-of-band - * (e.g. Ack Vectors or feature-negotiation options), it activates this - * flag to schedule a Sync. The Sync will automatically incorporate all - * currently pending header options, thus clearing the backlog. - */ - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); -} - -/** - * dccp_flush_write_queue - Drain queue at end of connection - * @sk: socket to be drained - * @time_budget: time allowed to drain the queue - * - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may - * happen that the TX queue is not empty at the end of a connection. 
We give the - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function - * returns with a non-empty write queue, it will be purged later. - */ -void dccp_flush_write_queue(struct sock *sk, long *time_budget) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - long delay, rc; - - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - /* - * If the CCID determines when to send, the next sending - * time is unknown or the CCID may not even send again - * (e.g. remote host crashes or lost Ack packets). - */ - DCCP_WARN("CCID did not manage to send all packets\n"); - return; - case CCID_PACKET_DELAY: - delay = msecs_to_jiffies(rc); - if (delay > *time_budget) - return; - rc = dccp_wait_for_ccid(sk, delay); - if (rc < 0) - return; - *time_budget -= (delay - rc); - /* check again if we can send now */ - break; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - dccp_pr_debug("packet discarded due to err=%ld\n", rc); - } - } -} - -void dccp_write_xmit(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - - while ((skb = dccp_qpolicy_top(sk))) { - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - return; - case CCID_PACKET_DELAY: - sk_reset_timer(sk, &dp->dccps_xmit_timer, - jiffies + msecs_to_jiffies(rc)); - return; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - dccp_qpolicy_drop(sk, skb); - dccp_pr_debug("packet discarded due to err=%d\n", rc); - } - } -} - -/** - * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets - * @sk: socket to perform retransmit on - * - * There are only four retransmittable packet types in DCCP: - * - Request in client-REQUEST state (sec. 8.1.1), - * - CloseReq in server-CLOSEREQ state (sec. 8.3), - * - Close in node-CLOSING state (sec. 8.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()). - * This function expects sk->sk_send_head to contain the original skb. - */ -int dccp_retransmit_skb(struct sock *sk) -{ - WARN_ON(sk->sk_send_head == NULL); - - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) - return -EHOSTUNREACH; /* Routing failure or similar. */ - - /* this count is used to distinguish original and retransmitted skb */ - inet_csk(sk)->icsk_retransmits++; - - return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); -} - -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req) -{ - struct dccp_hdr *dh; - struct dccp_request_sock *dreq; - const u32 dccp_header_size = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_response); - struct sk_buff *skb; - - /* sk is marked const to clearly express we dont hold socket lock. - * sock_wmalloc() will atomically change sk->sk_wmem_alloc, - * it is safe to promote sk to non const. 
- */ - skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1, - GFP_ATOMIC); - if (!skb) - return NULL; - - skb_reserve(skb, MAX_DCCP_HEADER); - - skb_dst_set(skb, dst_clone(dst)); - - dreq = dccp_rsk(req); - if (inet_rsk(req)->acked) /* increase GSS upon retransmission */ - dccp_inc_seqno(&dreq->dreq_gss); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; - DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss; - - /* Resolve feature dependencies resulting from choice of CCID */ - if (dccp_feat_server_ccid_dependencies(dreq)) - goto response_failed; - - if (dccp_insert_options_rsk(dreq, skb)) - goto response_failed; - - /* Build and checksum header */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - - dh->dccph_sport = htons(inet_rsk(req)->ir_num); - dh->dccph_dport = inet_rsk(req)->ir_rmt_port; - dh->dccph_doff = (dccp_header_size + - DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; - dh->dccph_type = DCCP_PKT_RESPONSE; - dh->dccph_x = 1; - dccp_hdr_set_seq(dh, dreq->dreq_gss); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr); - dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service; - - dccp_csum_outgoing(skb); - - /* We use `acked' to remember that a Response was already sent. */ - inet_rsk(req)->acked = 1; - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - return skb; -response_failed: - kfree_skb(skb); - return NULL; -} - -EXPORT_SYMBOL_GPL(dccp_make_response); - -/* answer offending packet in @rcv_skb with Reset from control socket @ctl */ -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb) -{ - struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb); - const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_reset); - struct dccp_hdr_reset *dhr; - struct sk_buff *skb; - - skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - if (skb == NULL) - return NULL; - - skb_reserve(skb, sk->sk_prot->max_header); - - /* Swap the send and the receive. */ - dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len); - dh->dccph_type = DCCP_PKT_RESET; - dh->dccph_sport = rxdh->dccph_dport; - dh->dccph_dport = rxdh->dccph_sport; - dh->dccph_doff = dccp_hdr_reset_len / 4; - dh->dccph_x = 1; - - dhr = dccp_hdr_reset(skb); - dhr->dccph_reset_code = dcb->dccpd_reset_code; - - switch (dcb->dccpd_reset_code) { - case DCCP_RESET_CODE_PACKET_ERROR: - dhr->dccph_reset_data[0] = rxdh->dccph_type; - break; - case DCCP_RESET_CODE_OPTION_ERROR: - case DCCP_RESET_CODE_MANDATORY_ERROR: - memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3); - break; - } - /* - * From RFC 4340, 8.3.1: - * If P.ackno exists, set R.seqno := P.ackno + 1. - * Else set R.seqno := 0. - */ - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1)); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq); - - dccp_csum_outgoing(skb); - return skb; -} - -EXPORT_SYMBOL_GPL(dccp_ctl_make_reset); - -/* send Reset on established socket, to close or abort the connection */ -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) -{ - struct sk_buff *skb; - /* - * FIXME: what if rebuild_header fails? - * Should we be doing a rebuild_header here? - */ - int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk); - - if (err != 0) - return err; - - skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC); - if (skb == NULL) - return -ENOBUFS; - - /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; - DCCP_SKB_CB(skb)->dccpd_reset_code = code; - - return dccp_transmit_skb(sk, skb); -} - -/* - * Do all connect socket setups that can be done AF independent. - */ -int dccp_connect(struct sock *sk) -{ - struct sk_buff *skb; - struct dccp_sock *dp = dccp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - sk->sk_err = 0; - sock_reset_flag(sk, SOCK_DONE); - - dccp_sync_mss(sk, dst_mtu(dst)); - - /* do not connect if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dccp_sk(sk))) - return -EPROTO; - - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ - dp->dccps_gar = dp->dccps_iss; - - skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); - if (unlikely(skb == NULL)) - return -ENOBUFS; - - /* Reserve space for headers. */ - skb_reserve(skb, sk->sk_prot->max_header); - - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); - DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); - - /* Timer for repeating the REQUEST until an answer. */ - icsk->icsk_retransmits = 0; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, DCCP_RTO_MAX); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_connect); - -void dccp_send_ack(struct sock *sk) -{ - /* If we have been reset, we may not send again. */ - if (sk->sk_state != DCCP_CLOSED) { - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, - GFP_ATOMIC); - - if (skb == NULL) { - inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, - DCCP_RTO_MAX); - return; - } - - /* Reserve space for headers */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; - dccp_transmit_skb(sk, skb); - } -} - -EXPORT_SYMBOL_GPL(dccp_send_ack); - -#if 0 -/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */ -void dccp_send_delayed_ack(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - /* - * FIXME: tune this timer. elapsed time fixes the skew, so no problem - * with using 2s, and active senders also piggyback the ACK into a - * DATAACK packet, so this is really for quiescent senders. - */ - unsigned long timeout = jiffies + 2 * HZ; - - /* Use new timeout only if there wasn't a older one earlier. */ - if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { - /* If delack timer was blocked or is about to expire, - * send ACK now. - * - * FIXME: check the "about to expire" part - */ - if (icsk->icsk_ack.blocked) { - dccp_send_ack(sk); - return; - } - - if (!time_before(timeout, icsk_delack_timeout(icsk))) - timeout = icsk_delack_timeout(icsk); - } - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; - sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); -} -#endif - -void dccp_send_sync(struct sock *sk, const u64 ackno, - const enum dccp_pkt_type pkt_type) -{ - /* - * We are not putting this on the write queue, so - * dccp_transmit_skb() will set the ownership to this - * sock. - */ - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - - if (skb == NULL) { - /* FIXME: how to make sure the sync is sent? */ - DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type)); - return; - } - - /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = pkt_type; - DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; - - /* - * Clear the flag in case the Sync was scheduled for out-of-band data, - * such as carrying a long Ack Vector. - */ - dccp_sk(sk)->dccps_sync_scheduled = 0; - - dccp_transmit_skb(sk, skb); -} - -EXPORT_SYMBOL_GPL(dccp_send_sync); - -/* - * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This - * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under - * any circumstances. - */ -void dccp_send_close(struct sock *sk, const int active) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; - - skb = alloc_skb(sk->sk_prot->max_header, prio); - if (skb == NULL) - return; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait) - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ; - else - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; - - if (active) { - skb = dccp_skb_entail(sk, skb); - /* - * Retransmission timer for active-close: RFC 4340, 8.3 requires - * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ - * state can be left. The initial timeout is 2 RTTs. - * Since RTT measurement is done by the CCIDs, there is no easy - * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 - * is too low (200ms); we use a high value to avoid unnecessary - * retransmissions when the link RTT is > 0.2 seconds. - * FIXME: Let main module sample RTTs and use that instead. - */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } - dccp_transmit_skb(sk, skb); -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c deleted file mode 100644 index fcc5c9d64f466..0000000000000 --- a/net/dccp/proto.c +++ /dev/null @@ -1,1293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/proto.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/in.h> -#include <linux/if_arp.h> -#include <linux/init.h> -#include <linux/random.h> -#include <linux/slab.h> -#include <net/checksum.h> - -#include <net/inet_sock.h> -#include <net/inet_common.h> -#include <net/sock.h> -#include <net/xfrm.h> - -#include <asm/ioctls.h> -#include <linux/spinlock.h> -#include <linux/timer.h> -#include <linux/delay.h> -#include <linux/poll.h> - -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; - -EXPORT_SYMBOL_GPL(dccp_statistics); - -DEFINE_PER_CPU(unsigned int, dccp_orphan_count); -EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); - -struct inet_hashinfo dccp_hashinfo; -EXPORT_SYMBOL_GPL(dccp_hashinfo); - -/* the maximum queue length for tx in packets. 
0 is no limit */ -int sysctl_dccp_tx_qlen __read_mostly = 5; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_state_name(const int state) -{ - static const char *const dccp_state_names[] = { - [DCCP_OPEN] = "OPEN", - [DCCP_REQUESTING] = "REQUESTING", - [DCCP_PARTOPEN] = "PARTOPEN", - [DCCP_LISTEN] = "LISTEN", - [DCCP_RESPOND] = "RESPOND", - [DCCP_CLOSING] = "CLOSING", - [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", - [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", - [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", - [DCCP_TIME_WAIT] = "TIME_WAIT", - [DCCP_CLOSED] = "CLOSED", - }; - - if (state >= DCCP_MAX_STATES) - return "INVALID STATE!"; - else - return dccp_state_names[state]; -} -#endif - -void dccp_set_state(struct sock *sk, const int state) -{ - const int oldstate = sk->sk_state; - - dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, - dccp_state_name(oldstate), dccp_state_name(state)); - WARN_ON(state == oldstate); - - switch (state) { - case DCCP_OPEN: - if (oldstate != DCCP_OPEN) - DCCP_INC_STATS(DCCP_MIB_CURRESTAB); - /* Client retransmits all Confirm options until entering OPEN */ - if (oldstate == DCCP_PARTOPEN) - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); - break; - - case DCCP_CLOSED: - if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || - oldstate == DCCP_CLOSING) - DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); - - sk->sk_prot->unhash(sk); - if (inet_csk(sk)->icsk_bind_hash != NULL && - !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) - inet_put_port(sk); - fallthrough; - default: - if (oldstate == DCCP_OPEN) - DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); - } - - /* Change state AFTER socket is unhashed to avoid closed - * socket sitting in hash tables. - */ - inet_sk_set_state(sk, state); -} - -EXPORT_SYMBOL_GPL(dccp_set_state); - -static void dccp_finish_passive_close(struct sock *sk) -{ - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - /* Node (client or server) has received Close packet. */ - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_set_state(sk, DCCP_CLOSED); - break; - case DCCP_PASSIVE_CLOSEREQ: - /* - * Client received CloseReq. We set the `active' flag so that - * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. 
- */ - dccp_send_close(sk, 1); - dccp_set_state(sk, DCCP_CLOSING); - } -} - -void dccp_done(struct sock *sk) -{ - dccp_set_state(sk, DCCP_CLOSED); - dccp_clear_xmit_timers(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - else - inet_csk_destroy_sock(sk); -} - -EXPORT_SYMBOL_GPL(dccp_done); - -const char *dccp_packet_name(const int type) -{ - static const char *const dccp_packet_names[] = { - [DCCP_PKT_REQUEST] = "REQUEST", - [DCCP_PKT_RESPONSE] = "RESPONSE", - [DCCP_PKT_DATA] = "DATA", - [DCCP_PKT_ACK] = "ACK", - [DCCP_PKT_DATAACK] = "DATAACK", - [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", - [DCCP_PKT_CLOSE] = "CLOSE", - [DCCP_PKT_RESET] = "RESET", - [DCCP_PKT_SYNC] = "SYNC", - [DCCP_PKT_SYNCACK] = "SYNCACK", - }; - - if (type >= DCCP_NR_PKT_TYPES) - return "INVALID"; - else - return dccp_packet_names[type]; -} - -EXPORT_SYMBOL_GPL(dccp_packet_name); - -void dccp_destruct_common(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = NULL; -} -EXPORT_SYMBOL_GPL(dccp_destruct_common); - -static void dccp_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet_sock_destruct(sk); -} - -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " - "please contact the netdev mailing list\n"); - - icsk->icsk_rto = DCCP_TIMEOUT_INIT; - icsk->icsk_syn_retries = sysctl_dccp_request_retries; - sk->sk_state = DCCP_CLOSED; - sk->sk_write_space = dccp_write_space; - sk->sk_destruct = dccp_sk_destruct; - icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = 536; - dp->dccps_rate_last = jiffies; - dp->dccps_role = DCCP_ROLE_UNDEFINED; - dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; - - dccp_init_xmit_timers(sk); - - INIT_LIST_HEAD(&dp->dccps_featneg); - /* control socket doesn't need feat nego */ - if (likely(ctl_sock_initialized)) - return dccp_feat_init(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_init_sock); - -void dccp_destroy_sock(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - /* Clean up a referenced DCCP bind bucket. */ - if (inet_csk(sk)->icsk_bind_hash != NULL) - inet_put_port(sk); - - kfree(dp->dccps_service_list); - dp->dccps_service_list = NULL; - - if (dp->dccps_hc_rx_ackvec != NULL) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - /* clean up feature negotiation state */ - dccp_feat_list_purge(&dp->dccps_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_destroy_sock); - -static inline int dccp_need_reset(int state) -{ - return state != DCCP_CLOSED && state != DCCP_LISTEN && - state != DCCP_REQUESTING; -} - -int dccp_disconnect(struct sock *sk, int flags) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - const int old_state = sk->sk_state; - - if (old_state != DCCP_CLOSED) - dccp_set_state(sk, DCCP_CLOSED); - - /* - * This corresponds to the ABORT function of RFC793, sec. 3.8 - * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". 
- */ - if (old_state == DCCP_LISTEN) { - inet_csk_listen_stop(sk); - } else if (dccp_need_reset(old_state)) { - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - sk->sk_err = ECONNRESET; - } else if (old_state == DCCP_REQUESTING) - sk->sk_err = ECONNRESET; - - dccp_clear_xmit_timers(sk); - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - __skb_queue_purge(&sk->sk_receive_queue); - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - __kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - inet->inet_dport = 0; - - inet_bhash2_reset_saddr(sk); - - sk->sk_shutdown = 0; - sock_reset_flag(sk, SOCK_DONE); - - icsk->icsk_backoff = 0; - inet_csk_delack_init(sk); - __sk_dst_reset(sk); - - WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); - - sk_error_report(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_disconnect); - -/* - * Wait for a DCCP event. - * - * Note that we don't need to lock the socket, as the upper poll layers - * take care of normal races (between the test and the event) and we don't - * go look at any of the socket buffers directly. - */ -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - __poll_t mask; - u8 shutdown; - int state; - - sock_poll_wait(file, sock, wait); - - state = inet_sk_state_load(sk); - if (state == DCCP_LISTEN) - return inet_csk_listen_poll(sk); - - /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. - */ - - mask = 0; - if (READ_ONCE(sk->sk_err)) - mask = EPOLLERR; - shutdown = READ_ONCE(sk->sk_shutdown); - - if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) - mask |= EPOLLHUP; - if (shutdown & RCV_SHUTDOWN) - mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - - /* Connected? */ - if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { - if (atomic_read(&sk->sk_rmem_alloc) > 0) - mask |= EPOLLIN | EPOLLRDNORM; - - if (!(shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { - mask |= EPOLLOUT | EPOLLWRNORM; - } else { /* send SIGIO later */ - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - - /* Race breaker. If space is freed after - * wspace test but before the flags are set, - * IO signal will be lost. - */ - if (sk_stream_is_writeable(sk)) - mask |= EPOLLOUT | EPOLLWRNORM; - } - } - } - return mask; -} -EXPORT_SYMBOL_GPL(dccp_poll); - -int dccp_ioctl(struct sock *sk, int cmd, int *karg) -{ - int rc = -ENOTCONN; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) - goto out; - - switch (cmd) { - case SIOCOUTQ: { - *karg = sk_wmem_alloc_get(sk); - /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and - * always 0, comparably to UDP. - */ - - rc = 0; - } - break; - case SIOCINQ: { - struct sk_buff *skb; - *karg = 0; - - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* - * We will only return the amount of this packet since - * that is all that will be read. 
- */ - *karg = skb->len; - } - rc = 0; - } - break; - default: - rc = -ENOIOCTLCMD; - break; - } -out: - release_sock(sk); - return rc; -} - -EXPORT_SYMBOL_GPL(dccp_ioctl); - -static int dccp_setsockopt_service(struct sock *sk, const __be32 service, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_service_list *sl = NULL; - - if (service == DCCP_SERVICE_INVALID_VALUE || - optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) - return -EINVAL; - - if (optlen > sizeof(service)) { - sl = kmalloc(optlen, GFP_KERNEL); - if (sl == NULL) - return -ENOMEM; - - sl->dccpsl_nr = optlen / sizeof(u32) - 1; - if (copy_from_sockptr_offset(sl->dccpsl_list, optval, - sizeof(service), optlen - sizeof(service)) || - dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { - kfree(sl); - return -EFAULT; - } - } - - lock_sock(sk); - dp->dccps_service = service; - - kfree(dp->dccps_service_list); - - dp->dccps_service_list = sl; - release_sock(sk); - return 0; -} - -static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) -{ - u8 *list, len; - int i, rc; - - if (cscov < 0 || cscov > 15) - return -EINVAL; - /* - * Populate a list of permissible values, in the range cscov...15. This - * is necessary since feature negotiation of single values only works if - * both sides incidentally choose the same value. Since the list starts - * lowest-value first, negotiation will pick the smallest shared value. - */ - if (cscov == 0) - return 0; - len = 16 - cscov; - - list = kmalloc(len, GFP_KERNEL); - if (list == NULL) - return -ENOBUFS; - - for (i = 0; i < len; i++) - list[i] = cscov++; - - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); - - if (rc == 0) { - if (rx) - dccp_sk(sk)->dccps_pcrlen = cscov; - else - dccp_sk(sk)->dccps_pcslen = cscov; - } - kfree(list); - return rc; -} - -static int dccp_setsockopt_ccid(struct sock *sk, int type, - sockptr_t optval, unsigned int optlen) -{ - u8 *val; - int rc = 0; - - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) - return -EINVAL; - - val = memdup_sockptr(optval, optlen); - if (IS_ERR(val)) - return PTR_ERR(val); - - lock_sock(sk); - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); - - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); - release_sock(sk); - - kfree(val); - return rc; -} - -static int do_dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - int val, err = 0; - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CCID: - case DCCP_SOCKOPT_RX_CCID: - case DCCP_SOCKOPT_TX_CCID: - return dccp_setsockopt_ccid(sk, optname, optval, optlen); - } - - if (optlen < (int)sizeof(int)) - return -EINVAL; - - if (copy_from_sockptr(&val, optval, sizeof(int))) - return -EFAULT; - - if (optname == DCCP_SOCKOPT_SERVICE) - return dccp_setsockopt_service(sk, val, optval, optlen); - - lock_sock(sk); - switch (optname) { - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - if (dp->dccps_role != DCCP_ROLE_SERVER) - err = -EOPNOTSUPP; - else - dp->dccps_server_timewait = (val != 0); - break; - case DCCP_SOCKOPT_SEND_CSCOV: - err = 
dccp_setsockopt_cscov(sk, val, false); - break; - case DCCP_SOCKOPT_RECV_CSCOV: - err = dccp_setsockopt_cscov(sk, val, true); - break; - case DCCP_SOCKOPT_QPOLICY_ID: - if (sk->sk_state != DCCP_CLOSED) - err = -EISCONN; - else if (val < 0 || val >= DCCPQ_POLICY_MAX) - err = -EINVAL; - else - dp->dccps_qpolicy = val; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - if (val < 0) - err = -EINVAL; - else - dp->dccps_tx_qlen = val; - break; - default: - err = -ENOPROTOOPT; - break; - } - release_sock(sk); - - return err; -} - -int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_setsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_setsockopt); - -static int dccp_getsockopt_service(struct sock *sk, int len, - __be32 __user *optval, - int __user *optlen) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_service_list *sl; - int err = -ENOENT, slen = 0, total_len = sizeof(u32); - - lock_sock(sk); - if ((sl = dp->dccps_service_list) != NULL) { - slen = sl->dccpsl_nr * sizeof(u32); - total_len += slen; - } - - err = -EINVAL; - if (total_len > len) - goto out; - - err = 0; - if (put_user(total_len, optlen) || - put_user(dp->dccps_service, optval) || - (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) - err = -EFAULT; -out: - release_sock(sk); - return err; -} - -static int do_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct dccp_sock *dp; - int val, len; - - if (get_user(len, optlen)) - return -EFAULT; - - if (len < (int)sizeof(int)) - return -EINVAL; - - dp = dccp_sk(sk); - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_SERVICE: - return dccp_getsockopt_service(sk, len, - (__be32 __user *)optval, optlen); - case DCCP_SOCKOPT_GET_CUR_MPS: - val = READ_ONCE(dp->dccps_mss_cache); - break; - case DCCP_SOCKOPT_AVAILABLE_CCIDS: - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); - case DCCP_SOCKOPT_TX_CCID: - val = ccid_get_current_tx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_RX_CCID: - val = ccid_get_current_rx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - val = dp->dccps_server_timewait; - break; - case DCCP_SOCKOPT_SEND_CSCOV: - val = dp->dccps_pcslen; - break; - case DCCP_SOCKOPT_RECV_CSCOV: - val = dp->dccps_pcrlen; - break; - case DCCP_SOCKOPT_QPOLICY_ID: - val = dp->dccps_qpolicy; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - val = dp->dccps_tx_qlen; - break; - case 128 ... 191: - return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - case 192 ... 
255: - return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - default: - return -ENOPROTOOPT; - } - - len = sizeof(val); - if (put_user(len, optlen) || copy_to_user(optval, &val, len)) - return -EFAULT; - - return 0; -} - -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_getsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_getsockopt); - -static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) -{ - struct cmsghdr *cmsg; - - /* - * Assign an (opaque) qpolicy priority value to skb->priority. - * - * We are overloading this skb field for use with the qpolicy subystem. - * The skb->priority is normally used for the SO_PRIORITY option, which - * is initialised from sk_priority. Since the assignment of sk_priority - * to skb->priority happens later (on layer 3), we overload this field - * for use with queueing priorities as long as the skb is on layer 4. - * The default priority value (if nothing is set) is 0. - */ - skb->priority = 0; - - for_each_cmsghdr(cmsg, msg) { - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - if (cmsg->cmsg_level != SOL_DCCP) - continue; - - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) - return -EINVAL; - - switch (cmsg->cmsg_type) { - case DCCP_SCM_PRIORITY: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) - return -EINVAL; - skb->priority = *(__u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } - } - return 0; -} - -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const int flags = msg->msg_flags; - const int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; - int rc, size; - long timeo; - - trace_dccp_probe(sk, len); - - if (len > READ_ONCE(dp->dccps_mss_cache)) - return -EMSGSIZE; - - lock_sock(sk); - - timeo = sock_sndtimeo(sk, noblock); - - /* - * We have to use sk_stream_wait_connect here to set sk_write_pending, - * so that the trick in dccp_rcv_request_sent_state_process. - */ - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_release; - - size = sk->sk_prot->max_header + len; - release_sock(sk); - skb = sock_alloc_send_skb(sk, size, noblock, &rc); - lock_sock(sk); - if (skb == NULL) - goto out_release; - - if (dccp_qpolicy_full(sk)) { - rc = -EAGAIN; - goto out_discard; - } - - if (sk->sk_state == DCCP_CLOSED) { - rc = -ENOTCONN; - goto out_discard; - } - - /* We need to check dccps_mss_cache after socket is locked. */ - if (len > dp->dccps_mss_cache) { - rc = -EMSGSIZE; - goto out_discard; - } - - skb_reserve(skb, sk->sk_prot->max_header); - rc = memcpy_from_msg(skb_put(skb, len), msg, len); - if (rc != 0) - goto out_discard; - - rc = dccp_msghdr_parse(msg, skb); - if (rc != 0) - goto out_discard; - - dccp_qpolicy_push(sk, skb); - /* - * The xmit_timer is set if the TX CCID is rate-based and will expire - * when congestion control permits to release further packets into the - * network. Window-based CCIDs do not use this timer. - */ - if (!timer_pending(&dp->dccps_xmit_timer)) - dccp_write_xmit(sk); -out_release: - release_sock(sk); - return rc ? 
: len; -out_discard: - kfree_skb(skb); - goto out_release; -} - -EXPORT_SYMBOL_GPL(dccp_sendmsg); - -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) -{ - const struct dccp_hdr *dh; - long timeo; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) { - len = -ENOTCONN; - goto out; - } - - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - do { - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - - if (skb == NULL) - goto verify_sock_status; - - dh = dccp_hdr(skb); - - switch (dh->dccph_type) { - case DCCP_PKT_DATA: - case DCCP_PKT_DATAACK: - goto found_ok_skb; - - case DCCP_PKT_CLOSE: - case DCCP_PKT_CLOSEREQ: - if (!(flags & MSG_PEEK)) - dccp_finish_passive_close(sk); - fallthrough; - case DCCP_PKT_RESET: - dccp_pr_debug("found fin (%s) ok!\n", - dccp_packet_name(dh->dccph_type)); - len = 0; - goto found_fin_ok; - default: - dccp_pr_debug("packet_type=%s\n", - dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); - } -verify_sock_status: - if (sock_flag(sk, SOCK_DONE)) { - len = 0; - break; - } - - if (sk->sk_err) { - len = sock_error(sk); - break; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) { - len = 0; - break; - } - - if (sk->sk_state == DCCP_CLOSED) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - len = -ENOTCONN; - break; - } - len = 0; - break; - } - - if (!timeo) { - len = -EAGAIN; - break; - } - - if (signal_pending(current)) { - len = sock_intr_errno(timeo); - break; - } - - sk_wait_data(sk, &timeo, NULL); - continue; - found_ok_skb: - if (len > skb->len) - len = skb->len; - else if (len < skb->len) - msg->msg_flags |= MSG_TRUNC; - - if (skb_copy_datagram_msg(skb, 0, msg, len)) { - /* Exception. Bailout! */ - len = -EFAULT; - break; - } - if (flags & MSG_TRUNC) - len = skb->len; - found_fin_ok: - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); - break; - } while (1); -out: - release_sock(sk); - return len; -} - -EXPORT_SYMBOL_GPL(dccp_recvmsg); - -int inet_dccp_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - unsigned char old_state; - int err; - - lock_sock(sk); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) - goto out; - - old_state = sk->sk_state; - if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) - goto out; - - WRITE_ONCE(sk->sk_max_ack_backlog, backlog); - /* Really, if the socket is already in listen state - * we can only allow the backlog to be adjusted. 
- */ - if (old_state != DCCP_LISTEN) { - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_role = DCCP_ROLE_LISTEN; - - /* do not start to listen if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dp)) { - err = -EPROTO; - goto out; - } - - err = inet_csk_listen_start(sk); - if (err) - goto out; - } - err = 0; - -out: - release_sock(sk); - return err; -} - -EXPORT_SYMBOL_GPL(inet_dccp_listen); - -static void dccp_terminate_connection(struct sock *sk) -{ - u8 next_state = DCCP_CLOSED; - - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - case DCCP_PASSIVE_CLOSEREQ: - dccp_finish_passive_close(sk); - break; - case DCCP_PARTOPEN: - dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - fallthrough; - case DCCP_OPEN: - dccp_send_close(sk, 1); - - if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && - !dccp_sk(sk)->dccps_server_timewait) - next_state = DCCP_ACTIVE_CLOSEREQ; - else - next_state = DCCP_CLOSING; - fallthrough; - default: - dccp_set_state(sk, next_state); - } -} - -void dccp_close(struct sock *sk, long timeout) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - u32 data_was_unread = 0; - int state; - - lock_sock(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (sk->sk_state == DCCP_LISTEN) { - dccp_set_state(sk, DCCP_CLOSED); - - /* Special case. */ - inet_csk_listen_stop(sk); - - goto adjudge_to_death; - } - - sk_stop_timer(sk, &dp->dccps_xmit_timer); - - /* - * We need to flush the recv. buffs. We do this only on the - * descriptor close, not protocol-sourced closes, because the - *reader process may not have drained the data yet! - */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - data_was_unread += skb->len; - __kfree_skb(skb); - } - - /* If socket has been already reset kill it. */ - if (sk->sk_state == DCCP_CLOSED) - goto adjudge_to_death; - - if (data_was_unread) { - /* Unread data was tossed, send an appropriate Reset Code */ - DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_set_state(sk, DCCP_CLOSED); - } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { - /* Check zero linger _after_ checking for unread data. */ - sk->sk_prot->disconnect(sk, 0); - } else if (sk->sk_state != DCCP_CLOSED) { - /* - * Normal connection termination. May need to wait if there are - * still packets in the TX queue that are delayed by the CCID. - */ - dccp_flush_write_queue(sk, &timeout); - dccp_terminate_connection(sk); - } - - /* - * Flush write queue. This may be necessary in several cases: - * - we have been closed by the peer but still have application data; - * - abortive termination (unread data or zero linger time), - * - normal termination but queue could not be flushed within time limit - */ - __skb_queue_purge(&sk->sk_write_queue); - - sk_stream_wait_close(sk, timeout); - -adjudge_to_death: - state = sk->sk_state; - sock_hold(sk); - sock_orphan(sk); - - /* - * It is the last release_sock in its life. It will remove backlog. - */ - release_sock(sk); - /* - * Now socket is owned by kernel and we acquire BH lock - * to finish close. No need to check for user refs. - */ - local_bh_disable(); - bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); - - this_cpu_inc(dccp_orphan_count); - - /* Have we already been destroyed by a softirq or backlog? 
*/ - if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) - goto out; - - if (sk->sk_state == DCCP_CLOSED) - inet_csk_destroy_sock(sk); - - /* Otherwise, socket is reprieved until protocol close. */ - -out: - bh_unlock_sock(sk); - local_bh_enable(); - sock_put(sk); -} - -EXPORT_SYMBOL_GPL(dccp_close); - -void dccp_shutdown(struct sock *sk, int how) -{ - dccp_pr_debug("called shutdown(%x)\n", how); -} - -EXPORT_SYMBOL_GPL(dccp_shutdown); - -static inline int __init dccp_mib_init(void) -{ - dccp_statistics = alloc_percpu(struct dccp_mib); - if (!dccp_statistics) - return -ENOMEM; - return 0; -} - -static inline void dccp_mib_exit(void) -{ - free_percpu(dccp_statistics); -} - -static int thash_entries; -module_param(thash_entries, int, 0444); -MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); - -#ifdef CONFIG_IP_DCCP_DEBUG -bool dccp_debug; -module_param(dccp_debug, bool, 0644); -MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); - -EXPORT_SYMBOL_GPL(dccp_debug); -#endif - -static int __init dccp_init(void) -{ - unsigned long goal; - unsigned long nr_pages = totalram_pages(); - int ehash_order, bhash_order, i; - int rc; - - BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > - sizeof_field(struct sk_buff, cb)); - rc = inet_hashinfo2_init_mod(&dccp_hashinfo); - if (rc) - goto out_fail; - rc = -ENOBUFS; - dccp_hashinfo.bind_bucket_cachep = - kmem_cache_create("dccp_bind_bucket", - sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind_bucket_cachep) - goto out_free_hashinfo2; - dccp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("dccp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind2_bucket_cachep) - goto out_free_bind_bucket_cachep; - - /* - * Size and allocate the main established and bind bucket - * hash tables. - * - * The methodology is similar to that of the buffer cache. 
- */ - if (nr_pages >= (128 * 1024)) - goal = nr_pages >> (21 - PAGE_SHIFT); - else - goal = nr_pages >> (23 - PAGE_SHIFT); - - if (thash_entries) - goal = (thash_entries * - sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; - for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) - ; - do { - unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / - sizeof(struct inet_ehash_bucket); - - while (hash_size & (hash_size - 1)) - hash_size--; - dccp_hashinfo.ehash_mask = hash_size - 1; - dccp_hashinfo.ehash = (struct inet_ehash_bucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); - } while (!dccp_hashinfo.ehash && --ehash_order > 0); - - if (!dccp_hashinfo.ehash) { - DCCP_CRIT("Failed to allocate DCCP established hash table"); - goto out_free_bind2_bucket_cachep; - } - - for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) - INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); - - if (inet_ehash_locks_alloc(&dccp_hashinfo)) - goto out_free_dccp_ehash; - - bhash_order = ehash_order; - - do { - dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / - sizeof(struct inet_bind_hashbucket); - if ((dccp_hashinfo.bhash_size > (64 * 1024)) && - bhash_order > 0) - continue; - dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); - } while (!dccp_hashinfo.bhash && --bhash_order >= 0); - - if (!dccp_hashinfo.bhash) { - DCCP_CRIT("Failed to allocate DCCP bind hash table"); - goto out_free_dccp_locks; - } - - dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); - - if (!dccp_hashinfo.bhash2) { - DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); - goto out_free_dccp_bhash; - } - - for (i = 0; i < dccp_hashinfo.bhash_size; i++) { - spin_lock_init(&dccp_hashinfo.bhash[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); - spin_lock_init(&dccp_hashinfo.bhash2[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); - } - - dccp_hashinfo.pernet = false; - - rc = dccp_mib_init(); - if (rc) - goto out_free_dccp_bhash2; - - rc = dccp_ackvec_init(); - if (rc) - goto out_free_dccp_mib; - - rc = dccp_sysctl_init(); - if (rc) - goto out_ackvec_exit; - - rc = ccid_initialize_builtins(); - if (rc) - goto out_sysctl_exit; - - dccp_timestamping_init(); - - return 0; - -out_sysctl_exit: - dccp_sysctl_exit(); -out_ackvec_exit: - dccp_ackvec_exit(); -out_free_dccp_mib: - dccp_mib_exit(); -out_free_dccp_bhash2: - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); -out_free_dccp_bhash: - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); -out_free_dccp_locks: - inet_ehash_locks_free(&dccp_hashinfo); -out_free_dccp_ehash: - free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); -out_free_bind2_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); -out_free_bind_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); -out_free_hashinfo2: - inet_hashinfo2_free_mod(&dccp_hashinfo); -out_fail: - dccp_hashinfo.bhash = NULL; - dccp_hashinfo.bhash2 = NULL; - dccp_hashinfo.ehash = NULL; - dccp_hashinfo.bind_bucket_cachep = NULL; - dccp_hashinfo.bind2_bucket_cachep = NULL; - return rc; -} - -static void __exit dccp_fini(void) -{ - int bhash_order = get_order(dccp_hashinfo.bhash_size * - sizeof(struct inet_bind_hashbucket)); - - ccid_cleanup_builtins(); - dccp_mib_exit(); - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); - 
free_pages((unsigned long)dccp_hashinfo.ehash, - get_order((dccp_hashinfo.ehash_mask + 1) * - sizeof(struct inet_ehash_bucket))); - inet_ehash_locks_free(&dccp_hashinfo); - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - dccp_ackvec_exit(); - dccp_sysctl_exit(); - inet_hashinfo2_free_mod(&dccp_hashinfo); -} - -module_init(dccp_init); -module_exit(dccp_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>"); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 5ba204ec0aca4..0000000000000 --- a/net/dccp/qpolicy.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/qpolicy.c - * - * Policy-based packet dequeueing interface for DCCP. - * - * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> - */ -#include "dccp.h" - -/* - * Simple Dequeueing Policy: - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. - */ -static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) -{ - skb_queue_tail(&sk->sk_write_queue, skb); -} - -static bool qpolicy_simple_full(struct sock *sk) -{ - return dccp_sk(sk)->dccps_tx_qlen && - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; -} - -static struct sk_buff *qpolicy_simple_top(struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - -/* - * Priority-based Dequeueing Policy: - * If tx_qlen is different from 0 and the queue has reached its upper bound - * of tx_qlen elements, replace older packets lowest-priority-first. - */ -static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) -{ - struct sk_buff *skb, *best = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (best == NULL || skb->priority > best->priority) - best = skb; - return best; -} - -static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) -{ - struct sk_buff *skb, *worst = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (worst == NULL || skb->priority < worst->priority) - worst = skb; - return worst; -} - -static bool qpolicy_prio_full(struct sock *sk) -{ - if (qpolicy_simple_full(sk)) - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); - return false; -} - -/** - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface - * @push: add a new @skb to the write queue - * @full: indicates that no more packets will be admitted - * @top: peeks at whatever the queueing policy defines as its `top' - * @params: parameter passed to policy operation - */ -struct dccp_qpolicy_operations { - void (*push) (struct sock *sk, struct sk_buff *skb); - bool (*full) (struct sock *sk); - struct sk_buff* (*top) (struct sock *sk); - __be32 params; -}; - -static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { - [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, - .params = 0, - }, - [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, - .params = DCCP_SCM_PRIORITY, - }, -}; - -/* - * Externally visible interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) -{ - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); -} - -bool dccp_qpolicy_full(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); -} - -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) -{ - if (skb != NULL) { - skb_unlink(skb, &sk->sk_write_queue); - kfree_skb(skb); - } -} - -struct 
sk_buff *dccp_qpolicy_top(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); -} - -struct sk_buff *dccp_qpolicy_pop(struct sock *sk) -{ - struct sk_buff *skb = dccp_qpolicy_top(sk); - - if (skb != NULL) { - /* Clear any skb fields that we used internally */ - skb->priority = 0; - skb_unlink(skb, &sk->sk_write_queue); - } - return skb; -} - -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) -{ - /* check if exactly one bit is set */ - if (!param || (param & (param - 1))) - return false; - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; -} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c deleted file mode 100644 index b15845fd63001..0000000000000 --- a/net/dccp/sysctl.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/sysctl.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@mandriva.com> - */ - -#include <linux/mm.h> -#include <linux/sysctl.h> -#include "dccp.h" -#include "feat.h" - -/* Boundary values */ -static int u8_max = 0xFF; -static unsigned long seqw_min = DCCPF_SEQ_WMIN, - seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ - -static struct ctl_table dccp_default_table[] = { - { - .procname = "seq_window", - .data = &sysctl_dccp_sequence_window, - .maxlen = sizeof(sysctl_dccp_sequence_window), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ - .extra2 = &seqw_max, - }, - { - .procname = "rx_ccid", - .data = &sysctl_dccp_rx_ccid, - .maxlen = sizeof(sysctl_dccp_rx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "tx_ccid", - .data = &sysctl_dccp_tx_ccid, - .maxlen = sizeof(sysctl_dccp_tx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "request_retries", - .data = &sysctl_dccp_request_retries, - .maxlen = sizeof(sysctl_dccp_request_retries), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = &u8_max, - }, - { - .procname = "retries1", - .data = &sysctl_dccp_retries1, - .maxlen = sizeof(sysctl_dccp_retries1), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "retries2", - .data = &sysctl_dccp_retries2, - .maxlen = sizeof(sysctl_dccp_retries2), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "tx_qlen", - .data = &sysctl_dccp_tx_qlen, - .maxlen = sizeof(sysctl_dccp_tx_qlen), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "sync_ratelimit", - .data = &sysctl_dccp_sync_ratelimit, - .maxlen = sizeof(sysctl_dccp_sync_ratelimit), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, -}; - -static struct ctl_table_header *dccp_table_header; - -int __init dccp_sysctl_init(void) -{ - dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default", - dccp_default_table); - - return dccp_table_header != NULL ? 
0 : -ENOMEM; -} - -void dccp_sysctl_exit(void) -{ - if (dccp_table_header != NULL) { - unregister_net_sysctl_table(dccp_table_header); - dccp_table_header = NULL; - } -} diff --git a/net/dccp/timer.c b/net/dccp/timer.c deleted file mode 100644 index 232ac4ae0a73f..0000000000000 --- a/net/dccp/timer.c +++ /dev/null @@ -1,272 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/timer.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/skbuff.h> -#include <linux/export.h> - -#include "dccp.h" - -/* sysctl variables governing numbers of retransmission attempts */ -int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; -int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; - -static void dccp_write_err(struct sock *sk) -{ - sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT; - sk_error_report(sk); - - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_done(sk); - __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); -} - -/* A write timeout has occurred. Process the after effects. */ -static int dccp_write_timeout(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - int retry_until; - - if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { - if (icsk->icsk_retransmits != 0) - dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? - : sysctl_dccp_request_retries; - } else { - if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu - black hole detection. :-( - - It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: - - "The one security concern raised by this memo is that ICMP black holes - are often caused by over-zealous security administrators who block - all ICMP messages. It is vitally important that those who design and - deploy security systems understand the impact of strict filtering on - upper-layer protocols. The safest web site in the world is worthless - if most TCP implementations cannot transfer data from it. It would - be far nicer to have all of the black holes fixed rather than fixing - all of the TCP implementations." - - Golden words :-). - */ - - dst_negative_advice(sk); - } - - retry_until = sysctl_dccp_retries2; - /* - * FIXME: see tcp_write_timout and tcp_out_of_resources - */ - } - - if (icsk->icsk_retransmits >= retry_until) { - /* Has it gone just too far? */ - dccp_write_err(sk); - return 1; - } - return 0; -} - -/* - * The DCCP retransmit timer. - */ -static void dccp_retransmit_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - /* - * More than 4MSL (8 minutes) has passed, a RESET(aborted) was - * sent, no need to retransmit, this sock is dead. - */ - if (dccp_write_timeout(sk)) - return; - - /* - * We want to know the number of packets retransmitted, not the - * total number of retransmissions of clones of original packets. - */ - if (icsk->icsk_retransmits == 0) - __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); - - if (dccp_retransmit_skb(sk) != 0) { - /* - * Retransmission failed because of local congestion, - * do not backoff. 
- */ - if (--icsk->icsk_retransmits == 0) - icsk->icsk_retransmits = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, - TCP_RESOURCE_PROBE_INTERVAL), - DCCP_RTO_MAX); - return; - } - - icsk->icsk_backoff++; - - icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, - DCCP_RTO_MAX); - if (icsk->icsk_retransmits > sysctl_dccp_retries1) - __sk_dst_reset(sk); -} - -static void dccp_write_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; - int event = 0; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later */ - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - jiffies + (HZ / 20)); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) - goto out; - - if (time_after(icsk_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - icsk_timeout(icsk)); - goto out; - } - - event = icsk->icsk_pending; - icsk->icsk_pending = 0; - - switch (event) { - case ICSK_TIME_RETRANS: - dccp_retransmit_timer(sk); - break; - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_keepalive_timer(struct timer_list *t) -{ - struct sock *sk = from_timer(sk, t, sk_timer); - - pr_err("dccp should not use a keepalive timer !\n"); - sock_put(sk); -} - -/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ -static void dccp_delack_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_delack_timer); - struct sock *sk = &icsk->icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &icsk->icsk_delack_timer, - jiffies + TCP_DELACK_MIN); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) - goto out; - if (time_after(icsk_delack_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, - icsk_delack_timeout(icsk)); - goto out; - } - - icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - - if (inet_csk_ack_scheduled(sk)) { - if (!inet_csk_in_pingpong_mode(sk)) { - /* Delayed ACK missed: inflate ATO. */ - icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, - icsk->icsk_rto); - } else { - /* Delayed ACK missed: leave pingpong mode and - * deflate ATO. - */ - inet_csk_exit_pingpong_mode(sk); - icsk->icsk_ack.ato = TCP_ATO_MIN; - } - dccp_send_ack(sk); - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * @t: pointer to the tasklet associated with this handler - * - * See the comments above %ccid_dequeueing_decision for supported modes. 
- */ -static void dccp_write_xmitlet(struct tasklet_struct *t) -{ - struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); - struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); - else - dccp_write_xmit(sk); - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_write_xmit_timer(struct timer_list *t) -{ - struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); - - dccp_write_xmitlet(&dp->dccps_xmitlet); -} - -void dccp_init_xmit_timers(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); - timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); - inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, - &dccp_keepalive_timer); -} - -static ktime_t dccp_timestamp_seed; -/** - * dccp_timestamp - 10s of microseconds time source - * Returns the number of 10s of microseconds since loading DCCP. This is native - * DCCP time difference format (RFC 4340, sec. 13). - * Please note: This will wrap around about circa every 11.9 hours. - */ -u32 dccp_timestamp(void) -{ - u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - - do_div(delta, 10); - return delta; -} -EXPORT_SYMBOL_GPL(dccp_timestamp); - -void __init dccp_timestamping_init(void) -{ - dccp_timestamp_seed = ktime_get_real(); -} diff --git a/net/dccp/trace.h b/net/dccp/trace.h deleted file mode 100644 index 5a43b3508c7f1..0000000000000 --- a/net/dccp/trace.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM dccp - -#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_DCCP_H - -#include <net/sock.h> -#include "dccp.h" -#include "ccids/ccid3.h" -#include <linux/tracepoint.h> -#include <trace/events/net_probe_common.h> - -TRACE_EVENT(dccp_probe, - - TP_PROTO(struct sock *sk, size_t size), - - TP_ARGS(sk, size), - - TP_STRUCT__entry( - /* sockaddr_in6 is always bigger than sockaddr_in */ - __array(__u8, saddr, sizeof(struct sockaddr_in6)) - __array(__u8, daddr, sizeof(struct sockaddr_in6)) - __field(__u16, sport) - __field(__u16, dport) - __field(__u16, size) - __field(__u16, tx_s) - __field(__u32, tx_rtt) - __field(__u32, tx_p) - __field(__u32, tx_x_calc) - __field(__u64, tx_x_recv) - __field(__u64, tx_x) - __field(__u32, tx_t_ipi) - ), - - TP_fast_assign( - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hc = NULL; - - if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) - hc = ccid3_hc_tx_sk(sk); - - memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); - memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); - - TP_STORE_ADDR_PORTS(__entry, inet, sk); - - /* For filtering use */ - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - __entry->size = size; - if (hc) { - __entry->tx_s = hc->tx_s; - __entry->tx_rtt = hc->tx_rtt; - __entry->tx_p = hc->tx_p; - __entry->tx_x_calc = hc->tx_x_calc; - __entry->tx_x_recv = hc->tx_x_recv >> 6; - __entry->tx_x = hc->tx_x >> 6; - __entry->tx_t_ipi = hc->tx_t_ipi; - } else { - __entry->tx_s = 0; - memset_startat(__entry, 0, tx_rtt); - } - ), - - TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " - "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", - __entry->saddr, __entry->daddr, __entry->size, - __entry->tx_s, __entry->tx_rtt, __entry->tx_p, - __entry->tx_x_calc, __entry->tx_x_recv, 
__entry->tx_x, - __entry->tx_t_ipi) -); - -#endif /* _TRACE_TCP_H */ - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace -#include <trace/define_trace.h> diff --git a/net/devlink/dev.c b/net/devlink/dev.c index d6e3db300acbe..02602704bdeaa 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -775,7 +775,7 @@ static int devlink_info_version_put(struct devlink_info_req *req, int attr, req->version_cb(version_name, version_type, req->version_cb_priv); - if (!req->msg) + if (!req->msg || !*version_value) return 0; nest = nla_nest_start_noflag(req->msg, attr); diff --git a/net/devlink/health.c b/net/devlink/health.c index 57db6799722a9..b3ce8ecbb7fb1 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -735,7 +735,7 @@ static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) return; } - item->nla_type = NLA_NUL_STRING; + item->nla_type = DEVLINK_VAR_ATTR_TYPE_NUL_STRING; item->len = strlen(name) + 1; item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; memcpy(&item->value, name, item->len); @@ -822,32 +822,37 @@ static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg, static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); + devlink_fmsg_put_value(fmsg, &value, sizeof(value), + DEVLINK_VAR_ATTR_TYPE_FLAG); } static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); + devlink_fmsg_put_value(fmsg, &value, sizeof(value), + DEVLINK_VAR_ATTR_TYPE_U8); } void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); + devlink_fmsg_put_value(fmsg, &value, sizeof(value), + DEVLINK_VAR_ATTR_TYPE_U32); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); + devlink_fmsg_put_value(fmsg, &value, sizeof(value), + DEVLINK_VAR_ATTR_TYPE_U64); } void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, NLA_NUL_STRING); + devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, + DEVLINK_VAR_ATTR_TYPE_NUL_STRING); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); @@ -857,7 +862,8 @@ void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, if (!fmsg->err && !fmsg->putting_binary) fmsg->err = -EINVAL; - devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); + devlink_fmsg_put_value(fmsg, value, value_len, + DEVLINK_VAR_ATTR_TYPE_BINARY); } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); @@ -928,43 +934,26 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); static int -devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) -{ - switch (msg->nla_type) { - case NLA_FLAG: - case NLA_U8: - case NLA_U32: - case NLA_U64: - case NLA_NUL_STRING: - case NLA_BINARY: - return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, - msg->nla_type); - default: - return -EINVAL; - } -} - -static int devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) { int 
attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; u8 tmp; switch (msg->nla_type) { - case NLA_FLAG: + case DEVLINK_VAR_ATTR_TYPE_FLAG: /* Always provide flag data, regardless of its value */ tmp = *(bool *)msg->value; return nla_put_u8(skb, attrtype, tmp); - case NLA_U8: + case DEVLINK_VAR_ATTR_TYPE_U8: return nla_put_u8(skb, attrtype, *(u8 *)msg->value); - case NLA_U32: + case DEVLINK_VAR_ATTR_TYPE_U32: return nla_put_u32(skb, attrtype, *(u32 *)msg->value); - case NLA_U64: + case DEVLINK_VAR_ATTR_TYPE_U64: return devlink_nl_put_u64(skb, attrtype, *(u64 *)msg->value); - case NLA_NUL_STRING: + case DEVLINK_VAR_ATTR_TYPE_NUL_STRING: return nla_put_string(skb, attrtype, (char *)&msg->value); - case NLA_BINARY: + case DEVLINK_VAR_ATTR_TYPE_BINARY: return nla_put(skb, attrtype, msg->len, (void *)&msg->value); default: return -EINVAL; @@ -998,7 +987,8 @@ devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, err = nla_put_flag(skb, item->attrtype); break; case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: - err = devlink_fmsg_item_fill_type(item, skb); + err = nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, + item->nla_type); if (err) break; err = devlink_fmsg_item_fill_data(item, skb); diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index f9786d51f68f9..d97c326a9045b 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -10,6 +10,33 @@ #include <uapi/linux/devlink.h> +/* Sparse enums validation callbacks */ +static int +devlink_attr_param_type_validate(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + switch (nla_get_u8(attr)) { + case DEVLINK_VAR_ATTR_TYPE_U8: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U16: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U32: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U64: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_STRING: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_FLAG: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_NUL_STRING: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_BINARY: + return 0; + } + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid enum value"); + return -EINVAL; +} + /* Common nested types */ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY, }, @@ -18,6 +45,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), }; +const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1] = { + [DEVLINK_RATE_TC_ATTR_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX), + [DEVLINK_RATE_TC_ATTR_BW] = { .type = NLA_U32, }, +}; + const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, }, }; @@ -273,7 +305,7 @@ static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_VA [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, }, - [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8, }, + [DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate), [DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2), }; @@ -496,7 +528,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_ }; /* DEVLINK_CMD_RATE_SET - do */ -static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static 
const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -505,10 +537,11 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy), }; /* DEVLINK_CMD_RATE_NEW - do */ -static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -517,6 +550,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy), }; /* DEVLINK_CMD_RATE_DEL - do */ @@ -1164,7 +1198,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_set_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BWS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { @@ -1174,7 +1208,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_new_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_new_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BWS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 8f2bd50ddf5e2..09cc6f264ccfa 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -13,6 +13,7 @@ /* Common nested types */ extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; +extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1]; extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; /* Ops table for devlink */ diff --git a/net/devlink/param.c b/net/devlink/param.c index dcf0d1ccebbab..41dcc86cfd944 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -92,6 +92,16 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, + .name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, + .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, + .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -167,25 +177,6 @@ static int devlink_param_set(struct devlink *devlink, } static int -devlink_param_type_to_nla_type(enum devlink_param_type param_type) -{ - switch (param_type) { - case DEVLINK_PARAM_TYPE_U8: - return NLA_U8; - case DEVLINK_PARAM_TYPE_U16: - return 
NLA_U16; - case DEVLINK_PARAM_TYPE_U32: - return NLA_U32; - case DEVLINK_PARAM_TYPE_STRING: - return NLA_STRING; - case DEVLINK_PARAM_TYPE_BOOL: - return NLA_FLAG; - default: - return -EINVAL; - } -} - -static int devlink_nl_param_value_fill_one(struct sk_buff *msg, enum devlink_param_type type, enum devlink_param_cmode cmode, @@ -214,6 +205,11 @@ devlink_nl_param_value_fill_one(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) goto value_nest_cancel; break; + case DEVLINK_PARAM_TYPE_U64: + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, + val.vu64)) + goto value_nest_cancel; + break; case DEVLINK_PARAM_TYPE_STRING: if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vstr)) @@ -247,7 +243,6 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_param_gset_ctx ctx; struct nlattr *param_values_list; struct nlattr *param_attr; - int nla_type; void *hdr; int err; int i; @@ -293,11 +288,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, goto param_nest_cancel; if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) goto param_nest_cancel; - - nla_type = devlink_param_type_to_nla_type(param->type); - if (nla_type < 0) - goto param_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, param->type)) goto param_nest_cancel; param_values_list = nla_nest_start_noflag(msg, @@ -419,25 +410,7 @@ devlink_param_type_get_from_info(struct genl_info *info, if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) return -EINVAL; - switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { - case NLA_U8: - *param_type = DEVLINK_PARAM_TYPE_U8; - break; - case NLA_U16: - *param_type = DEVLINK_PARAM_TYPE_U16; - break; - case NLA_U32: - *param_type = DEVLINK_PARAM_TYPE_U32; - break; - case NLA_STRING: - *param_type = DEVLINK_PARAM_TYPE_STRING; - break; - case NLA_FLAG: - *param_type = DEVLINK_PARAM_TYPE_BOOL; - break; - default: - return -EINVAL; - } + *param_type = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE]); return 0; } @@ -471,6 +444,11 @@ devlink_param_value_get_from_info(const struct devlink_param *param, return -EINVAL; value->vu32 = nla_get_u32(param_data); break; + case DEVLINK_PARAM_TYPE_U64: + if (nla_len(param_data) != sizeof(u64)) + return -EINVAL; + value->vu64 = nla_get_u64(param_data); + break; case DEVLINK_PARAM_TYPE_STRING: len = strnlen(nla_data(param_data), nla_len(param_data)); if (len == nla_len(param_data) || diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 8828ffaf6cbc0..110b3fa8a0b1b 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -80,6 +80,29 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) return ERR_PTR(-EINVAL); } +static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) +{ + struct nlattr *nla_tc_bw; + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS); + if (!nla_tc_bw) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) || + nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i])) + goto nla_put_failure; + + nla_nest_end(msg, nla_tc_bw); + } + return 0; + +nla_put_failure: + nla_nest_cancel(msg, nla_tc_bw); + return -EMSGSIZE; +} + static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, @@ -129,6 +152,9 @@ static int devlink_nl_rate_fill(struct sk_buff 
*msg, devlink_rate->parent->name)) goto nla_put_failure; + if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -316,6 +342,87 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, return 0; } +static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, + unsigned long *bitmap, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1]; + u8 tc_index; + int err; + + err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest, + devlink_dl_rate_tc_bws_nl_policy, extack); + if (err) + return err; + + if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) { + NL_SET_ERR_ATTR_MISS(extack, parent_nest, + DEVLINK_RATE_TC_ATTR_INDEX); + return -EINVAL; + } + + tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]); + + if (!tb[DEVLINK_RATE_TC_ATTR_BW]) { + NL_SET_ERR_ATTR_MISS(extack, parent_nest, + DEVLINK_RATE_TC_ATTR_BW); + return -EINVAL; + } + + if (test_and_set_bit(tc_index, bitmap)) { + NL_SET_ERR_MSG_FMT(extack, + "Duplicate traffic class index specified (%u)", + tc_index); + return -EINVAL; + } + + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]); + + return 0; +} + +static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate, + struct genl_info *info) +{ + DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {}; + struct devlink *devlink = devlink_rate->devlink; + const struct devlink_ops *ops = devlink->ops; + u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {}; + int rem, err = -EOPNOTSUPP, i; + struct nlattr *attr; + + nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr, + GENL_HDRLEN, rem) { + err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, + info->extack); + if (err) + return err; + } + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + if (!test_bit(i, bitmap)) { + NL_SET_ERR_MSG_FMT(info->extack, + "Bandwidth values must be specified for all %u traffic classes", + DEVLINK_RATE_TCS_MAX); + return -EINVAL; + } + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + + if (err) + return err; + + memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); + + return 0; +} + static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) @@ -388,6 +495,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, return err; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) { + err = devlink_nl_rate_tc_bw_set(devlink_rate, info); + if (err) + return err; + } + return 0; } @@ -423,6 +536,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && + !ops->rate_leaf_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BWS], + "TC bandwidth set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); @@ -449,6 +569,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && + !ops->rate_node_tc_bw_set) { + 
NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BWS], + "TC bandwidth set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 2dfe9063613fe..869cbe57162f9 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -42,12 +42,24 @@ config NET_DSA_TAG_BRCM Broadcom switches which place the tag after the MAC source address. config NET_DSA_TAG_BRCM_LEGACY - tristate "Tag driver for Broadcom legacy switches using in-frame headers" + tristate "Tag driver for BCM63xx legacy switches using in-frame headers" select NET_DSA_TAG_BRCM_COMMON help Say Y if you want to enable support for tagging frames for the - Broadcom legacy switches which place the tag after the MAC source + BCM63xx legacy switches which place the tag after the MAC source address. + This tag is used in BCM63xx legacy switches which work without the + original FCS and length before the tag insertion. + +config NET_DSA_TAG_BRCM_LEGACY_FCS + tristate "Tag driver for BCM53xx legacy switches using in-frame headers" + select NET_DSA_TAG_BRCM_COMMON + help + Say Y if you want to enable support for tagging frames for the + BCM53xx legacy switches which place the tag after the MAC source + address. + This tag is used in BCM53xx legacy switches which expect original + FCS and length before the tag insertion to be present. config NET_DSA_TAG_BRCM_PREPEND tristate "Tag driver for Broadcom switches using prepended headers" diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 436a7e1b412ad..5b01a0e43ebe8 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1621,7 +1621,7 @@ void dsa_switch_shutdown(struct dsa_switch *ds) dsa_switch_for_each_cpu_port(dp, ds) list_add(&dp->conduit->close_list, &close_list); - dev_close_many(&close_list, true); + netif_close_many(&close_list, true); dsa_switch_for_each_user_port(dp, ds) { conduit = dsa_port_to_conduit(dp); @@ -1829,3 +1829,4 @@ MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>"); MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:dsa"); +MODULE_IMPORT_NS("NETDEV_INTERNAL"); diff --git a/net/dsa/port.c b/net/dsa/port.c index 5c9d1798e830a..082573ae6864a 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -116,19 +116,15 @@ static bool dsa_port_can_configure_learning(struct dsa_port *dp) bool dsa_port_supports_hwtstamp(struct dsa_port *dp) { + struct kernel_hwtstamp_config config = {}; struct dsa_switch *ds = dp->ds; - struct ifreq ifr = {}; int err; if (!ds->ops->port_hwtstamp_get || !ds->ops->port_hwtstamp_set) return false; - /* "See through" shim implementations of the "get" method. - * Since we can't cook up a complete ioctl request structure, this will - * fail in copy_to_user() with -EFAULT, which hopefully is enough to - * detect a valid implementation. - */ - err = ds->ops->port_hwtstamp_get(ds, dp->index, &ifr); + /* "See through" shim implementations of the "get" method. 
*/ + err = ds->ops->port_hwtstamp_get(ds, dp->index, &config); return err != -EOPNOTSUPP; } diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 8c3c068728e51..26bb657ceac36 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -15,6 +15,7 @@ #define BRCM_NAME "brcm" #define BRCM_LEGACY_NAME "brcm-legacy" +#define BRCM_LEGACY_FCS_NAME "brcm-legacy-fcs" #define BRCM_PREPEND_NAME "brcm-prepend" /* Legacy Broadcom tag (6 bytes) */ @@ -32,6 +33,10 @@ #define BRCM_LEG_MULTICAST (1 << 5) #define BRCM_LEG_EGRESS (2 << 5) #define BRCM_LEG_INGRESS (3 << 5) +#define BRCM_LEG_LEN_HI(x) (((x) >> 8) & 0x7) + +/* 4th byte in the tag */ +#define BRCM_LEG_LEN_LO(x) ((x) & 0xff) /* 6th byte in the tag */ #define BRCM_LEG_PORT_ID (0xf) @@ -212,6 +217,41 @@ DSA_TAG_DRIVER(brcm_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME); #endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) || \ + IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS) +static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + int len = BRCM_LEG_TAG_LEN; + int source_port; + u8 *brcm_tag; + + if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) + return NULL; + + brcm_tag = dsa_etype_header_pos_rx(skb); + + source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; + + skb->dev = dsa_conduit_find_user(dev, 0, source_port); + if (!skb->dev) + return NULL; + + /* VLAN tag is added by BCM63xx internal switch */ + if (netdev_uses_dsa(skb->dev)) + len += VLAN_HLEN; + + /* Remove Broadcom tag and update checksum */ + skb_pull_rcsum(skb, len); + + dsa_default_offload_fwd_mark(skb); + + dsa_strip_etype_header(skb, len); + + return skb; +} +#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY || CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */ + #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, struct net_device *dev) @@ -250,49 +290,77 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, return skb; } -static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, - struct net_device *dev) +static const struct dsa_device_ops brcm_legacy_netdev_ops = { + .name = BRCM_LEGACY_NAME, + .proto = DSA_TAG_PROTO_BRCM_LEGACY, + .xmit = brcm_leg_tag_xmit, + .rcv = brcm_leg_tag_rcv, + .needed_headroom = BRCM_LEG_TAG_LEN, +}; + +DSA_TAG_DRIVER(brcm_legacy_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY, BRCM_LEGACY_NAME); +#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */ + +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS) +static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb, + struct net_device *dev) { - int len = BRCM_LEG_TAG_LEN; - int source_port; + struct dsa_port *dp = dsa_user_to_port(dev); + unsigned int fcs_len; + __le32 fcs_val; u8 *brcm_tag; - if (unlikely(!pskb_may_pull(skb, BRCM_LEG_PORT_ID))) + /* The Ethernet switch we are interfaced with needs packets to be at + * least 64 bytes (including FCS) otherwise they will be discarded when + * they enter the switch port logic. When Broadcom tags are enabled, we + * need to make sure that packets are at least 70 bytes (including FCS + * and tag) because the length verification is done after the Broadcom + * tag is stripped off the ingress packet. + * + * Let dsa_user_xmit() free the SKB. 
+ */ + if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false)) return NULL; - brcm_tag = dsa_etype_header_pos_rx(skb); + fcs_len = skb->len; + fcs_val = cpu_to_le32(crc32_le(~0, skb->data, fcs_len) ^ ~0); - source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; + skb_push(skb, BRCM_LEG_TAG_LEN); - skb->dev = dsa_conduit_find_user(dev, 0, source_port); - if (!skb->dev) - return NULL; + dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN); - /* VLAN tag is added by BCM63xx internal switch */ - if (netdev_uses_dsa(skb->dev)) - len += VLAN_HLEN; + brcm_tag = skb->data + 2 * ETH_ALEN; - /* Remove Broadcom tag and update checksum */ - skb_pull_rcsum(skb, len); + /* Broadcom tag type */ + brcm_tag[0] = BRCM_LEG_TYPE_HI; + brcm_tag[1] = BRCM_LEG_TYPE_LO; - dsa_default_offload_fwd_mark(skb); + /* Broadcom tag value */ + brcm_tag[2] = BRCM_LEG_EGRESS | BRCM_LEG_LEN_HI(fcs_len); + brcm_tag[3] = BRCM_LEG_LEN_LO(fcs_len); + brcm_tag[4] = 0; + brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID; - dsa_strip_etype_header(skb, len); + /* Original FCS value */ + if (__skb_pad(skb, ETH_FCS_LEN, false)) + return NULL; + skb_put_data(skb, &fcs_val, ETH_FCS_LEN); return skb; } -static const struct dsa_device_ops brcm_legacy_netdev_ops = { - .name = BRCM_LEGACY_NAME, - .proto = DSA_TAG_PROTO_BRCM_LEGACY, - .xmit = brcm_leg_tag_xmit, +static const struct dsa_device_ops brcm_legacy_fcs_netdev_ops = { + .name = BRCM_LEGACY_FCS_NAME, + .proto = DSA_TAG_PROTO_BRCM_LEGACY_FCS, + .xmit = brcm_leg_fcs_tag_xmit, .rcv = brcm_leg_tag_rcv, .needed_headroom = BRCM_LEG_TAG_LEN, }; -DSA_TAG_DRIVER(brcm_legacy_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY, BRCM_LEGACY_NAME); -#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */ +DSA_TAG_DRIVER(brcm_legacy_fcs_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY_FCS, BRCM_LEGACY_FCS_NAME); +#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */ #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, @@ -328,6 +396,9 @@ static struct dsa_tag_driver *dsa_tag_driver_array[] = { #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) &DSA_TAG_DRIVER_NAME(brcm_legacy_netdev_ops), #endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS) + &DSA_TAG_DRIVER_NAME(brcm_legacy_fcs_netdev_ops), +#endif #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) &DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops), #endif diff --git a/net/dsa/user.c b/net/dsa/user.c index 804dc7dac4f2f..f59d66f0975d7 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -578,20 +578,6 @@ dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_user_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - int port = p->dp->index; - - /* Pass through to switch driver if it supports timestamping */ - switch (cmd) { - case SIOCGHWTSTAMP: - if (ds->ops->port_hwtstamp_get) - return ds->ops->port_hwtstamp_get(ds, port, ifr); - break; - case SIOCSHWTSTAMP: - if (ds->ops->port_hwtstamp_set) - return ds->ops->port_hwtstamp_set(ds, port, ifr); - break; - } return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } @@ -2574,6 +2560,31 @@ static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx, return 0; } +static int dsa_user_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_hwtstamp_get) + return -EOPNOTSUPP; + + return 
ds->ops->port_hwtstamp_get(ds, dp->index, cfg); +} + +static int dsa_user_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_hwtstamp_set) + return -EOPNOTSUPP; + + return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack); +} + static const struct net_device_ops dsa_user_netdev_ops = { .ndo_open = dsa_user_open, .ndo_stop = dsa_user_close, @@ -2595,6 +2606,8 @@ static const struct net_device_ops dsa_user_netdev_ops = { .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid, .ndo_change_mtu = dsa_user_change_mtu, .ndo_fill_forward_path = dsa_user_fill_forward_path, + .ndo_hwtstamp_get = dsa_user_hwtstamp_get, + .ndo_hwtstamp_set = dsa_user_hwtstamp_set, }; static const struct device_type dsa_type = { @@ -3591,7 +3604,7 @@ static int dsa_user_netdevice_event(struct notifier_block *nb, list_add(&dp->user->close_list, &close_list); } - dev_close_many(&close_list, true); + netif_close_many(&close_list, true); return NOTIFY_OK; } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 49bea6b45bd5c..4f58648a27ad6 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -707,7 +707,9 @@ static u32 ethtool_get_max_rxfh_channel(struct net_device *dev) if (!rxfh.indir) return U32_MAX; + mutex_lock(&dev->ethtool->rss_lock); ret = dev->ethtool_ops->get_rxfh(dev, &rxfh); + mutex_unlock(&dev->ethtool->rss_lock); if (ret) { current_max = U32_MAX; goto out_free; @@ -804,12 +806,67 @@ out_free: return rc; } +struct ethtool_rxfh_context * +ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops, + u32 indir_size, u32 key_size) +{ + size_t indir_bytes, flex_len, key_off, size; + struct ethtool_rxfh_context *ctx; + u32 priv_bytes, indir_max; + u16 key_max; + + key_max = max(key_size, ops->rxfh_key_space); + indir_max = max(indir_size, ops->rxfh_indir_space); + + priv_bytes = ALIGN(ops->rxfh_priv_size, sizeof(u32)); + indir_bytes = array_size(indir_max, sizeof(u32)); + + key_off = size_add(priv_bytes, indir_bytes); + flex_len = size_add(key_off, key_max); + size = struct_size_t(struct ethtool_rxfh_context, data, flex_len); + + ctx = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!ctx) + return NULL; + + ctx->indir_size = indir_size; + ctx->key_size = key_size; + ctx->key_off = key_off; + ctx->priv_size = ops->rxfh_priv_size; + + ctx->hfunc = ETH_RSS_HASH_NO_CHANGE; + ctx->input_xfrm = RXH_XFRM_NO_CHANGE; + + return ctx; +} + +/* Check if fields configured for flow hash are symmetric - if src is included + * so is dst and vice versa. + */ +int ethtool_rxfh_config_is_sym(u64 rxfh) +{ + bool sym; + + sym = rxfh == (rxfh & (RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)); + sym &= !!(rxfh & RXH_IP_SRC) == !!(rxfh & RXH_IP_DST); + sym &= !!(rxfh & RXH_L4_B_0_1) == !!(rxfh & RXH_L4_B_2_3); + + return sym; +} + int ethtool_check_ops(const struct ethtool_ops *ops) { if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params)) return -EINVAL; if (WARN_ON(ops->rxfh_max_num_contexts == 1)) return -EINVAL; + if (WARN_ON(ops->supported_input_xfrm && !ops->get_rxfh_fields)) + return -EINVAL; + if (WARN_ON(ops->supported_input_xfrm && + ops->rxfh_per_ctx_fields != ops->rxfh_per_ctx_key)) + return -EINVAL; + /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime, * the fact that ops are checked at registration time does not * mean the ops attached to a netdev later on are sane. 
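As context for the ethtool_check_ops() constraints added above: a driver that advertises an input-transform capability must also provide the flow-hash field callbacks, and per-context field support has to match per-context key support. A minimal ethtool_ops fragment consistent with those checks might look roughly like the sketch below; the foo_* callbacks are illustrative placeholders, not symbols from this series.

	static const struct ethtool_ops foo_ethtool_ops = {
		/* Advertising a symmetric input transform now requires the
		 * flow-hash field callbacks (checked in ethtool_check_ops()).
		 */
		.supported_input_xfrm	= RXH_XFRM_SYM_XOR,
		.get_rxfh_fields	= foo_get_rxfh_fields,
		.set_rxfh_fields	= foo_set_rxfh_fields,
		/* Per-context field support must match per-context key support. */
		.rxfh_per_ctx_key	= 1,
		.rxfh_per_ctx_fields	= 1,
		.create_rxfh_context	= foo_create_rxfh_context,
		.modify_rxfh_context	= foo_modify_rxfh_context,
		.remove_rxfh_context	= foo_remove_rxfh_context,
		.get_rxfh		= foo_get_rxfh,
		.set_rxfh		= foo_set_rxfh,
	};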
@@ -921,9 +978,18 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); if (IS_ERR(phy)) - err = PTR_ERR(phy); - else - err = 0; + return PTR_ERR(phy); + + /* Report the phc source only if we have a real + * phc source with an index. + */ + if (info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_PHYLIB; + info->phc_phyindex = phy->phyindex; + } + err = 0; + } else if (!err && info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_NETDEV; } info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | @@ -947,10 +1013,20 @@ int __ethtool_get_ts_info(struct net_device *dev, ethtool_init_tsinfo(info); if (phy_is_default_hwtstamp(phydev) && - phy_has_tsinfo(phydev)) + phy_has_tsinfo(phydev)) { err = phy_ts_info(phydev, info); - else if (ops->get_ts_info) + /* Report the phc source only if we have a real + * phc source with an index. + */ + if (!err && info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_PHYLIB; + info->phc_phyindex = phydev->phyindex; + } + } else if (ops->get_ts_info) { err = ops->get_ts_info(dev, info); + if (!err && info->phc_index >= 0) + info->phc_source = HWTSTAMP_SOURCE_NETDEV; + } info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; @@ -1060,5 +1136,6 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id) netdev_err(dev, "device error, RSS context %d lost\n", context_id); ctx = xa_erase(&dev->ethtool->rss_ctx, context_id); kfree(ctx); + ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_DELETE_NTF, context_id); } EXPORT_SYMBOL(ethtool_rxfh_context_lost); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index b4683d286a5a5..c4d084dde5bf4 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -43,7 +43,11 @@ bool convert_legacy_settings_to_link_ksettings( int ethtool_check_max_channel(struct net_device *dev, struct ethtool_channels channels, struct genl_info *info); +struct ethtool_rxfh_context * +ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops, + u32 indir_size, u32 key_size); int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context); +int ethtool_rxfh_config_is_sym(u64 rxfh); void ethtool_ringparam_get_cfg(struct net_device *dev, struct ethtool_ringparam *param, @@ -74,4 +78,13 @@ int ethtool_get_module_eeprom_call(struct net_device *dev, bool __ethtool_dev_mm_supported(struct net_device *dev); +#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK) +void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context); +#else +static inline void +ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) +{ +} +#endif + #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 8262cc10f98db..43a7854e784ef 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -617,8 +617,8 @@ static int ethtool_set_link_ksettings(struct net_device *dev, err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { - ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); - ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF); + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF); } return err; } @@ -708,8 +708,8 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) __ETHTOOL_LINK_MODE_MASK_NU32; ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (ret >= 0) { - ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); - ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); + 
ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF); + ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF); } return ret; } @@ -978,6 +978,172 @@ static int ethtool_rxnfc_copy_to_user(void __user *useraddr, return 0; } +static bool flow_type_hashable(u32 flow_type) +{ + switch (flow_type) { + case ETHER_FLOW: + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV4_FLOW: + case IPV6_FLOW: + case GTPU_V4_FLOW: + case GTPU_V6_FLOW: + case GTPC_V4_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V4_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V4_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V4_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V4_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + +/* When adding a new type, update the assert and, if it's hashable, add it to + * the flow_type_hashable switch case. + */ +static_assert(GTPU_DL_V6_FLOW + 1 == __FLOW_TYPE_COUNT); + +static int ethtool_check_xfrm_rxfh(u32 input_xfrm, u64 rxfh) +{ + /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then: + * 1 - no other fields besides IP src/dst and/or L4 src/dst are set + * 2 - If src is set, dst must also be set + */ + if ((input_xfrm != RXH_XFRM_NO_CHANGE && + input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) && + !ethtool_rxfh_config_is_sym(rxfh)) + return -EINVAL; + + return 0; +} + +static int ethtool_check_flow_types(struct net_device *dev, u32 input_xfrm) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + int err; + u32 i; + + if (!input_xfrm || input_xfrm == RXH_XFRM_NO_CHANGE) + return 0; + + for (i = 0; i < __FLOW_TYPE_COUNT; i++) { + struct ethtool_rxfh_fields fields = { + .flow_type = i, + }; + + if (!flow_type_hashable(i)) + continue; + + if (ops->get_rxfh_fields(dev, &fields)) + continue; + + err = ethtool_check_xfrm_rxfh(input_xfrm, fields.data); + if (err) + return err; + } + + return 0; +} + +static noinline_for_stack int +ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxfh_fields fields = {}; + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + int rc; + + if (!ops->set_rxfh_fields) + return -EOPNOTSUPP; + + rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (rc) + return rc; + + if (info.flow_type & FLOW_RSS && info.rss_context && + !ops->rxfh_per_ctx_fields) + return -EINVAL; + + mutex_lock(&dev->ethtool->rss_lock); + if (ops->get_rxfh) { + struct ethtool_rxfh_param rxfh = {}; + + rc = ops->get_rxfh(dev, &rxfh); + if (rc) + goto exit_unlock; + + rc = ethtool_check_xfrm_rxfh(rxfh.input_xfrm, info.data); + if (rc) + goto exit_unlock; + } + + fields.data = info.data; + fields.flow_type = info.flow_type & ~FLOW_RSS; + if (info.flow_type & FLOW_RSS) + fields.rss_context = info.rss_context; + + rc = ops->set_rxfh_fields(dev, &fields, NULL); +exit_unlock: + mutex_unlock(&dev->ethtool->rss_lock); + if (rc) + return rc; + + ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_NTF, fields.rss_context); + return 0; +} + +static noinline_for_stack int +ethtool_get_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxfh_fields fields = {}; + int ret; + + if (!ops->get_rxfh_fields) + return 
-EOPNOTSUPP; + + ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (ret) + return ret; + + if (info.flow_type & FLOW_RSS && info.rss_context && + !ops->rxfh_per_ctx_fields) + return -EINVAL; + + fields.flow_type = info.flow_type & ~FLOW_RSS; + if (info.flow_type & FLOW_RSS) + fields.rss_context = info.rss_context; + + mutex_lock(&dev->ethtool->rss_lock); + ret = ops->get_rxfh_fields(dev, &fields); + mutex_unlock(&dev->ethtool->rss_lock); + if (ret < 0) + return ret; + + info.data = fields.data; + + return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); +} + static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { @@ -1001,26 +1167,8 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, ethtool_get_flow_spec_ring(info.fs.ring_cookie)) return -EINVAL; - if (!xa_load(&dev->ethtool->rss_ctx, info.rss_context)) - return -EINVAL; - } - - if (cmd == ETHTOOL_SRXFH && ops->get_rxfh) { - struct ethtool_rxfh_param rxfh = {}; - - rc = ops->get_rxfh(dev, &rxfh); - if (rc) - return rc; - - /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then: - * 1 - no other fields besides IP src/dst and/or L4 src/dst - * 2 - If src is set, dst must also be set - */ - if ((rxfh.input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) && - ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | - RXH_L4_B_0_1 | RXH_L4_B_2_3)) || - (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || - (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3)))) + if (info.rss_context && + !xa_load(&dev->ethtool->rss_ctx, info.rss_context)) return -EINVAL; } @@ -1133,7 +1281,9 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!rxfh.indir) return -ENOMEM; + mutex_lock(&dev->ethtool->rss_lock); ret = dev->ethtool_ops->get_rxfh(dev, &rxfh); + mutex_unlock(&dev->ethtool->rss_lock); if (ret) goto out; if (copy_to_user(useraddr + @@ -1198,9 +1348,11 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } rxfh_dev.hfunc = ETH_RSS_HASH_NO_CHANGE; + + mutex_lock(&dev->ethtool->rss_lock); ret = ops->set_rxfh(dev, &rxfh_dev, extack); if (ret) - goto out; + goto out_unlock; /* indicate whether rxfh was set to default */ if (user_size == 0) @@ -1208,6 +1360,8 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, else dev->priv_flags |= IFF_RXFH_CONFIGURED; +out_unlock: + mutex_unlock(&dev->ethtool->rss_lock); out: kfree(rxfh_dev.indir); return ret; @@ -1243,8 +1397,7 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ - if (rxfh.rss_context && !(ops->cap_rss_ctx_supported || - ops->create_rxfh_context)) + if (rxfh.rss_context && !ops->create_rxfh_context) return -EOPNOTSUPP; rxfh.indir_size = rxfh_dev.indir_size; @@ -1268,6 +1421,7 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) rxfh_dev.key = rss_config + indir_bytes; + mutex_lock(&dev->ethtool->rss_lock); if (rxfh.rss_context) { ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context); if (!ctx) { @@ -1313,45 +1467,12 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, ret = -EFAULT; } out: + mutex_unlock(&dev->ethtool->rss_lock); kfree(rss_config); return ret; } -static struct ethtool_rxfh_context * -ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops, - u32 indir_size, u32 key_size) 
-{ - size_t indir_bytes, flex_len, key_off, size; - struct ethtool_rxfh_context *ctx; - u32 priv_bytes, indir_max; - u16 key_max; - - key_max = max(key_size, ops->rxfh_key_space); - indir_max = max(indir_size, ops->rxfh_indir_space); - - priv_bytes = ALIGN(ops->rxfh_priv_size, sizeof(u32)); - indir_bytes = array_size(indir_max, sizeof(u32)); - - key_off = size_add(priv_bytes, indir_bytes); - flex_len = size_add(key_off, key_max); - size = struct_size_t(struct ethtool_rxfh_context, data, flex_len); - - ctx = kzalloc(size, GFP_KERNEL_ACCOUNT); - if (!ctx) - return NULL; - - ctx->indir_size = indir_size; - ctx->key_size = key_size; - ctx->key_off = key_off; - ctx->priv_size = ops->rxfh_priv_size; - - ctx->hfunc = ETH_RSS_HASH_NO_CHANGE; - ctx->input_xfrm = RXH_XFRM_NO_CHANGE; - - return ctx; -} - static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, void __user *useraddr) { @@ -1364,9 +1485,9 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; - bool locked = false; /* dev->ethtool->rss_lock taken */ bool create = false; u8 *rss_config; + int ntf = 0; int ret; if (!ops->get_rxnfc || !ops->set_rxfh) @@ -1384,8 +1505,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ - if (rxfh.rss_context && !(ops->cap_rss_ctx_supported || - ops->create_rxfh_context)) + if (rxfh.rss_context && !ops->create_rxfh_context) return -EOPNOTSUPP; /* Check input data transformation capabilities */ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && @@ -1429,7 +1549,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) - goto out; + goto out_free; /* rxfh.indir_size == 0 means reset the indir table to default (master * context) or delete the context (other RSS contexts). 
@@ -1445,7 +1565,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, &rx_rings, rxfh.indir_size); if (ret) - goto out; + goto out_free; } else if (rxfh.indir_size == 0) { if (rxfh.rss_context == 0) { u32 *indir; @@ -1467,86 +1587,81 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, useraddr + rss_cfg_offset + user_indir_len, rxfh.key_size)) { ret = -EFAULT; - goto out; + goto out_free; } } - if (rxfh.rss_context) { - mutex_lock(&dev->ethtool->rss_lock); - locked = true; - } + mutex_lock(&dev->ethtool->rss_lock); + + ret = ethtool_check_flow_types(dev, rxfh.input_xfrm); + if (ret) + goto out_unlock; if (rxfh.rss_context && rxfh_dev.rss_delete) { ret = ethtool_check_rss_ctx_busy(dev, rxfh.rss_context); if (ret) - goto out; + goto out_unlock; } if (create) { + u32 limit, ctx_id; + if (rxfh_dev.rss_delete) { ret = -EINVAL; - goto out; + goto out_unlock; } ctx = ethtool_rxfh_ctx_alloc(ops, dev_indir_size, dev_key_size); if (!ctx) { ret = -ENOMEM; - goto out; + goto out_unlock; } - if (ops->create_rxfh_context) { - u32 limit = ops->rxfh_max_num_contexts ?: U32_MAX; - u32 ctx_id; - - /* driver uses new API, core allocates ID */ - ret = xa_alloc(&dev->ethtool->rss_ctx, &ctx_id, ctx, - XA_LIMIT(1, limit - 1), - GFP_KERNEL_ACCOUNT); - if (ret < 0) { - kfree(ctx); - goto out; - } - WARN_ON(!ctx_id); /* can't happen */ - rxfh.rss_context = ctx_id; + limit = ops->rxfh_max_num_contexts ?: U32_MAX; + ret = xa_alloc(&dev->ethtool->rss_ctx, &ctx_id, ctx, + XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT); + if (ret < 0) { + kfree(ctx); + goto out_unlock; } + WARN_ON(!ctx_id); /* can't happen */ + rxfh.rss_context = ctx_id; } else if (rxfh.rss_context) { ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context); if (!ctx) { ret = -ENOENT; - goto out; + goto out_unlock; } } rxfh_dev.hfunc = rxfh.hfunc; rxfh_dev.rss_context = rxfh.rss_context; rxfh_dev.input_xfrm = rxfh.input_xfrm; - if (rxfh.rss_context && ops->create_rxfh_context) { - if (create) { - ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, - extack); - /* Make sure driver populates defaults */ - WARN_ON_ONCE(!ret && !rxfh_dev.key && - ops->rxfh_per_ctx_key && - !memchr_inv(ethtool_rxfh_context_key(ctx), - 0, ctx->key_size)); - } else if (rxfh_dev.rss_delete) { - ret = ops->remove_rxfh_context(dev, ctx, - rxfh.rss_context, - extack); - } else { - ret = ops->modify_rxfh_context(dev, ctx, &rxfh_dev, - extack); - } - } else { + if (!rxfh.rss_context) { + ntf = ETHTOOL_MSG_RSS_NTF; ret = ops->set_rxfh(dev, &rxfh_dev, extack); + } else if (create) { + ntf = ETHTOOL_MSG_RSS_CREATE_NTF; + ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, extack); + /* Make sure driver populates defaults */ + WARN_ON_ONCE(!ret && !rxfh_dev.key && ops->rxfh_per_ctx_key && + !memchr_inv(ethtool_rxfh_context_key(ctx), 0, + ctx->key_size)); + } else if (rxfh_dev.rss_delete) { + ntf = ETHTOOL_MSG_RSS_DELETE_NTF; + ret = ops->remove_rxfh_context(dev, ctx, rxfh.rss_context, + extack); + } else { + ntf = ETHTOOL_MSG_RSS_NTF; + ret = ops->modify_rxfh_context(dev, ctx, &rxfh_dev, extack); } if (ret) { + ntf = 0; if (create) { /* failed to create, free our new tracking entry */ - if (ops->create_rxfh_context) - xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context); + xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context); kfree(ctx); } - goto out; + goto out_unlock; } if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), @@ -1561,36 +1676,6 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, dev->priv_flags 
|= IFF_RXFH_CONFIGURED; } /* Update rss_ctx tracking */ - if (create && !ops->create_rxfh_context) { - /* driver uses old API, it chose context ID */ - if (WARN_ON(xa_load(&dev->ethtool->rss_ctx, rxfh_dev.rss_context))) { - /* context ID reused, our tracking is screwed */ - kfree(ctx); - goto out; - } - /* Allocate the exact ID the driver gave us */ - if (xa_is_err(xa_store(&dev->ethtool->rss_ctx, rxfh_dev.rss_context, - ctx, GFP_KERNEL))) { - kfree(ctx); - goto out; - } - - /* Fetch the defaults for the old API, in the new API drivers - * should write defaults into ctx themselves. - */ - rxfh_dev.indir = (u32 *)rss_config; - rxfh_dev.indir_size = dev_indir_size; - - rxfh_dev.key = rss_config + indir_bytes; - rxfh_dev.key_size = dev_key_size; - - ret = ops->get_rxfh(dev, &rxfh_dev); - if (WARN_ON(ret)) { - xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context); - kfree(ctx); - goto out; - } - } if (rxfh_dev.rss_delete) { WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context) != ctx); kfree(ctx); @@ -1613,10 +1698,12 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, ctx->input_xfrm = rxfh_dev.input_xfrm; } -out: - if (locked) - mutex_unlock(&dev->ethtool->rss_lock); +out_unlock: + mutex_unlock(&dev->ethtool->rss_lock); +out_free: kfree(rss_config); + if (ntf) + ethtool_rss_notify(dev, ntf, rxfh.rss_context); return ret; } @@ -1728,7 +1815,7 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return ret; dev->ethtool->wol_enabled = !!wol.wolopts; - ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF); return 0; } @@ -1804,7 +1891,7 @@ static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) eee_to_keee(&keee, &eee); ret = dev->ethtool_ops->set_eee(dev, &keee); if (!ret) - ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF); return ret; } @@ -2044,7 +2131,7 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (!ret) - ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF); return ret; } @@ -2088,7 +2175,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, NULL); if (!ret) - ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF); return ret; } @@ -2155,7 +2242,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, ret = dev->ethtool_ops->set_channels(dev, &channels); if (!ret) - ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF); return ret; } @@ -2186,7 +2273,7 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam); if (!ret) - ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF); return ret; } @@ -3188,7 +3275,7 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); if (!rc) - ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF); break; case ETHTOOL_GEEE: rc = ethtool_get_eee(dev, useraddr); @@ -3252,20 +3339,24 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, rc = ethtool_get_value(dev, useraddr, 
ethcmd, dev->ethtool_ops->get_priv_flags); if (!rc) - ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF); break; case ETHTOOL_SPFLAGS: rc = ethtool_set_value(dev, useraddr, dev->ethtool_ops->set_priv_flags); break; case ETHTOOL_GRXFH: + rc = ethtool_get_rxfh_fields(dev, ethcmd, useraddr); + break; + case ETHTOOL_SRXFH: + rc = ethtool_set_rxfh_fields(dev, ethcmd, useraddr); + break; case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); break; - case ETHTOOL_SRXFH: case ETHTOOL_SRXCLSRLDEL: case ETHTOOL_SRXCLSRLINS: rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c index 2816bb23c3ad9..29bbbc149375d 100644 --- a/net/ethtool/mm.c +++ b/net/ethtool/mm.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright 2022-2023 NXP + * Copyright 2022-2025 NXP + * Copyright 2024 Furong Xu <0x1207@gmail.com> */ #include "common.h" #include "netlink.h" @@ -282,3 +283,279 @@ bool ethtool_dev_mm_supported(struct net_device *dev) return supported; } EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported); + +static void ethtool_mmsv_configure_tx(struct ethtool_mmsv *mmsv, + bool tx_active) +{ + if (mmsv->ops->configure_tx) + mmsv->ops->configure_tx(mmsv, tx_active); +} + +static void ethtool_mmsv_configure_pmac(struct ethtool_mmsv *mmsv, + bool pmac_enabled) +{ + if (mmsv->ops->configure_pmac) + mmsv->ops->configure_pmac(mmsv, pmac_enabled); +} + +static void ethtool_mmsv_send_mpacket(struct ethtool_mmsv *mmsv, + enum ethtool_mpacket mpacket) +{ + if (mmsv->ops->send_mpacket) + mmsv->ops->send_mpacket(mmsv, mpacket); +} + +/** + * ethtool_mmsv_verify_timer - Timer for MAC Merge verification + * @t: timer_list struct containing private info + * + * Verify the MAC Merge capability in the local TX direction, by + * transmitting Verify mPackets up to 3 times. Wait until link + * partner responds with a Response mPacket, otherwise fail. + */ +static void ethtool_mmsv_verify_timer(struct timer_list *t) +{ + struct ethtool_mmsv *mmsv = timer_container_of(mmsv, t, verify_timer); + unsigned long flags; + bool rearm = false; + + spin_lock_irqsave(&mmsv->lock, flags); + + switch (mmsv->status) { + case ETHTOOL_MM_VERIFY_STATUS_INITIAL: + case ETHTOOL_MM_VERIFY_STATUS_VERIFYING: + if (mmsv->verify_retries != 0) { + ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_VERIFY); + rearm = true; + } else { + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_FAILED; + } + + mmsv->verify_retries--; + break; + + case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: + ethtool_mmsv_configure_tx(mmsv, true); + break; + + default: + break; + } + + if (rearm) { + mod_timer(&mmsv->verify_timer, + jiffies + msecs_to_jiffies(mmsv->verify_time)); + } + + spin_unlock_irqrestore(&mmsv->lock, flags); +} + +static void ethtool_mmsv_verify_timer_arm(struct ethtool_mmsv *mmsv) +{ + if (mmsv->pmac_enabled && mmsv->tx_enabled && mmsv->verify_enabled && + mmsv->status != ETHTOOL_MM_VERIFY_STATUS_FAILED && + mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) { + timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); + mod_timer(&mmsv->verify_timer, jiffies); + } +} + +static void ethtool_mmsv_apply(struct ethtool_mmsv *mmsv) +{ + /* If verification is disabled, configure FPE right away. + * Otherwise let the timer code do it. 
+ */ + if (!mmsv->verify_enabled) { + ethtool_mmsv_configure_pmac(mmsv, mmsv->pmac_enabled); + ethtool_mmsv_configure_tx(mmsv, mmsv->tx_enabled); + } else { + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; + mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; + + if (netif_running(mmsv->dev)) + ethtool_mmsv_verify_timer_arm(mmsv); + } +} + +/** + * ethtool_mmsv_stop() - Stop MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * + * Drivers should call this method in a state where the hardware is + * about to lose state, like ndo_stop() or suspend(), and turning off + * MAC Merge features would be superfluous. Otherwise, prefer + * ethtool_mmsv_link_state_handle() with up=false. + */ +void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv) +{ + timer_shutdown_sync(&mmsv->verify_timer); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_stop); + +/** + * ethtool_mmsv_link_state_handle() - Inform MAC Merge Software Verification + * of link state changes + * @mmsv: MAC Merge Software Verification state + * @up: True if device carrier is up and able to pass verification packets + * + * Calling context is expected to be from a task, interrupts enabled. + */ +void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up) +{ + unsigned long flags; + + ethtool_mmsv_stop(mmsv); + + spin_lock_irqsave(&mmsv->lock, flags); + + if (up && mmsv->pmac_enabled) { + /* VERIFY process requires pMAC enabled when NIC comes up */ + ethtool_mmsv_configure_pmac(mmsv, true); + + /* New link => maybe new partner => new verification process */ + ethtool_mmsv_apply(mmsv); + } else { + /* Reset the reported verification state while the link is down */ + if (mmsv->verify_enabled) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; + + /* No link or pMAC not enabled */ + ethtool_mmsv_configure_pmac(mmsv, false); + ethtool_mmsv_configure_tx(mmsv, false); + } + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_link_state_handle); + +/** + * ethtool_mmsv_event_handle() - Inform MAC Merge Software Verification + * of interrupt-based events + * @mmsv: MAC Merge Software Verification state + * @event: Event which took place (packet transmission or reception) + * + * Calling context expects to have interrupts disabled. + */ +void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, + enum ethtool_mmsv_event event) +{ + /* This is interrupt context, just spin_lock() */ + spin_lock(&mmsv->lock); + + if (!mmsv->pmac_enabled) + goto unlock; + + switch (event) { + case ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET: + /* Link partner has sent verify mPacket */ + ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_RESPONSE); + break; + case ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET: + /* Local device has sent verify mPacket */ + if (mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING; + break; + case ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET: + /* Link partner has sent response mPacket */ + if (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED; + break; + } + +unlock: + spin_unlock(&mmsv->lock); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_event_handle); + +static bool ethtool_mmsv_is_tx_active(struct ethtool_mmsv *mmsv) +{ + /* TX is active if administratively enabled, and verification either + * succeeded, or was administratively disabled. 
+ */ + return mmsv->tx_enabled && + (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED || + mmsv->status == ETHTOOL_MM_VERIFY_STATUS_DISABLED); +} + +/** + * ethtool_mmsv_get_mm() - get_mm() hook for MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * @state: see struct ethtool_mm_state + * + * Drivers are expected to call this from their ethtool_ops :: get_mm() + * method. + */ +void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, + struct ethtool_mm_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&mmsv->lock, flags); + + state->max_verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; + state->verify_enabled = mmsv->verify_enabled; + state->pmac_enabled = mmsv->pmac_enabled; + state->verify_time = mmsv->verify_time; + state->tx_enabled = mmsv->tx_enabled; + state->verify_status = mmsv->status; + state->tx_active = ethtool_mmsv_is_tx_active(mmsv); + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_get_mm); + +/** + * ethtool_mmsv_set_mm() - set_mm() hook for MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * @cfg: see struct ethtool_mm_cfg + * + * Drivers are expected to call this from their ethtool_ops :: set_mm() + * method. + */ +void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg) +{ + unsigned long flags; + + /* Wait for the verification that's currently in progress to finish */ + ethtool_mmsv_stop(mmsv); + + spin_lock_irqsave(&mmsv->lock, flags); + + mmsv->verify_enabled = cfg->verify_enabled; + mmsv->pmac_enabled = cfg->pmac_enabled; + mmsv->verify_time = cfg->verify_time; + mmsv->tx_enabled = cfg->tx_enabled; + + if (!cfg->verify_enabled) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; + + ethtool_mmsv_apply(mmsv); + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_set_mm); + +/** + * ethtool_mmsv_init() - Initialize MAC Merge Software Verification state + * @mmsv: MAC Merge Software Verification state + * @dev: Pointer to network interface + * @ops: Methods for implementing the generic functionality + * + * The MAC Merge Software Verification is a timer- and event-based state + * machine intended for network interfaces which lack a hardware-based + * TX verification process (as per IEEE 802.3 clause 99.4.3). The timer + * is managed by the core code, whereas events are supplied by the + * driver explicitly calling one of the other API functions. 
+ */ +void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, + const struct ethtool_mmsv_ops *ops) +{ + mmsv->ops = ops; + mmsv->dev = dev; + mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; + mmsv->verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; + timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); + spin_lock_init(&mmsv->lock); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_init); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 977beeaaa2f99..2f813f25f07e1 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -81,6 +81,12 @@ static void ethnl_sock_priv_destroy(void *priv) } } +u32 ethnl_bcast_seq_next(void) +{ + ASSERT_RTNL(); + return ++ethnl_bcast_seq; +} + int ethnl_ops_begin(struct net_device *dev) { int ret; @@ -357,6 +363,18 @@ struct ethnl_dump_ctx { unsigned long pos_ifindex; }; +/** + * struct ethnl_perphy_dump_ctx - context for dumpit() PHY-aware callbacks + * @ethnl_ctx: generic ethnl context + * @ifindex: For Filtered DUMP requests, the ifindex of the targeted netdev + * @pos_phyindex: iterator position for multi-msg DUMP + */ +struct ethnl_perphy_dump_ctx { + struct ethnl_dump_ctx ethnl_ctx; + unsigned int ifindex; + unsigned long pos_phyindex; +}; + static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = ðnl_strset_request_ops, @@ -393,6 +411,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, [ETHTOOL_MSG_PSE_SET] = ðnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = ðnl_rss_request_ops, + [ETHTOOL_MSG_RSS_SET] = ðnl_rss_request_ops, [ETHTOOL_MSG_PLCA_GET_CFG] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_SET_CFG] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_GET_STATUS] = ðnl_plca_status_request_ops, @@ -400,6 +419,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_MM_SET] = ðnl_mm_request_ops, [ETHTOOL_MSG_TSCONFIG_GET] = ðnl_tsconfig_request_ops, [ETHTOOL_MSG_TSCONFIG_SET] = ðnl_tsconfig_request_ops, + [ETHTOOL_MSG_PHY_GET] = ðnl_phy_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -407,6 +427,12 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) return (struct ethnl_dump_ctx *)cb->ctx; } +static struct ethnl_perphy_dump_ctx * +ethnl_perphy_dump_context(struct netlink_callback *cb) +{ + return (struct ethnl_perphy_dump_ctx *)cb->ctx; +} + /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into @@ -436,10 +462,15 @@ static int ethnl_default_parse(struct ethnl_req_info *req_info, if (request_ops->parse_request) { ret = request_ops->parse_request(req_info, tb, info->extack); if (ret < 0) - return ret; + goto err_dev; } return 0; + +err_dev: + netdev_put(req_info->dev, &req_info->dev_tracker); + req_info->dev = NULL; + return ret; } /** @@ -489,7 +520,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do); if (ret < 0) - goto err_dev; + goto err_free; ethnl_init_reply_data(reply_data, ops, req_info->dev); rtnl_lock(); @@ -535,6 +566,7 @@ err_cleanup: ops->cleanup_data(reply_data); err_dev: netdev_put(req_info->dev, &req_info->dev_tracker); +err_free: kfree(reply_data); kfree(req_info); return ret; @@ -584,18 +616,19 @@ static int ethnl_default_dumpit(struct sk_buff *skb, { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net 
*net = sock_net(skb->sk); + netdevice_tracker dev_tracker; struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { - dev_hold(dev); + netdev_hold(dev, &dev_tracker, GFP_ATOMIC); rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); rcu_read_lock(); - dev_put(dev); + netdev_put(dev, &dev_tracker); if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) @@ -636,6 +669,8 @@ static int ethnl_default_start(struct netlink_callback *cb) } ret = ethnl_default_parse(req_info, &info->info, ops, false); + if (ret < 0) + goto free_reply_data; if (req_info->dev) { /* We ignore device specification in dump requests but as the * same parser as for non-dump (doit) requests is used, it @@ -644,8 +679,61 @@ static int ethnl_default_start(struct netlink_callback *cb) netdev_put(req_info->dev, &req_info->dev_tracker); req_info->dev = NULL; } + + ctx->ops = ops; + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_ifindex = 0; + + return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; +} + +/* per-PHY ->start() handler for GET requests */ +static int ethnl_perphy_start(struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *phy_ctx = ethnl_perphy_dump_context(cb); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_dump_ctx *ctx = &phy_ctx->ethnl_ctx; + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct genlmsghdr *ghdr; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + ghdr = nlmsg_data(cb->nlh); + ops = ethnl_default_requests[ghdr->cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + ret = -ENOMEM; + goto free_req_info; + } + + /* Unlike per-dev dump, don't ignore dev. The dump handler + * will notice it and dump PHYs from given dev. We only keep track of + * the dev's ifindex, .dumpit() will grab and release the netdev itself. 
+ */ + ret = ethnl_default_parse(req_info, &info->info, ops, false); if (ret < 0) goto free_reply_data; + if (req_info->dev) { + phy_ctx->ifindex = req_info->dev->ifindex; + netdev_put(req_info->dev, &req_info->dev_tracker); + req_info->dev = NULL; + } ctx->ops = ops; ctx->req_info = req_info; @@ -662,6 +750,118 @@ free_req_info: return ret; } +static int ethnl_perphy_dump_one_dev(struct sk_buff *skb, + struct ethnl_perphy_dump_ctx *ctx, + const struct genl_info *info) +{ + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + struct net_device *dev = ethnl_ctx->req_info->dev; + struct phy_device_node *pdn; + int ret; + + if (!dev->link_topo) + return 0; + + xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, + ctx->pos_phyindex) { + ethnl_ctx->req_info->phy_index = ctx->pos_phyindex; + + /* We can re-use the original dump_one as ->prepare_data in + * commands use ethnl_req_get_phydev(), which gets the PHY from + * the req_info->phy_index + */ + ret = ethnl_default_dump_one(skb, dev, ethnl_ctx, info); + if (ret) + return ret; + } + + ctx->pos_phyindex = 0; + + return 0; +} + +static int ethnl_perphy_dump_all_dev(struct sk_buff *skb, + struct ethnl_perphy_dump_ctx *ctx, + const struct genl_info *info) +{ + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + struct net *net = sock_net(skb->sk); + netdevice_tracker dev_tracker; + struct net_device *dev; + int ret = 0; + + rcu_read_lock(); + for_each_netdev_dump(net, dev, ethnl_ctx->pos_ifindex) { + netdev_hold(dev, &dev_tracker, GFP_ATOMIC); + rcu_read_unlock(); + + /* per-PHY commands use ethnl_req_get_phydev(), which needs the + * net_device in the req_info + */ + ethnl_ctx->req_info->dev = dev; + ret = ethnl_perphy_dump_one_dev(skb, ctx, info); + + rcu_read_lock(); + netdev_put(dev, &dev_tracker); + ethnl_ctx->req_info->dev = NULL; + + if (ret < 0 && ret != -EOPNOTSUPP) { + if (likely(skb->len)) + ret = skb->len; + break; + } + ret = 0; + } + rcu_read_unlock(); + + return ret; +} + +/* per-PHY ->dumpit() handler for GET requests. 
*/ +static int ethnl_perphy_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + int ret = 0; + + if (ctx->ifindex) { + netdevice_tracker dev_tracker; + struct net_device *dev; + + dev = netdev_get_by_index(genl_info_net(&info->info), + ctx->ifindex, &dev_tracker, + GFP_KERNEL); + if (!dev) + return -ENODEV; + + ethnl_ctx->req_info->dev = dev; + ret = ethnl_perphy_dump_one_dev(skb, ctx, genl_info_dump(cb)); + + if (ret < 0 && ret != -EOPNOTSUPP && likely(skb->len)) + ret = skb->len; + + netdev_put(dev, &dev_tracker); + } else { + ret = ethnl_perphy_dump_all_dev(skb, ctx, genl_info_dump(cb)); + } + + return ret; +} + +/* per-PHY ->done() handler for GET requests */ +static int ethnl_perphy_done(struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + + kfree(ethnl_ctx->reply_data); + kfree(ethnl_ctx->req_info); + + return 0; +} + /* default ->done() handler for GET requests */ static int ethnl_default_done(struct netlink_callback *cb) { @@ -676,8 +876,8 @@ static int ethnl_default_done(struct netlink_callback *cb) static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct ethnl_request_ops *ops; - struct ethnl_req_info req_info = {}; const u8 cmd = info->genlhdr->cmd; + struct ethnl_req_info *req_info; struct net_device *dev; int ret; @@ -687,20 +887,22 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; - ret = ethnl_parse_header_dev_get(&req_info, info->attrs[ops->hdr_attr], - genl_info_net(info), info->extack, - true); + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + + ret = ethnl_default_parse(req_info, info, ops, true); if (ret < 0) - return ret; + goto out_free_req; if (ops->set_validate) { - ret = ops->set_validate(&req_info, info); + ret = ops->set_validate(req_info, info); /* 0 means nothing to do */ if (ret <= 0) goto out_dev; } - dev = req_info.dev; + dev = req_info->dev; rtnl_lock(); netdev_lock_ops(dev); @@ -715,14 +917,14 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) if (ret < 0) goto out_free_cfg; - ret = ops->set(&req_info, info); + ret = ops->set(req_info, info); if (ret < 0) goto out_ops; swap(dev->cfg, dev->cfg_pending); if (!ret) goto out_ops; - ethtool_notify(dev, ops->set_ntf_cmd, NULL); + ethnl_notify(dev, ops->set_ntf_cmd, req_info); ret = 0; out_ops: @@ -734,7 +936,9 @@ out_tie_cfg: netdev_unlock_ops(dev); rtnl_unlock(); out_dev: - ethnl_parse_header_dev_put(&req_info); + ethnl_parse_header_dev_put(req_info); +out_free_req: + kfree(req_info); return ret; } @@ -755,11 +959,13 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_MODULE_NTF] = ðnl_module_request_ops, [ETHTOOL_MSG_PLCA_NTF] = ðnl_plca_cfg_request_ops, [ETHTOOL_MSG_MM_NTF] = ðnl_mm_request_ops, + [ETHTOOL_MSG_RSS_NTF] = ðnl_rss_request_ops, + [ETHTOOL_MSG_RSS_CREATE_NTF] = ðnl_rss_request_ops, }; /* default notification handler */ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, - const void *data) + const struct ethnl_req_info *orig_req_info) { struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; @@ -788,6 +994,11 @@ static void 
ethnl_default_notify(struct net_device *dev, unsigned int cmd, req_info->dev = dev; req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; + if (orig_req_info) { + req_info->phy_index = orig_req_info->phy_index; + memcpy(&req_info[1], &orig_req_info[1], + ops->req_info_size - sizeof(*req_info)); + } netdev_ops_assert_locked(dev); @@ -838,7 +1049,7 @@ err_rep: /* notifications */ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, - const void *data); + const struct ethnl_req_info *req_info); static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, @@ -856,9 +1067,12 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_RSS_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_RSS_CREATE_NTF] = ethnl_default_notify, }; -void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) +void ethnl_notify(struct net_device *dev, unsigned int cmd, + const struct ethnl_req_info *req_info) { if (unlikely(!ethnl_ok)) return; @@ -866,18 +1080,23 @@ void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) && ethnl_notify_handlers[cmd])) - ethnl_notify_handlers[cmd](dev, cmd, data); + ethnl_notify_handlers[cmd](dev, cmd, req_info); else WARN_ONCE(1, "notification %u not implemented (dev=%s)\n", cmd, netdev_name(dev)); } + +void ethtool_notify(struct net_device *dev, unsigned int cmd) +{ + ethnl_notify(dev, cmd, NULL); +} EXPORT_SYMBOL(ethtool_notify); static void ethnl_notify_features(struct netdev_notifier_info *info) { struct net_device *dev = netdev_notifier_info_to_dev(info); - ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); + ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF); } static int ethnl_netdev_event(struct notifier_block *this, unsigned long event, @@ -1200,9 +1419,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PSE_GET, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_pse_get_policy, .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, }, @@ -1224,9 +1443,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PLCA_GET_CFG, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_plca_get_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, }, @@ -1240,9 +1459,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_plca_get_status_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, }, @@ -1271,10 +1490,10 @@ static const struct genl_ops ethtool_genl_ops[] = { }, { .cmd = ETHTOOL_MSG_PHY_GET, - .doit = ethnl_phy_doit, - .start = ethnl_phy_start, - .dumpit = ethnl_phy_dumpit, - .done = ethnl_phy_done, + .doit = ethnl_default_doit, + .start = ethnl_perphy_start, + .dumpit = 
ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, @@ -1294,6 +1513,27 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_tsconfig_set_policy, .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_RSS_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = ethnl_rss_set_policy, + .maxattr = ARRAY_SIZE(ethnl_rss_set_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_RSS_CREATE_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_rss_create_doit, + .policy = ethnl_rss_create_policy, + .maxattr = ARRAY_SIZE(ethnl_rss_create_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_RSS_DELETE_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_rss_delete_doit, + .policy = ethnl_rss_delete_policy, + .maxattr = ARRAY_SIZE(ethnl_rss_delete_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index ec6ab5443a6f2..1d4f9ecb3d263 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -10,6 +10,7 @@ struct ethnl_req_info; +u32 ethnl_bcast_seq_next(void); int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *nest, struct net *net, struct netlink_ext_ack *extack, @@ -23,6 +24,8 @@ void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); +void ethnl_notify(struct net_device *dev, unsigned int cmd, + const struct ethnl_req_info *req_info); /** * ethnl_strz_size() - calculate attribute length for fixed size string @@ -337,6 +340,8 @@ int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, * header is already filled on entry, the rest up to @repdata_offset * is zero initialized. This callback should only modify type specific * request info by parsed attributes from request message. + * Called for both GET and SET. Information parsed for SET will + * be conveyed to the req_info used during NTF generation. * @prepare_data: * Retrieve and prepare data needed to compose a reply message. Calls to * ethtool_ops handlers are limited to this callback. 
Common reply data @@ -463,7 +468,7 @@ extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMB extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1]; -extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; +extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1]; @@ -480,6 +485,9 @@ extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MO extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; +extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1]; +extern const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1]; +extern const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1]; extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; @@ -499,13 +507,11 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info); int ethnl_rss_dump_start(struct netlink_callback *cb); int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int ethnl_phy_start(struct netlink_callback *cb); -int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info); -int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int ethnl_phy_done(struct netlink_callback *cb); int ethnl_tsinfo_start(struct netlink_callback *cb); int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_done(struct netlink_callback *cb); +int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info); +int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index f7c847aeb1a29..0f9af1e665484 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -168,6 +168,7 @@ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, + [ETHTOOL_A_PAUSE_STATS_SRC] = { .type = NLA_REJECT }, }; static int diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index 1f590e8d75ed6..68372bef4b2fe 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -12,304 +12,154 @@ #include <net/netdev_lock.h> struct phy_req_info { - struct ethnl_req_info base; - struct phy_device_node *pdn; + struct ethnl_req_info base; }; -#define PHY_REQINFO(__req_base) \ - container_of(__req_base, struct phy_req_info, base) +struct phy_reply_data { + struct ethnl_reply_data base; 
+ u32 phyindex; + char *drvname; + char *name; + unsigned int upstream_type; + char *upstream_sfp_name; + unsigned int upstream_index; + char *downstream_sfp_name; +}; + +#define PHY_REPDATA(__reply_base) \ + container_of(__reply_base, struct phy_reply_data, base) const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = { [ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; -/* Caller holds rtnl */ -static ssize_t -ethnl_phy_reply_size(const struct ethnl_req_info *req_base, - struct netlink_ext_ack *extack) +static int phy_reply_size(const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data) { - struct phy_req_info *req_info = PHY_REQINFO(req_base); - struct phy_device_node *pdn = req_info->pdn; - struct phy_device *phydev = pdn->phy; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); size_t size = 0; - ASSERT_RTNL(); - /* ETHTOOL_A_PHY_INDEX */ size += nla_total_size(sizeof(u32)); /* ETHTOOL_A_DRVNAME */ - if (phydev->drv) - size += nla_total_size(strlen(phydev->drv->name) + 1); + if (rep_data->drvname) + size += nla_total_size(strlen(rep_data->drvname) + 1); /* ETHTOOL_A_NAME */ - size += nla_total_size(strlen(dev_name(&phydev->mdio.dev)) + 1); + size += nla_total_size(strlen(rep_data->name) + 1); /* ETHTOOL_A_PHY_UPSTREAM_TYPE */ size += nla_total_size(sizeof(u32)); - if (phy_on_sfp(phydev)) { - const char *upstream_sfp_name = sfp_get_name(pdn->parent_sfp_bus); - - /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */ - if (upstream_sfp_name) - size += nla_total_size(strlen(upstream_sfp_name) + 1); + /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */ + if (rep_data->upstream_sfp_name) + size += nla_total_size(strlen(rep_data->upstream_sfp_name) + 1); - /* ETHTOOL_A_PHY_UPSTREAM_INDEX */ + /* ETHTOOL_A_PHY_UPSTREAM_INDEX */ + if (rep_data->upstream_index) size += nla_total_size(sizeof(u32)); - } /* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */ - if (phydev->sfp_bus) { - const char *sfp_name = sfp_get_name(phydev->sfp_bus); - - if (sfp_name) - size += nla_total_size(strlen(sfp_name) + 1); - } + if (rep_data->downstream_sfp_name) + size += nla_total_size(strlen(rep_data->downstream_sfp_name) + 1); return size; } -static int -ethnl_phy_fill_reply(const struct ethnl_req_info *req_base, struct sk_buff *skb) +static int phy_prepare_data(const struct ethnl_req_info *req_info, + struct ethnl_reply_data *reply_data, + const struct genl_info *info) { - struct phy_req_info *req_info = PHY_REQINFO(req_base); - struct phy_device_node *pdn = req_info->pdn; - struct phy_device *phydev = pdn->phy; - enum phy_upstream ptype; + struct phy_link_topology *topo = reply_data->dev->link_topo; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); + struct nlattr **tb = info->attrs; + struct phy_device_node *pdn; + struct phy_device *phydev; - ptype = pdn->upstream_type; + /* RTNL is held by the caller */ + phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER, + info->extack); + if (IS_ERR_OR_NULL(phydev)) + return -EOPNOTSUPP; - if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, phydev->phyindex) || - nla_put_string(skb, ETHTOOL_A_PHY_NAME, dev_name(&phydev->mdio.dev)) || - nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, ptype)) - return -EMSGSIZE; + pdn = xa_load(&topo->phys, phydev->phyindex); + if (!pdn) + return -EOPNOTSUPP; - if (phydev->drv && - nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, phydev->drv->name)) - return -EMSGSIZE; + rep_data->phyindex = phydev->phyindex; + rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL); + rep_data->drvname = 
kstrdup(phydev->drv->name, GFP_KERNEL); + rep_data->upstream_type = pdn->upstream_type; - if (ptype == PHY_UPSTREAM_PHY) { + if (pdn->upstream_type == PHY_UPSTREAM_PHY) { struct phy_device *upstream = pdn->upstream.phydev; - const char *sfp_upstream_name; - - /* Parent index */ - if (nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, upstream->phyindex)) - return -EMSGSIZE; - - if (pdn->parent_sfp_bus) { - sfp_upstream_name = sfp_get_name(pdn->parent_sfp_bus); - if (sfp_upstream_name && - nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, - sfp_upstream_name)) - return -EMSGSIZE; - } - } - - if (phydev->sfp_bus) { - const char *sfp_name = sfp_get_name(phydev->sfp_bus); - - if (sfp_name && - nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, - sfp_name)) - return -EMSGSIZE; + rep_data->upstream_index = upstream->phyindex; } - return 0; -} - -static int ethnl_phy_parse_request(struct ethnl_req_info *req_base, - struct nlattr **tb, - struct netlink_ext_ack *extack) -{ - struct phy_link_topology *topo = req_base->dev->link_topo; - struct phy_req_info *req_info = PHY_REQINFO(req_base); - struct phy_device *phydev; + if (pdn->parent_sfp_bus) + rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus), + GFP_KERNEL); - phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PHY_HEADER, - extack); - if (!phydev) - return 0; - - if (IS_ERR(phydev)) - return PTR_ERR(phydev); - - if (!topo) - return 0; - - req_info->pdn = xa_load(&topo->phys, phydev->phyindex); + if (phydev->sfp_bus) + rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus), + GFP_KERNEL); return 0; } -int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info) +static int phy_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data) { - struct phy_req_info req_info = {}; - struct nlattr **tb = info->attrs; - struct sk_buff *rskb; - void *reply_payload; - int reply_len; - int ret; - - ret = ethnl_parse_header_dev_get(&req_info.base, - tb[ETHTOOL_A_PHY_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - - rtnl_lock(); - netdev_lock_ops(req_info.base.dev); - - ret = ethnl_phy_parse_request(&req_info.base, tb, info->extack); - if (ret < 0) - goto err_unlock; - - /* No PHY, return early */ - if (!req_info.pdn) - goto err_unlock; - - ret = ethnl_phy_reply_size(&req_info.base, info->extack); - if (ret < 0) - goto err_unlock; - reply_len = ret + ethnl_reply_header_size(); - - rskb = ethnl_reply_init(reply_len, req_info.base.dev, - ETHTOOL_MSG_PHY_GET_REPLY, - ETHTOOL_A_PHY_HEADER, - info, &reply_payload); - if (!rskb) { - ret = -ENOMEM; - goto err_unlock; - } - - ret = ethnl_phy_fill_reply(&req_info.base, rskb); - if (ret) - goto err_free_msg; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); - netdev_unlock_ops(req_info.base.dev); - rtnl_unlock(); - ethnl_parse_header_dev_put(&req_info.base); - genlmsg_end(rskb, reply_payload); - - return genlmsg_reply(rskb, info); - -err_free_msg: - nlmsg_free(rskb); -err_unlock: - netdev_unlock_ops(req_info.base.dev); - rtnl_unlock(); - ethnl_parse_header_dev_put(&req_info.base); - return ret; -} - -struct ethnl_phy_dump_ctx { - struct phy_req_info *phy_req_info; - unsigned long ifindex; - unsigned long phy_index; -}; - -int ethnl_phy_start(struct netlink_callback *cb) -{ - const struct genl_info *info = genl_info_dump(cb); - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - int ret; - - BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); - - ctx->phy_req_info = 
kzalloc(sizeof(*ctx->phy_req_info), GFP_KERNEL); - if (!ctx->phy_req_info) - return -ENOMEM; - - ret = ethnl_parse_header_dev_get(&ctx->phy_req_info->base, - info->attrs[ETHTOOL_A_PHY_HEADER], - sock_net(cb->skb->sk), cb->extack, - false); - ctx->ifindex = 0; - ctx->phy_index = 0; - - if (ret) - kfree(ctx->phy_req_info); + if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, rep_data->phyindex) || + nla_put_string(skb, ETHTOOL_A_PHY_NAME, rep_data->name) || + nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, rep_data->upstream_type)) + return -EMSGSIZE; - return ret; -} + if (rep_data->drvname && + nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, rep_data->drvname)) + return -EMSGSIZE; -int ethnl_phy_done(struct netlink_callback *cb) -{ - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; + if (rep_data->upstream_index && + nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, + rep_data->upstream_index)) + return -EMSGSIZE; - if (ctx->phy_req_info->base.dev) - ethnl_parse_header_dev_put(&ctx->phy_req_info->base); + if (rep_data->upstream_sfp_name && + nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, + rep_data->upstream_sfp_name)) + return -EMSGSIZE; - kfree(ctx->phy_req_info); + if (rep_data->downstream_sfp_name && + nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, + rep_data->downstream_sfp_name)) + return -EMSGSIZE; return 0; } -static int ethnl_phy_dump_one_dev(struct sk_buff *skb, struct net_device *dev, - struct netlink_callback *cb) +static void phy_cleanup_data(struct ethnl_reply_data *reply_data) { - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - struct phy_req_info *pri = ctx->phy_req_info; - struct phy_device_node *pdn; - int ret = 0; - void *ehdr; - - if (!dev->link_topo) - return 0; - - xa_for_each_start(&dev->link_topo->phys, ctx->phy_index, pdn, ctx->phy_index) { - ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_PHY_GET_REPLY); - if (!ehdr) { - ret = -EMSGSIZE; - break; - } - - ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_PHY_HEADER); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - break; - } - - pri->pdn = pdn; - ret = ethnl_phy_fill_reply(&pri->base, skb); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - break; - } - - genlmsg_end(skb, ehdr); - } + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); - return ret; + kfree(rep_data->drvname); + kfree(rep_data->name); + kfree(rep_data->upstream_sfp_name); + kfree(rep_data->downstream_sfp_name); } -int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - struct net *net = sock_net(skb->sk); - struct net_device *dev; - int ret = 0; - - rtnl_lock(); - - if (ctx->phy_req_info->base.dev) { - dev = ctx->phy_req_info->base.dev; - netdev_lock_ops(dev); - ret = ethnl_phy_dump_one_dev(skb, dev, cb); - netdev_unlock_ops(dev); - } else { - for_each_netdev_dump(net, dev, ctx->ifindex) { - netdev_lock_ops(dev); - ret = ethnl_phy_dump_one_dev(skb, dev, cb); - netdev_unlock_ops(dev); - if (ret) - break; - - ctx->phy_index = 0; - } - } - rtnl_unlock(); - - return ret; -} +const struct ethnl_request_ops ethnl_phy_request_ops = { + .request_cmd = ETHTOOL_MSG_PHY_GET, + .reply_cmd = ETHTOOL_MSG_PHY_GET_REPLY, + .hdr_attr = ETHTOOL_A_PHY_HEADER, + .req_info_size = sizeof(struct phy_req_info), + .reply_data_size = sizeof(struct phy_reply_data), + + .prepare_data = phy_prepare_data, + .reply_size = phy_reply_size, + .fill_reply = phy_fill_reply, + .cleanup_data = phy_cleanup_data, +}; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index 4f6b99eab2a6c..24def9c9dd54b 100644 --- 
a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -11,6 +11,7 @@ #include "netlink.h" #include <linux/ethtool_netlink.h> #include <linux/ethtool.h> +#include <linux/export.h> #include <linux/phy.h> struct pse_req_info { @@ -83,6 +84,8 @@ static int pse_reply_size(const struct ethnl_req_info *req_base, const struct ethtool_pse_control_status *st = &data->status; int len = 0; + if (st->pw_d_id) + len += nla_total_size(sizeof(u32)); /* _PSE_PW_D_ID */ if (st->podl_admin_state > 0) len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */ if (st->podl_pw_status > 0) @@ -109,6 +112,9 @@ static int pse_reply_size(const struct ethnl_req_info *req_base, len += st->c33_pw_limit_nb_ranges * (nla_total_size(0) + nla_total_size(sizeof(u32)) * 2); + if (st->prio_max) + /* _PSE_PRIO_MAX + _PSE_PRIO */ + len += nla_total_size(sizeof(u32)) * 2; return len; } @@ -148,6 +154,11 @@ static int pse_fill_reply(struct sk_buff *skb, const struct pse_reply_data *data = PSE_REPDATA(reply_base); const struct ethtool_pse_control_status *st = &data->status; + if (st->pw_d_id && + nla_put_u32(skb, ETHTOOL_A_PSE_PW_D_ID, + st->pw_d_id)) + return -EMSGSIZE; + if (st->podl_admin_state > 0 && nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE, st->podl_admin_state)) @@ -198,6 +209,11 @@ static int pse_fill_reply(struct sk_buff *skb, pse_put_pw_limit_ranges(skb, st)) return -EMSGSIZE; + if (st->prio_max && + (nla_put_u32(skb, ETHTOOL_A_PSE_PRIO_MAX, st->prio_max) || + nla_put_u32(skb, ETHTOOL_A_PSE_PRIO, st->prio))) + return -EMSGSIZE; + return 0; } @@ -219,6 +235,7 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { NLA_POLICY_RANGE(NLA_U32, ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED, ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED), [ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT] = { .type = NLA_U32 }, + [ETHTOOL_A_PSE_PRIO] = { .type = NLA_U32 }, }; static int @@ -267,6 +284,15 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) if (ret) return ret; + if (tb[ETHTOOL_A_PSE_PRIO]) { + unsigned int prio; + + prio = nla_get_u32(tb[ETHTOOL_A_PSE_PRIO]); + ret = pse_ethtool_set_prio(phydev->psec, info->extack, prio); + if (ret) + return ret; + } + if (tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]) { unsigned int pw_limit; @@ -315,3 +341,42 @@ const struct ethnl_request_ops ethnl_pse_request_ops = { .set = ethnl_set_pse, /* PSE has no notification */ }; + +void ethnl_pse_send_ntf(struct net_device *netdev, unsigned long notifs) +{ + void *reply_payload; + struct sk_buff *skb; + int reply_len; + int ret; + + ASSERT_RTNL(); + + if (!netdev || !notifs) + return; + + reply_len = ethnl_reply_header_size() + + nla_total_size(sizeof(u32)); /* _PSE_NTF_EVENTS */ + + skb = genlmsg_new(reply_len, GFP_KERNEL); + if (!skb) + return; + + reply_payload = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_PSE_NTF); + if (!reply_payload) + goto err_skb; + + ret = ethnl_fill_reply_header(skb, netdev, ETHTOOL_A_PSE_NTF_HEADER); + if (ret < 0) + goto err_skb; + + if (nla_put_uint(skb, ETHTOOL_A_PSE_NTF_EVENTS, notifs)) + goto err_skb; + + genlmsg_end(skb, reply_payload); + ethnl_multicast(skb, netdev); + return; + +err_skb: + nlmsg_free(skb); +} +EXPORT_SYMBOL_GPL(ethnl_pse_send_ntf); diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 6d9b1769896ba..992e98abe9ddb 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -12,6 +12,7 @@ struct rss_req_info { struct rss_reply_data { struct ethnl_reply_data base; + bool has_flow_hash; bool no_key_fields; u32 indir_size; u32 hkey_size; @@ -19,6 +20,37 @@ struct rss_reply_data { u32 input_xfrm; u32 
*indir_table; u8 *hkey; + int flow_hash[__ETHTOOL_A_FLOW_CNT]; +}; + +static const u8 ethtool_rxfh_ft_nl2ioctl[] = { + [ETHTOOL_A_FLOW_ETHER] = ETHER_FLOW, + [ETHTOOL_A_FLOW_IP4] = IPV4_FLOW, + [ETHTOOL_A_FLOW_IP6] = IPV6_FLOW, + [ETHTOOL_A_FLOW_TCP4] = TCP_V4_FLOW, + [ETHTOOL_A_FLOW_UDP4] = UDP_V4_FLOW, + [ETHTOOL_A_FLOW_SCTP4] = SCTP_V4_FLOW, + [ETHTOOL_A_FLOW_AH_ESP4] = AH_ESP_V4_FLOW, + [ETHTOOL_A_FLOW_TCP6] = TCP_V6_FLOW, + [ETHTOOL_A_FLOW_UDP6] = UDP_V6_FLOW, + [ETHTOOL_A_FLOW_SCTP6] = SCTP_V6_FLOW, + [ETHTOOL_A_FLOW_AH_ESP6] = AH_ESP_V6_FLOW, + [ETHTOOL_A_FLOW_AH4] = AH_V4_FLOW, + [ETHTOOL_A_FLOW_ESP4] = ESP_V4_FLOW, + [ETHTOOL_A_FLOW_AH6] = AH_V6_FLOW, + [ETHTOOL_A_FLOW_ESP6] = ESP_V6_FLOW, + [ETHTOOL_A_FLOW_GTPU4] = GTPU_V4_FLOW, + [ETHTOOL_A_FLOW_GTPU6] = GTPU_V6_FLOW, + [ETHTOOL_A_FLOW_GTPC4] = GTPC_V4_FLOW, + [ETHTOOL_A_FLOW_GTPC6] = GTPC_V6_FLOW, + [ETHTOOL_A_FLOW_GTPC_TEID4] = GTPC_TEID_V4_FLOW, + [ETHTOOL_A_FLOW_GTPC_TEID6] = GTPC_TEID_V6_FLOW, + [ETHTOOL_A_FLOW_GTPU_EH4] = GTPU_EH_V4_FLOW, + [ETHTOOL_A_FLOW_GTPU_EH6] = GTPU_EH_V6_FLOW, + [ETHTOOL_A_FLOW_GTPU_UL4] = GTPU_UL_V4_FLOW, + [ETHTOOL_A_FLOW_GTPU_UL6] = GTPU_UL_V6_FLOW, + [ETHTOOL_A_FLOW_GTPU_DL4] = GTPU_DL_V4_FLOW, + [ETHTOOL_A_FLOW_GTPU_DL6] = GTPU_DL_V6_FLOW, }; #define RSS_REQINFO(__req_base) \ @@ -49,21 +81,43 @@ rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, return 0; } +static void +rss_prepare_flow_hash(const struct rss_req_info *req, struct net_device *dev, + struct rss_reply_data *data, const struct genl_info *info) +{ + int i; + + data->has_flow_hash = false; + + if (!dev->ethtool_ops->get_rxfh_fields) + return; + if (req->rss_context && !dev->ethtool_ops->rxfh_per_ctx_fields) + return; + + mutex_lock(&dev->ethtool->rss_lock); + for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) { + struct ethtool_rxfh_fields fields = { + .flow_type = ethtool_rxfh_ft_nl2ioctl[i], + .rss_context = req->rss_context, + }; + + if (dev->ethtool_ops->get_rxfh_fields(dev, &fields)) { + data->flow_hash[i] = -1; /* Unsupported */ + continue; + } + + data->flow_hash[i] = fields.data; + data->has_flow_hash = true; + } + mutex_unlock(&dev->ethtool->rss_lock); +} + static int -rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, - struct rss_reply_data *data, const struct genl_info *info) +rss_get_data_alloc(struct net_device *dev, struct rss_reply_data *data) { - struct ethtool_rxfh_param rxfh = {}; - const struct ethtool_ops *ops; + const struct ethtool_ops *ops = dev->ethtool_ops; u32 total_size, indir_bytes; u8 *rss_config; - int ret; - - ops = dev->ethtool_ops; - - ret = ethnl_ops_begin(dev); - if (ret < 0) - return ret; data->indir_size = 0; data->hkey_size = 0; @@ -75,16 +129,39 @@ rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, indir_bytes = data->indir_size * sizeof(u32); total_size = indir_bytes + data->hkey_size; rss_config = kzalloc(total_size, GFP_KERNEL); - if (!rss_config) { - ret = -ENOMEM; - goto out_ops; - } + if (!rss_config) + return -ENOMEM; if (data->indir_size) data->indir_table = (u32 *)rss_config; if (data->hkey_size) data->hkey = rss_config + indir_bytes; + return 0; +} + +static void rss_get_data_free(const struct rss_reply_data *data) +{ + kfree(data->indir_table); +} + +static int +rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, + struct rss_reply_data *data, const struct genl_info *info) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxfh_param rxfh = {}; + int ret; + + ret = 
ethnl_ops_begin(dev); + if (ret < 0) + return ret; + mutex_lock(&dev->ethtool->rss_lock); + + ret = rss_get_data_alloc(dev, data); + if (ret) + goto out_unlock; + rxfh.indir_size = data->indir_size; rxfh.indir = data->indir_table; rxfh.key_size = data->hkey_size; @@ -92,15 +169,35 @@ rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, ret = ops->get_rxfh(dev, &rxfh); if (ret) - goto out_ops; + goto out_unlock; data->hfunc = rxfh.hfunc; data->input_xfrm = rxfh.input_xfrm; -out_ops: +out_unlock: + mutex_unlock(&dev->ethtool->rss_lock); ethnl_ops_complete(dev); return ret; } +static void +__rss_prepare_ctx(struct net_device *dev, struct rss_reply_data *data, + struct ethtool_rxfh_context *ctx) +{ + if (WARN_ON_ONCE(data->indir_size != ctx->indir_size || + data->hkey_size != ctx->key_size)) + return; + + data->no_key_fields = !dev->ethtool_ops->rxfh_per_ctx_key; + + data->hfunc = ctx->hfunc; + data->input_xfrm = ctx->input_xfrm; + memcpy(data->indir_table, ethtool_rxfh_context_indir(ctx), + data->indir_size * sizeof(u32)); + if (data->hkey_size) + memcpy(data->hkey, ethtool_rxfh_context_key(ctx), + data->hkey_size); +} + static int rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev, struct rss_reply_data *data, const struct genl_info *info) @@ -108,34 +205,51 @@ rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev, struct ethtool_rxfh_context *ctx; u32 total_size, indir_bytes; u8 *rss_config; + int ret; - data->no_key_fields = !dev->ethtool_ops->rxfh_per_ctx_key; - + mutex_lock(&dev->ethtool->rss_lock); ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context); - if (!ctx) - return -ENOENT; + if (!ctx) { + ret = -ENOENT; + goto out_unlock; + } data->indir_size = ctx->indir_size; data->hkey_size = ctx->key_size; - data->hfunc = ctx->hfunc; - data->input_xfrm = ctx->input_xfrm; indir_bytes = data->indir_size * sizeof(u32); total_size = indir_bytes + data->hkey_size; rss_config = kzalloc(total_size, GFP_KERNEL); - if (!rss_config) - return -ENOMEM; + if (!rss_config) { + ret = -ENOMEM; + goto out_unlock; + } data->indir_table = (u32 *)rss_config; - memcpy(data->indir_table, ethtool_rxfh_context_indir(ctx), indir_bytes); - - if (data->hkey_size) { + if (data->hkey_size) data->hkey = rss_config + indir_bytes; - memcpy(data->hkey, ethtool_rxfh_context_key(ctx), - data->hkey_size); - } - return 0; + __rss_prepare_ctx(dev, data, ctx); + + ret = 0; +out_unlock: + mutex_unlock(&dev->ethtool->rss_lock); + return ret; +} + +static int +rss_prepare(const struct rss_req_info *request, struct net_device *dev, + struct rss_reply_data *data, const struct genl_info *info) +{ + rss_prepare_flow_hash(request, dev, data, info); + + /* Coming from RSS_SET, driver may only have flow_hash_fields ops */ + if (!dev->ethtool_ops->get_rxfh) + return 0; + + if (request->rss_context) + return rss_prepare_ctx(request, dev, data, info); + return rss_prepare_get(request, dev, data, info); } static int @@ -153,14 +267,10 @@ rss_prepare_data(const struct ethnl_req_info *req_base, return -EOPNOTSUPP; /* Some drivers don't handle rss_context */ - if (request->rss_context) { - if (!ops->cap_rss_ctx_supported && !ops->create_rxfh_context) - return -EOPNOTSUPP; - - return rss_prepare_ctx(request, dev, data, info); - } + if (request->rss_context && !ops->create_rxfh_context) + return -EOPNOTSUPP; - return rss_prepare_get(request, dev, data, info); + return rss_prepare(request, dev, data, info); } static int @@ -174,7 +284,10 @@ rss_reply_size(const struct 
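rss_get_data_alloc() keeps the single-allocation trick: one zeroed buffer sized for both the indirection table and the hash key, with the key pointer carved out right after the indirection bytes, so rss_get_data_free() only has to free the table pointer. A small userspace sketch of the same carving, with arbitrary sizes:

#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

struct rss_buffers {
	uint32_t *indir;	/* indir_size entries */
	uint8_t  *key;		/* key_size bytes     */
};

static int rss_alloc(struct rss_buffers *b, size_t indir_size, size_t key_size)
{
	size_t indir_bytes = indir_size * sizeof(uint32_t);
	uint8_t *buf = calloc(1, indir_bytes + key_size);

	if (!buf)
		return -1;
	b->indir = (uint32_t *)buf;	/* table sits at the start ... */
	b->key = buf + indir_bytes;	/* ... key immediately after   */
	return 0;
}

static void rss_free(struct rss_buffers *b)
{
	free(b->indir);			/* one allocation, one free */
}

int main(void)
{
	struct rss_buffers b;

	if (rss_alloc(&b, 128, 40))
		return 1;
	memset(b.key, 0xaa, 40);
	printf("indir[0]=%u key[0]=%#x\n", (unsigned)b.indir[0], b.key[0]);
	rss_free(&b);
	return 0;
}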
ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ nla_total_size(sizeof(u32)) + /* _RSS_INPUT_XFRM */ nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ - nla_total_size(data->hkey_size); /* _RSS_HKEY */ + nla_total_size(data->hkey_size) + /* _RSS_HKEY */ + nla_total_size(0) + /* _RSS_FLOW_HASH */ + nla_total_size(sizeof(u32)) * ETHTOOL_A_FLOW_MAX + + 0; return len; } @@ -195,17 +308,34 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, sizeof(u32) * data->indir_size, data->indir_table))) return -EMSGSIZE; - if (data->no_key_fields) - return 0; - - if ((data->hfunc && - nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || - (data->input_xfrm && - nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) || - (data->hkey_size && - nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey))) + if (!data->no_key_fields && + ((data->hfunc && + nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || + (data->input_xfrm && + nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) || + (data->hkey_size && + nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey)))) return -EMSGSIZE; + if (data->has_flow_hash) { + struct nlattr *nest; + int i; + + nest = nla_nest_start(skb, ETHTOOL_A_RSS_FLOW_HASH); + if (!nest) + return -EMSGSIZE; + + for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) { + if (data->flow_hash[i] >= 0 && + nla_put_uint(skb, i, data->flow_hash[i])) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + } + + nla_nest_end(skb, nest); + } + return 0; } @@ -213,7 +343,7 @@ static void rss_cleanup_data(struct ethnl_reply_data *reply_base) { const struct rss_reply_data *data = RSS_REPDATA(reply_base); - kfree(data->indir_table); + rss_get_data_free(data); } struct rss_nl_dump_ctx { @@ -284,11 +414,7 @@ rss_dump_one_ctx(struct sk_buff *skb, struct netlink_callback *cb, if (ret < 0) goto err_cancel; - /* Context 0 is not currently storred or cached in the XArray */ - if (!rss_context) - ret = rss_prepare_get(&req, dev, &data, info); - else - ret = rss_prepare_ctx(&req, dev, &data, info); + ret = rss_prepare(&req, dev, &data, info); if (ret) goto err_cancel; @@ -358,6 +484,434 @@ int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return ret; } +/* RSS_NTF */ + +static void ethnl_rss_delete_notify(struct net_device *dev, u32 rss_context) +{ + struct sk_buff *ntf; + size_t ntf_size; + void *hdr; + + ntf_size = ethnl_reply_header_size() + + nla_total_size(sizeof(u32)); /* _RSS_CONTEXT */ + + ntf = genlmsg_new(ntf_size, GFP_KERNEL); + if (!ntf) + goto out_warn; + + hdr = ethnl_bcastmsg_put(ntf, ETHTOOL_MSG_RSS_DELETE_NTF); + if (!hdr) + goto out_free_ntf; + + if (ethnl_fill_reply_header(ntf, dev, ETHTOOL_A_RSS_HEADER) || + nla_put_u32(ntf, ETHTOOL_A_RSS_CONTEXT, rss_context)) + goto out_free_ntf; + + genlmsg_end(ntf, hdr); + if (ethnl_multicast(ntf, dev)) + goto out_warn; + + return; + +out_free_ntf: + nlmsg_free(ntf); +out_warn: + pr_warn_once("Failed to send a RSS delete notification"); +} + +void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) +{ + struct rss_req_info req_info = { + .rss_context = rss_context, + }; + + if (type == ETHTOOL_MSG_RSS_DELETE_NTF) + ethnl_rss_delete_notify(dev, rss_context); + else + ethnl_notify(dev, type, &req_info.base); +} + +/* RSS_SET */ + +#define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \ + RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \ + RXH_GTP_TEID | RXH_DISCARD) + +static const struct nla_policy 
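Every per-flow attribute in ethnl_rss_flows_policy is declared NLA_POLICY_MASK(NLA_UINT, RFH_MASK), so the netlink parser rejects any request carrying hash-field bits outside the RXH_* set before the values reach the driver. A tiny sketch of that style of mask validation; the bit names and mask are invented, not the RXH_* constants:

#include <stdio.h>

#define HASH_IP_SRC	0x01
#define HASH_IP_DST	0x02
#define HASH_L4_PORTS	0x04
#define ALLOWED_MASK	(HASH_IP_SRC | HASH_IP_DST | HASH_L4_PORTS)

/* mimic a mask policy check: reject values containing unknown bits */
static int validate_hash_fields(unsigned long long val)
{
	if (val & ~(unsigned long long)ALLOWED_MASK)
		return -1;	/* rejected before the handler ever runs */
	return 0;
}

int main(void)
{
	printf("%d\n", validate_hash_fields(HASH_IP_SRC | HASH_IP_DST));	/*  0 */
	printf("%d\n", validate_hash_fields(0x80));				/* -1 */
	return 0;
}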
ethnl_rss_flows_policy[] = { + [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), +}; + +const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = { + [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32, }, + [ETHTOOL_A_RSS_HFUNC] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RSS_INDIR] = { .type = NLA_BINARY, }, + [ETHTOOL_A_RSS_HKEY] = NLA_POLICY_MIN(NLA_BINARY, 1), + [ETHTOOL_A_RSS_INPUT_XFRM] = + NLA_POLICY_MAX(NLA_U32, RXH_XFRM_SYM_OR_XOR), + [ETHTOOL_A_RSS_FLOW_HASH] = NLA_POLICY_NESTED(ethnl_rss_flows_policy), +}; + +static int +ethnl_rss_set_validate(struct ethnl_req_info *req_info, struct genl_info *info) +{ + const struct ethtool_ops *ops = req_info->dev->ethtool_ops; + struct rss_req_info *request = RSS_REQINFO(req_info); + struct nlattr **tb = info->attrs; + struct nlattr *bad_attr = NULL; + u32 input_xfrm; + + if (request->rss_context && !ops->create_rxfh_context) + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_CONTEXT]; + + if (request->rss_context && !ops->rxfh_per_ctx_key) { + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HFUNC]; + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HKEY]; + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM]; + } + + input_xfrm = nla_get_u32_default(tb[ETHTOOL_A_RSS_INPUT_XFRM], 0); + if (input_xfrm & ~ops->supported_input_xfrm) + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM]; + + if (tb[ETHTOOL_A_RSS_FLOW_HASH] && !ops->set_rxfh_fields) + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_FLOW_HASH]; + if (request->rss_context && + tb[ETHTOOL_A_RSS_FLOW_HASH] && !ops->rxfh_per_ctx_fields) + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_FLOW_HASH]; + + if (bad_attr) { + NL_SET_BAD_ATTR(info->extack, bad_attr); + return -EOPNOTSUPP; + } + + return 1; +} + +static int +rss_set_prep_indir(struct net_device *dev, struct genl_info *info, + struct 
rss_reply_data *data, struct ethtool_rxfh_param *rxfh, + bool *reset, bool *mod) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct netlink_ext_ack *extack = info->extack; + struct nlattr **tb = info->attrs; + struct ethtool_rxnfc rx_rings; + size_t alloc_size; + u32 user_size; + int i, err; + + if (!tb[ETHTOOL_A_RSS_INDIR]) + return 0; + if (!data->indir_size || !ops->get_rxnfc) + return -EOPNOTSUPP; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + err = ops->get_rxnfc(dev, &rx_rings, NULL); + if (err) + return err; + + if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) { + NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]); + return -EINVAL; + } + user_size = nla_len(tb[ETHTOOL_A_RSS_INDIR]) / 4; + if (!user_size) { + if (rxfh->rss_context) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_RSS_INDIR], + "can't reset table for a context"); + return -EINVAL; + } + *reset = true; + } else if (data->indir_size % user_size) { + NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR], + "size (%d) mismatch with device indir table (%d)", + user_size, data->indir_size); + return -EINVAL; + } + + rxfh->indir_size = data->indir_size; + alloc_size = array_size(data->indir_size, sizeof(rxfh->indir[0])); + rxfh->indir = kzalloc(alloc_size, GFP_KERNEL); + if (!rxfh->indir) + return -ENOMEM; + + nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size); + for (i = 0; i < user_size; i++) { + if (rxfh->indir[i] < rx_rings.data) + continue; + + NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR], + "entry %d: queue out of range (%d)", + i, rxfh->indir[i]); + err = -EINVAL; + goto err_free; + } + + if (user_size) { + /* Replicate the user-provided table to fill the device table */ + for (i = user_size; i < data->indir_size; i++) + rxfh->indir[i] = rxfh->indir[i % user_size]; + } else { + for (i = 0; i < data->indir_size; i++) + rxfh->indir[i] = + ethtool_rxfh_indir_default(i, rx_rings.data); + } + + *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size); + + return 0; + +err_free: + kfree(rxfh->indir); + rxfh->indir = NULL; + return err; +} + +static int +rss_set_prep_hkey(struct net_device *dev, struct genl_info *info, + struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh, + bool *mod) +{ + struct nlattr **tb = info->attrs; + + if (!tb[ETHTOOL_A_RSS_HKEY]) + return 0; + + if (nla_len(tb[ETHTOOL_A_RSS_HKEY]) != data->hkey_size) { + NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_HKEY]); + return -EINVAL; + } + + rxfh->key_size = data->hkey_size; + rxfh->key = kmemdup(data->hkey, data->hkey_size, GFP_KERNEL); + if (!rxfh->key) + return -ENOMEM; + + ethnl_update_binary(rxfh->key, rxfh->key_size, tb[ETHTOOL_A_RSS_HKEY], + mod); + return 0; +} + +static int +rss_check_rxfh_fields_sym(struct net_device *dev, struct genl_info *info, + struct rss_reply_data *data, bool xfrm_sym) +{ + struct nlattr **tb = info->attrs; + int i; + + if (!xfrm_sym) + return 0; + if (!data->has_flow_hash) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RSS_INPUT_XFRM], + "hash field config not reported"); + return -EINVAL; + } + + for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) + if (data->flow_hash[i] >= 0 && + !ethtool_rxfh_config_is_sym(data->flow_hash[i])) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RSS_INPUT_XFRM], + "hash field config is not symmetric"); + return -EINVAL; + } + + return 0; +} + +static int +ethnl_set_rss_fields(struct net_device *dev, struct genl_info *info, + u32 rss_context, struct rss_reply_data *data, + bool xfrm_sym, bool *mod) +{ + struct nlattr *flow_nest = 
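rss_set_prep_indir() above accepts a user table whose length divides the device table size: each entry is checked against the RX ring count, the short table is then tiled across the full device table, and an empty attribute (outside an additional context) resets the table to the round-robin default. A standalone sketch of the tiling and the default fill; the table size and queue count are examples only:

#include <stdio.h>
#include <stdint.h>

#define DEV_INDIR_SIZE 8

/* Tile a user table of user_size entries over the device table, or fall
 * back to the i % num_queues default when user_size == 0. */
static int fill_indir(uint32_t *dev_tbl, const uint32_t *user_tbl,
		      unsigned int user_size, unsigned int num_queues)
{
	for (unsigned int i = 0; i < user_size; i++)
		if (user_tbl[i] >= num_queues)
			return -1;		/* queue out of range */

	for (unsigned int i = 0; i < DEV_INDIR_SIZE; i++)
		dev_tbl[i] = user_size ? user_tbl[i % user_size]
				       : i % num_queues;
	return 0;
}

int main(void)
{
	uint32_t dev_tbl[DEV_INDIR_SIZE];
	uint32_t user_tbl[] = { 0, 2 };		/* spread over queues 0 and 2 */

	if (fill_indir(dev_tbl, user_tbl, 2, 4))
		return 1;
	for (int i = 0; i < DEV_INDIR_SIZE; i++)
		printf("%u ", (unsigned)dev_tbl[i]);
	printf("\n");				/* 0 2 0 2 0 2 0 2 */
	return 0;
}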
info->attrs[ETHTOOL_A_RSS_FLOW_HASH]; + struct nlattr *flows[ETHTOOL_A_FLOW_MAX + 1]; + const struct ethtool_ops *ops; + int i, ret; + + ops = dev->ethtool_ops; + + ret = rss_check_rxfh_fields_sym(dev, info, data, xfrm_sym); + if (ret) + return ret; + + if (!flow_nest) + return 0; + + ret = nla_parse_nested(flows, ARRAY_SIZE(ethnl_rss_flows_policy) - 1, + flow_nest, ethnl_rss_flows_policy, info->extack); + if (ret < 0) + return ret; + + for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) { + struct ethtool_rxfh_fields fields = { + .flow_type = ethtool_rxfh_ft_nl2ioctl[i], + .rss_context = rss_context, + }; + + if (!flows[i]) + continue; + + fields.data = nla_get_u32(flows[i]); + if (data->has_flow_hash && data->flow_hash[i] == fields.data) + continue; + + if (xfrm_sym && !ethtool_rxfh_config_is_sym(fields.data)) { + NL_SET_ERR_MSG_ATTR(info->extack, flows[i], + "conflict with xfrm-input"); + return -EINVAL; + } + + ret = ops->set_rxfh_fields(dev, &fields, info->extack); + if (ret) + return ret; + + *mod = true; + } + + return 0; +} + +static void +rss_set_ctx_update(struct ethtool_rxfh_context *ctx, struct nlattr **tb, + struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh) +{ + int i; + + if (rxfh->indir) { + for (i = 0; i < data->indir_size; i++) + ethtool_rxfh_context_indir(ctx)[i] = rxfh->indir[i]; + ctx->indir_configured = !!nla_len(tb[ETHTOOL_A_RSS_INDIR]); + } + if (rxfh->key) { + memcpy(ethtool_rxfh_context_key(ctx), rxfh->key, + data->hkey_size); + ctx->key_configured = !!rxfh->key_size; + } + if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE) + ctx->hfunc = rxfh->hfunc; + if (rxfh->input_xfrm != RXH_XFRM_NO_CHANGE) + ctx->input_xfrm = rxfh->input_xfrm; +} + +static int +ethnl_rss_set(struct ethnl_req_info *req_info, struct genl_info *info) +{ + bool indir_reset = false, indir_mod, xfrm_sym = false; + struct rss_req_info *request = RSS_REQINFO(req_info); + struct ethtool_rxfh_context *ctx = NULL; + struct net_device *dev = req_info->dev; + bool mod = false, fields_mod = false; + struct ethtool_rxfh_param rxfh = {}; + struct nlattr **tb = info->attrs; + struct rss_reply_data data = {}; + const struct ethtool_ops *ops; + int ret; + + ops = dev->ethtool_ops; + data.base.dev = dev; + + ret = rss_prepare(request, dev, &data, info); + if (ret) + return ret; + + rxfh.rss_context = request->rss_context; + + ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_reset, &mod); + if (ret) + goto exit_clean_data; + indir_mod = !!tb[ETHTOOL_A_RSS_INDIR]; + + rxfh.hfunc = data.hfunc; + ethnl_update_u8(&rxfh.hfunc, tb[ETHTOOL_A_RSS_HFUNC], &mod); + if (rxfh.hfunc == data.hfunc) + rxfh.hfunc = ETH_RSS_HASH_NO_CHANGE; + + ret = rss_set_prep_hkey(dev, info, &data, &rxfh, &mod); + if (ret) + goto exit_free_indir; + + rxfh.input_xfrm = data.input_xfrm; + ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod); + /* For drivers which don't support input_xfrm it will be set to 0xff + * in the RSS context info. In all other case input_xfrm != 0 means + * symmetric hashing is requested. 
+ */ + if (!request->rss_context || ops->rxfh_per_ctx_key) + xfrm_sym = rxfh.input_xfrm || data.input_xfrm; + if (rxfh.input_xfrm == data.input_xfrm) + rxfh.input_xfrm = RXH_XFRM_NO_CHANGE; + + mutex_lock(&dev->ethtool->rss_lock); + if (request->rss_context) { + ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context); + if (!ctx) { + ret = -ENOENT; + goto exit_unlock; + } + } + + ret = ethnl_set_rss_fields(dev, info, request->rss_context, + &data, xfrm_sym, &fields_mod); + if (ret) + goto exit_unlock; + + if (!mod) + ret = 0; /* nothing to tell the driver */ + else if (!ops->set_rxfh) + ret = -EOPNOTSUPP; + else if (!rxfh.rss_context) + ret = ops->set_rxfh(dev, &rxfh, info->extack); + else + ret = ops->modify_rxfh_context(dev, ctx, &rxfh, info->extack); + if (ret) + goto exit_unlock; + + if (ctx) + rss_set_ctx_update(ctx, tb, &data, &rxfh); + else if (indir_reset) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (indir_mod) + dev->priv_flags |= IFF_RXFH_CONFIGURED; + +exit_unlock: + mutex_unlock(&dev->ethtool->rss_lock); + kfree(rxfh.key); +exit_free_indir: + kfree(rxfh.indir); +exit_clean_data: + rss_cleanup_data(&data.base); + + return ret ?: mod || fields_mod; +} + const struct ethnl_request_ops ethnl_rss_request_ops = { .request_cmd = ETHTOOL_MSG_RSS_GET, .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY, @@ -370,4 +924,282 @@ const struct ethnl_request_ops ethnl_rss_request_ops = { .reply_size = rss_reply_size, .fill_reply = rss_fill_reply, .cleanup_data = rss_cleanup_data, + + .set_validate = ethnl_rss_set_validate, + .set = ethnl_rss_set, + .set_ntf_cmd = ETHTOOL_MSG_RSS_NTF, +}; + +/* RSS_CREATE */ + +const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1] = { + [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_RSS_CONTEXT] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RSS_HFUNC] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RSS_INDIR] = NLA_POLICY_MIN(NLA_BINARY, 1), + [ETHTOOL_A_RSS_HKEY] = NLA_POLICY_MIN(NLA_BINARY, 1), + [ETHTOOL_A_RSS_INPUT_XFRM] = + NLA_POLICY_MAX(NLA_U32, RXH_XFRM_SYM_OR_XOR), }; + +static int +ethnl_rss_create_validate(struct net_device *dev, struct genl_info *info) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct nlattr **tb = info->attrs; + struct nlattr *bad_attr = NULL; + u32 rss_context, input_xfrm; + + if (!ops->create_rxfh_context) + return -EOPNOTSUPP; + + rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0); + if (ops->rxfh_max_num_contexts && + ops->rxfh_max_num_contexts <= rss_context) { + NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]); + return -ERANGE; + } + + if (!ops->rxfh_per_ctx_key) { + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HFUNC]; + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HKEY]; + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM]; + } + + input_xfrm = nla_get_u32_default(tb[ETHTOOL_A_RSS_INPUT_XFRM], 0); + if (input_xfrm & ~ops->supported_input_xfrm) + bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM]; + + if (bad_attr) { + NL_SET_BAD_ATTR(info->extack, bad_attr); + return -EOPNOTSUPP; + } + + return 0; +} + +static void +ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev) +{ + struct nlmsghdr *nlh = (void *)rsp->data; + struct genlmsghdr *genl_hdr; + + /* Convert the reply into a notification */ + nlh->nlmsg_pid = 0; + nlh->nlmsg_seq = ethnl_bcast_seq_next(); + + genl_hdr = nlmsg_data(nlh); + genl_hdr->cmd = ETHTOOL_MSG_RSS_CREATE_NTF; + + ethnl_multicast(rsp, dev); +} + +int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info 
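Both the set and create paths above load the current hfunc/input_xfrm, fold in the user's attribute, and put the "no change" sentinel back when the result is identical, so drivers are only asked to touch what really changed. A minimal userspace sketch of that update-or-no-change step; the helper and constant below are stand-ins for ethnl_update_u8() and ETH_RSS_HASH_NO_CHANGE, not the kernel's definitions:

#include <stdio.h>
#include <stdbool.h>

#define HASH_NO_CHANGE 0xff

/* apply an optional user value, tracking whether anything changed */
static void update_u8(unsigned char *dst, const unsigned char *user, bool *mod)
{
	if (user && *user != *dst) {
		*dst = *user;
		*mod = true;
	}
}

int main(void)
{
	unsigned char current_hfunc = 1;	/* e.g. Toeplitz */
	unsigned char requested = 1;		/* user asked for the same value */
	unsigned char hfunc = current_hfunc;
	bool mod = false;

	update_u8(&hfunc, &requested, &mod);
	if (hfunc == current_hfunc)
		hfunc = HASH_NO_CHANGE;		/* tell the driver not to touch it */

	printf("hfunc=%#x mod=%d\n", hfunc, mod);
	return 0;
}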
*info) +{ + bool indir_dflt = false, mod = false, ntf_fail = false; + struct ethtool_rxfh_param rxfh = {}; + struct ethtool_rxfh_context *ctx; + struct nlattr **tb = info->attrs; + struct rss_reply_data data = {}; + const struct ethtool_ops *ops; + struct rss_req_info req = {}; + struct net_device *dev; + struct sk_buff *rsp; + void *hdr; + u32 limit; + int ret; + + rsp = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + ret = ethnl_parse_header_dev_get(&req.base, tb[ETHTOOL_A_RSS_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + goto exit_free_rsp; + + dev = req.base.dev; + ops = dev->ethtool_ops; + + req.rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0); + + ret = ethnl_rss_create_validate(dev, info); + if (ret) + goto exit_free_dev; + + rtnl_lock(); + netdev_lock_ops(dev); + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto exit_dev_unlock; + + ret = rss_get_data_alloc(dev, &data); + if (ret) + goto exit_ops; + + ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_dflt, &mod); + if (ret) + goto exit_clean_data; + + ethnl_update_u8(&rxfh.hfunc, tb[ETHTOOL_A_RSS_HFUNC], &mod); + + ret = rss_set_prep_hkey(dev, info, &data, &rxfh, &mod); + if (ret) + goto exit_free_indir; + + rxfh.input_xfrm = RXH_XFRM_NO_CHANGE; + ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod); + + ctx = ethtool_rxfh_ctx_alloc(ops, data.indir_size, data.hkey_size); + if (!ctx) { + ret = -ENOMEM; + goto exit_free_hkey; + } + + mutex_lock(&dev->ethtool->rss_lock); + if (!req.rss_context) { + limit = ops->rxfh_max_num_contexts ?: U32_MAX; + ret = xa_alloc(&dev->ethtool->rss_ctx, &req.rss_context, ctx, + XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT); + } else { + ret = xa_insert(&dev->ethtool->rss_ctx, + req.rss_context, ctx, GFP_KERNEL_ACCOUNT); + } + if (ret < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT], + "error allocating context ID"); + goto err_unlock_free_ctx; + } + rxfh.rss_context = req.rss_context; + + ret = ops->create_rxfh_context(dev, ctx, &rxfh, info->extack); + if (ret) + goto err_ctx_id_free; + + /* Make sure driver populates defaults */ + WARN_ON_ONCE(!rxfh.key && ops->rxfh_per_ctx_key && + !memchr_inv(ethtool_rxfh_context_key(ctx), 0, + ctx->key_size)); + + /* Store the config from rxfh to Xarray.. */ + rss_set_ctx_update(ctx, tb, &data, &rxfh); + /* .. copy from Xarray to data. */ + __rss_prepare_ctx(dev, &data, ctx); + + hdr = ethnl_unicast_put(rsp, info->snd_portid, info->snd_seq, + ETHTOOL_MSG_RSS_CREATE_ACT_REPLY); + ntf_fail = ethnl_fill_reply_header(rsp, dev, ETHTOOL_A_RSS_HEADER); + ntf_fail |= rss_fill_reply(rsp, &req.base, &data.base); + if (WARN_ON(!hdr || ntf_fail)) { + ret = -EMSGSIZE; + goto exit_unlock; + } + + genlmsg_end(rsp, hdr); + + /* Use the same skb for the response and the notification, + * genlmsg_reply() will copy the skb if it has elevated user count. 
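ethnl_rss_create_doit() either claims the exact context ID the user asked for (xa_insert() fails if it is already taken) or lets xa_alloc() pick a free one from [1, rxfh_max_num_contexts - 1], with slot 0 reserved for the default context. A deliberately simplified sketch of that reserve-or-allocate choice over a plain array; the real XArray is, of course, far more involved:

#include <stdio.h>

#define MAX_CTX 8
static void *ctx_slot[MAX_CTX];	/* slot 0 reserved for the default context */

/* requested == 0: pick a free ID in [1, MAX_CTX); otherwise claim it. */
static int ctx_reserve(unsigned int requested, void *ctx, unsigned int *id)
{
	if (requested) {
		if (requested >= MAX_CTX || ctx_slot[requested])
			return -1;		/* out of range or busy */
		ctx_slot[requested] = ctx;
		*id = requested;
		return 0;
	}
	for (unsigned int i = 1; i < MAX_CTX; i++) {
		if (!ctx_slot[i]) {
			ctx_slot[i] = ctx;
			*id = i;
			return 0;
		}
	}
	return -1;				/* table full */
}

int main(void)
{
	unsigned int id;
	int dummy;

	if (!ctx_reserve(0, &dummy, &id))
		printf("auto-allocated context %u\n", id);	/* 1 */
	if (ctx_reserve(1, &dummy, &id))
		printf("context 1 already taken\n");
	return 0;
}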
+ */ + skb_get(rsp); + ret = genlmsg_reply(rsp, info); + ethnl_rss_create_send_ntf(rsp, dev); + rsp = NULL; + +exit_unlock: + mutex_unlock(&dev->ethtool->rss_lock); +exit_free_hkey: + kfree(rxfh.key); +exit_free_indir: + kfree(rxfh.indir); +exit_clean_data: + rss_get_data_free(&data); +exit_ops: + ethnl_ops_complete(dev); +exit_dev_unlock: + netdev_unlock_ops(dev); + rtnl_unlock(); +exit_free_dev: + ethnl_parse_header_dev_put(&req.base); +exit_free_rsp: + nlmsg_free(rsp); + return ret; + +err_ctx_id_free: + xa_erase(&dev->ethtool->rss_ctx, req.rss_context); +err_unlock_free_ctx: + kfree(ctx); + goto exit_unlock; +} + +/* RSS_DELETE */ + +const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1] = { + [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_RSS_CONTEXT] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ethtool_rxfh_context *ctx; + struct nlattr **tb = info->attrs; + struct ethnl_req_info req = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + u32 rss_context; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_RSS_CONTEXT)) + return -EINVAL; + rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]); + + ret = ethnl_parse_header_dev_get(&req, tb[ETHTOOL_A_RSS_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req.dev; + ops = dev->ethtool_ops; + + if (!ops->create_rxfh_context) + goto exit_free_dev; + + rtnl_lock(); + netdev_lock_ops(dev); + + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto exit_dev_unlock; + + mutex_lock(&dev->ethtool->rss_lock); + ret = ethtool_check_rss_ctx_busy(dev, rss_context); + if (ret) + goto exit_unlock; + + ctx = xa_load(&dev->ethtool->rss_ctx, rss_context); + if (!ctx) { + ret = -ENOENT; + goto exit_unlock; + } + + ret = ops->remove_rxfh_context(dev, ctx, rss_context, info->extack); + if (ret) + goto exit_unlock; + + WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rss_context) != ctx); + kfree(ctx); + + ethnl_rss_delete_notify(dev, rss_context); + +exit_unlock: + mutex_unlock(&dev->ethtool->rss_lock); + ethnl_ops_complete(dev); +exit_dev_unlock: + netdev_unlock_ops(dev); + rtnl_unlock(); +exit_free_dev: + ethnl_parse_header_dev_put(&req); + return ret; +} diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 8130b406ef107..8c654caa6805a 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -160,6 +160,12 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base, /* _TSINFO_HWTSTAMP_PROVIDER */ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); } + if (ts_info->phc_source) { + len += nla_total_size(sizeof(u32)); /* _TSINFO_HWTSTAMP_SOURCE */ + if (ts_info->phc_phyindex) + /* _TSINFO_HWTSTAMP_PHYINDEX */ + len += nla_total_size(sizeof(u32)); + } if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _TSINFO_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; @@ -259,6 +265,16 @@ static int tsinfo_fill_reply(struct sk_buff *skb, nla_nest_end(skb, nest); } + if (ts_info->phc_source) { + if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE, + ts_info->phc_source)) + return -EMSGSIZE; + + if (ts_info->phc_phyindex && + nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX, + ts_info->phc_phyindex)) + return -EMSGSIZE; + } if (req_base->flags & ETHTOOL_FLAG_STATS && tsinfo_put_stats(skb, &data->stats)) return -EMSGSIZE; @@ -346,6 +362,11 @@ static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb, if (ret < 0) goto err; + 
if (reply_data->ts_info.phc_index >= 0) { + reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB; + reply_data->ts_info.phc_phyindex = phydev->phyindex; + } + ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); if (ret < 0) goto err; @@ -389,6 +410,8 @@ static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, if (ret < 0) goto err; + if (reply_data->ts_info.phc_index >= 0) + reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV; ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); if (ret < 0) diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index d6f52839827ea..081093dfd5533 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -230,6 +230,12 @@ static int tls_handshake_accept(struct handshake_req *req, if (ret < 0) goto out_cancel; } + if (treq->th_keyring) { + ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_KEYRING, + treq->th_keyring); + if (ret < 0) + goto out_cancel; + } ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_AUTH_MODE, treq->th_auth_mode); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 1b1b700ec05ea..88657255fec12 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -408,7 +408,7 @@ static void hsr_announce(struct timer_list *t) struct hsr_port *master; unsigned long interval; - hsr = from_timer(hsr, t, announce_timer); + hsr = timer_container_of(hsr, t, announce_timer); rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); @@ -424,7 +424,8 @@ static void hsr_announce(struct timer_list *t) */ static void hsr_proxy_announce(struct timer_list *t) { - struct hsr_priv *hsr = from_timer(hsr, t, announce_proxy_timer); + struct hsr_priv *hsr = timer_container_of(hsr, t, + announce_proxy_timer); struct hsr_port *interlink; unsigned long interval = 0; struct hsr_node *node; @@ -761,6 +762,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto err_unregister; + if (protocol_version == PRP_V1) { + eth_hw_addr_set(slave[1], slave[0]->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, slave[1]); + } + if (interlink) { res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack); if (res) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 4ce471a2f3870..3a2a2fa7a0a39 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -617,7 +617,7 @@ static struct hsr_port *get_late_port(struct hsr_priv *hsr, */ void hsr_prune_nodes(struct timer_list *t) { - struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); + struct hsr_priv *hsr = timer_container_of(hsr, t, prune_timer); struct hsr_node *node; struct hsr_node *tmp; struct hsr_port *port; @@ -685,7 +685,7 @@ void hsr_prune_nodes(struct timer_list *t) void hsr_prune_proxy_nodes(struct timer_list *t) { - struct hsr_priv *hsr = from_timer(hsr, t, prune_proxy_timer); + struct hsr_priv *hsr = timer_container_of(hsr, t, prune_proxy_timer); unsigned long timestamp; struct hsr_node *node; struct hsr_node *tmp; diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index d7ae32473c41a..192893c3f2ec7 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -78,6 +78,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, eth_hw_addr_set(master->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); + + if (hsr->prot_version == PRP_V1) { + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + if (port) { + eth_hw_addr_set(port->dev, dev->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, + port->dev); + } + } } /* Make sure we recognize frames 
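The from_timer() to timer_container_of() conversions repeated across these files are mechanical renames: both recover the embedding structure from the timer_list pointer handed to the callback, i.e. a container_of(). A userspace sketch of the underlying pointer arithmetic; the types and names are stand-ins, not kernel structures:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct timer_list { int pending; };

struct hsr_like_priv {
	int cookie;
	struct timer_list announce_timer;
};

/* callback only receives the timer; recover the owning object from it */
static void announce(struct timer_list *t)
{
	struct hsr_like_priv *priv =
		container_of(t, struct hsr_like_priv, announce_timer);

	printf("cookie=%d\n", priv->cookie);
}

int main(void)
{
	struct hsr_like_priv priv = { .cookie = 42 };

	announce(&priv.announce_timer);
	return 0;
}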
from ourselves in hsr_rcv() */ diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 1bc47b17a2968..135ec5fce0196 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -155,6 +155,7 @@ struct hsr_port { struct hsr_priv *hsr; enum hsr_port_type type; struct rcu_head rcu; + unsigned char original_macaddress[ETH_ALEN]; }; struct hsr_frame_info; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 2a802a5de2acc..b87b6a6fe0700 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -196,6 +196,7 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, port->hsr = hsr; port->dev = dev; port->type = type; + ether_addr_copy(port->original_macaddress, dev->dev_addr); if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); @@ -232,6 +233,7 @@ void hsr_del_port(struct hsr_port *port) if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); + eth_hw_addr_set(port->dev, port->original_macaddress); } kfree_rcu(port, rcu); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index d4b983d170382..ddb6a5817d098 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -44,7 +44,7 @@ static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) static void lowpan_frag_expire(struct timer_list *t) { - struct inet_frag_queue *frag = from_timer(frag, t, timer); + struct inet_frag_queue *frag = timer_container_of(frag, t, timer); struct frag_queue *fq; int refs = 1; diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 359249ab77bf3..4c07a475c567e 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -224,10 +224,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) dev_hold(dev); if (info->attrs[IEEE802154_ATTR_HW_ADDR]) { - struct sockaddr addr; + struct sockaddr_storage addr; - addr.sa_family = ARPHRD_IEEE802154; - nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR], + addr.ss_family = ARPHRD_IEEE802154; + nla_memcpy(&addr.__data, info->attrs[IEEE802154_ATTR_HW_ADDR], IEEE802154_ADDR_LEN); /* strangely enough, some callbacks (inetdev_event) from diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6d2c97f8e9ef9..12850a277251d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -425,7 +425,7 @@ config INET_DIAG tristate "INET: socket monitoring interface" default y help - Support for INET (TCP, DCCP, etc) socket monitoring interface used by + Support for INET (TCP, UDP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5df1f1325259d..76e38092cd8a3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1328,10 +1328,7 @@ int inet_sk_rebuild_header(struct sock *sk) /* Routing failed... */ sk->sk_route_caps = 0; - /* - * Other protocols have to map its equivalent state to TCP_SYN_SENT. - * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. 
-acme - */ + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a648fff71ea7d..5cfc1c9396732 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -864,7 +864,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && - pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { + pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -966,6 +966,7 @@ static int arp_is_multicast(const void *pkey) static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { + enum skb_drop_reason drop_reason; const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ @@ -979,12 +980,15 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ - if (!pskb_may_pull(skb, arp_hdr_len(dev))) + drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev)); + if (drop_reason != SKB_NOT_DROPPED_YET) goto freeskb; arp = arp_hdr(skb); - if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) + if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) { + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; goto freeskb; + } memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); @@ -996,7 +1000,7 @@ consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); out_of_mem: return NET_RX_DROP; } @@ -1085,9 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) - return -ENOBUFS; - return 0; + return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1); diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 4b5bc6eb52e75..c2b2cda1a7e50 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -109,7 +109,7 @@ void ip4_datagram_release_cb(struct sock *sk) rcu_read_lock(); dst = __sk_dst_get(sk); - if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { + if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, 0)) { rcu_read_unlock(); return; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 77e5705ac799e..c47d3828d4f65 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1792,12 +1792,12 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3f4e629998fab..6e1b94796f67a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -807,7 +807,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), - extack, false); + extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); @@ -825,7 
+825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack, false); + extack); if (err < 0) goto errout; break; @@ -948,12 +948,12 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, if (filter->rtnl_held) ASSERT_RTNL(); - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } - rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); @@ -1524,7 +1524,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: - flags = dev_get_flags(dev); + flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f68bb9e34c34c..a5f3c8459758f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -365,7 +365,7 @@ static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) { /* The second half is used for prefsrc */ - return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *), + return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head), GFP_KERNEL); } @@ -625,11 +625,6 @@ int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, if (encap) { struct lwtunnel_state *lwtstate; - if (encap_type == LWTUNNEL_ENCAP_NONE) { - NL_SET_ERR_MSG(extack, "LWT encap type not specified"); - err = -EINVAL; - goto lwt_failure; - } err = lwtunnel_build_state(net, encap_type, encap, nhc->nhc_family, cfg, &lwtstate, extack); @@ -1640,8 +1635,7 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) goto nla_put_failure; - if (nhc->nhc_lwtstate && - lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, + if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; @@ -2093,7 +2087,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) return 0; if (nh_flags & RTNH_F_DEAD) { - unsigned int flags = dev_get_flags(dev); + unsigned int flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) nh_flags |= RTNH_F_LINKDOWN; @@ -2168,34 +2162,52 @@ static bool fib_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -void fib_select_multipath(struct fib_result *res, int hash) +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; - bool first = false; + bool found = false; + bool use_neigh; + __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } + use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); + saddr = fl4 ? fl4->saddr : 0; + change_nexthops(fi) { - if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { - if (!fib_good_nh(nexthop_nh)) - continue; - if (!first) { - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - first = true; - } + int nh_upper_bound; + + /* Nexthops without a carrier are assigned an upper bound of + * minus one when "ignore_routes_with_linkdown" is set. 
+ */ + nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound); + if (nh_upper_bound == -1 || + (use_neigh && !fib_good_nh(nexthop_nh))) + continue; + + if (!found) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + found = !saddr || nexthop_nh->nh_saddr == saddr; } - if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) + if (hash > nh_upper_bound) continue; - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - return; + if (!saddr || nexthop_nh->nh_saddr == saddr) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + return; + } + + if (found) + return; + } endfor_nexthops(fi); } #endif @@ -2210,7 +2222,7 @@ void fib_select_path(struct net *net, struct fib_result *res, if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); - fib_select_multipath(res, h); + fib_select_multipath(res, h, fl4); } else #endif diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 6701a98d9a9ff..dafd68f3436a2 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -199,7 +199,7 @@ static const struct net_protocol net_gre_protocol = { static int __init gre_init(void) { - pr_info("GRE over IPv4 demultiplexor driver\n"); + pr_info("GRE over IPv4 demultiplexer driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 717cb7d3607a1..2ffe73ea644ff 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -311,18 +311,20 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, { struct dst_entry *dst = &rt->dst; struct inet_peer *peer; + struct net_device *dev; bool rc = true; if (!apply_ratelimit) return true; /* No rate limit on loopback */ - if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) + dev = dst_dev(dst); + if (dev && (dev->flags & IFF_LOOPBACK)) goto out; rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, - l3mdev_master_ifindex_rcu(dst->dev)); + l3mdev_master_ifindex_rcu(dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); rcu_read_unlock(); @@ -466,13 +468,13 @@ out_bh_enable: */ static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) { - struct net_device *route_lookup_dev = NULL; + struct net_device *dev = skb->dev; + const struct dst_entry *dst; - if (skb->dev) - route_lookup_dev = skb->dev; - else if (skb_dst(skb)) - route_lookup_dev = skb_dst(skb)->dev; - return route_lookup_dev; + if (dev) + return dev; + dst = skb_dst(skb); + return dst ? dst_dev(dst) : NULL; } static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, @@ -869,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) struct net *net; u32 info = 0; - net = dev_net_rcu(skb_dst(skb)->dev); + net = skb_dst_dev_net_rcu(skb); /* * Incomplete header ? @@ -1012,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) struct icmp_bxm icmp_param; struct net *net; - net = dev_net_rcu(skb_dst(skb)->dev); + net = skb_dst_dev_net_rcu(skb); /* should there be an ICMP stat for ignored echos? 
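fib_select_multipath() now takes the flow's source address and, among usable next hops, prefers one whose nh_saddr matches it: the first usable hop is kept as a provisional answer, and a hop inside the hash region wins only if it also matches the source address (or no address constraint was given). A standalone rendering of that selection loop; the struct, weights, hash and addresses below are invented for illustration:

#include <stdio.h>
#include <stdint.h>

struct nh {
	int upper_bound;	/* -1 means unusable (link down) */
	uint32_t saddr;		/* preferred source address of this hop */
};

static int select_path(const struct nh *nhs, int n, int hash, uint32_t saddr)
{
	int sel = -1;
	int found = 0;		/* does the provisional hop match saddr? */

	for (int i = 0; i < n; i++) {
		if (nhs[i].upper_bound == -1)
			continue;		/* skip dead hops */
		if (sel < 0) {
			sel = i;		/* provisional: first usable hop */
			found = !saddr || nhs[i].saddr == saddr;
		}
		if (hash > nhs[i].upper_bound)
			continue;
		if (!saddr || nhs[i].saddr == saddr)
			return i;		/* hash region and address agree */
		if (found)
			return sel;		/* keep the address-matching hop */
	}
	return sel;
}

int main(void)
{
	struct nh nhs[] = {
		{ .upper_bound = 100, .saddr = 0x0a000001 },
		{ .upper_bound = 200, .saddr = 0x0a000002 },
	};

	printf("%d\n", select_path(nhs, 2, 150, 0x0a000002));	/* 1 */
	printf("%d\n", select_path(nhs, 2,  50, 0));		/* 0 */
	return 0;
}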
*/ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; @@ -1182,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) return SKB_NOT_DROPPED_YET; out_err: - __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index ca7d539b38463..7182f1419c2a4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -427,7 +427,7 @@ static int igmpv3_sendpack(struct sk_buff *skb) pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) @@ -801,7 +801,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, static void igmp_gq_timer_expire(struct timer_list *t) { - struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer); + struct in_device *in_dev = timer_container_of(in_dev, t, mr_gq_timer); in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); @@ -810,7 +810,7 @@ static void igmp_gq_timer_expire(struct timer_list *t) static void igmp_ifc_timer_expire(struct timer_list *t) { - struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); + struct in_device *in_dev = timer_container_of(in_dev, t, mr_ifc_timer); u32 mr_ifc_count; igmpv3_send_cr(in_dev); @@ -840,7 +840,7 @@ static void igmp_ifc_event(struct in_device *in_dev) static void igmp_timer_expire(struct timer_list *t) { - struct ip_mc_list *im = from_timer(im, t, timer); + struct ip_mc_list *im = timer_container_of(im, t, timer); struct in_device *in_dev = im->interface; spin_lock(&im->lock); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dd5cf8914a28d..1e2df51427fed 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -168,7 +168,7 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) } static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, - kuid_t sk_uid, bool relax, + kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { int bound_dev_if2; @@ -185,12 +185,12 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, if (!relax || (!reuseport_ok && sk->sk_reuseport && sk2->sk_reuseport && reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || - uid_eq(sk_uid, sock_i_uid(sk2))))) + uid_eq(uid, sk_uid(sk2))))) return true; } else if (!reuseport_ok || !sk->sk_reuseport || !sk2->sk_reuseport || !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && - !uid_eq(sk_uid, sock_i_uid(sk2)))) { + !uid_eq(uid, sk_uid(sk2)))) { return true; } } @@ -198,7 +198,7 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, } static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, - kuid_t sk_uid, bool relax, + kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { if (ipv6_only_sock(sk2)) { @@ -211,20 +211,20 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, #endif } - return inet_bind_conflict(sk, sk2, sk_uid, relax, + return inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok); } static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, - kuid_t sk_uid, + kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { struct sock *sk2; sk_for_each_bound(sk2, 
&tb2->owners) { - if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax, + if (__inet_bhash2_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } @@ -242,8 +242,8 @@ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { - kuid_t uid = sock_i_uid((struct sock *)sk); struct sock_reuseport *reuseport_cb; + kuid_t uid = sk_uid(sk); bool reuseport_cb_ok; struct sock *sk2; @@ -287,11 +287,11 @@ static int inet_csk_bind_conflict(const struct sock *sk, static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev, bool relax, bool reuseport_ok) { - kuid_t uid = sock_i_uid((struct sock *)sk); const struct net *net = sock_net(sk); struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; + kuid_t uid = sk_uid(sk); bool conflict = false; bool reuseport_cb_ok; @@ -330,7 +330,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); @@ -425,15 +425,13 @@ success: static inline int sk_reuseport_match(struct inet_bind_bucket *tb, struct sock *sk) { - kuid_t uid = sock_i_uid(sk); - if (tb->fastreuseport <= 0) return 0; if (!sk->sk_reuseport) return 0; if (rcu_access_pointer(sk->sk_reuseport_cb)) return 0; - if (!uid_eq(tb->fastuid, uid)) + if (!uid_eq(tb->fastuid, sk_uid(sk))) return 0; /* We only need to check the rcv_saddr if this tb was once marked * without fastreuseport and then was reset, as we can only know that @@ -458,14 +456,13 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk) { - kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; if (hlist_empty(&tb->bhash2)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; - tb->fastuid = uid; + tb->fastuid = sk_uid(sk); tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; @@ -492,7 +489,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, */ if (!sk_reuseport_match(tb, sk)) { tb->fastreuseport = FASTREUSEPORT_STRICT; - tb->fastuid = uid; + tb->fastuid = sk_uid(sk); tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; @@ -512,10 +509,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; @@ -767,7 +764,6 @@ void inet_csk_init_xmit_timers(struct sock *sk, timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } -EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { @@ -780,7 
+776,6 @@ void inet_csk_clear_xmit_timers(struct sock *sk) sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } -EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_clear_xmit_timers_sync(struct sock *sk) { @@ -814,7 +809,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sk->sk_uid); + htons(ireq->ir_num), sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -831,7 +826,6 @@ no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } -EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, @@ -852,7 +846,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sk->sk_uid); + htons(ireq->ir_num), sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -890,16 +884,6 @@ static void syn_ack_recalc(struct request_sock *req, req->num_timeout >= rskq_defer_accept - 1; } -int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) -{ - int err = req->rsk_ops->rtx_syn_ack(parent, req); - - if (!err) - req->num_retrans++; - return err; -} -EXPORT_SYMBOL(inet_rtx_syn_ack); - static struct request_sock * reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) @@ -1026,9 +1010,10 @@ static bool reqsk_queue_unlink(struct request_sock *req) bool found = false; if (sk_hashed(sk)) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); - spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); + spinlock_t *lock; + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); @@ -1058,18 +1043,17 @@ bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { return __inet_csk_reqsk_queue_drop(sk, req, false); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); +EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { - struct request_sock *req = from_timer(req, t, rsk_timer); + struct request_sock *req = timer_container_of(req, t, rsk_timer); struct request_sock *nreq = NULL, *oreq = req; struct sock *sk_listener = req->rsk_listener; struct inet_connection_sock *icsk; @@ -1139,7 +1123,7 @@ static void reqsk_timer_handler(struct timer_list *t) req->rsk_ops->syn_ack_timeout(req); if (!expire && (!resend || - !inet_rtx_syn_ack(sk_listener, req) || + !tcp_rtx_synack(sk_listener, req) || inet_rsk(req)->acked)) { if (req->num_timeout++ == 0) atomic_dec(&queue->young); @@ -1209,7 +1193,6 @@ bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, inet_csk_reqsk_queue_added(sk); return true; } -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t 
priority) @@ -1290,7 +1273,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, return newsk; } -EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this @@ -1322,7 +1304,7 @@ void inet_csk_destroy_sock(struct sock *sk) EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to - * tcp/dccp_create_openreq_child(). + * tcp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) @@ -1380,7 +1362,6 @@ int inet_csk_listen_start(struct sock *sk) inet_sk_set_state(sk, TCP_CLOSE); return err; } -EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) @@ -1475,7 +1456,6 @@ child_put: sock_put(child); return NULL; } -EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially @@ -1590,4 +1570,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) out: return dst; } -EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c2bb91d9e9ffd..2fa53b16fe778 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -160,7 +160,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, ext & (1 << (INET_DIAG_TCLASS - 1))) { u32 classid = 0; -#ifdef CONFIG_SOCK_CGROUP_DATA +#ifdef CONFIG_CGROUP_NET_CLASSID classid = sock_cgroup_classid(&sk->sk_cgrp_data); #endif /* Fallback to socket priority if class id isn't set. @@ -181,7 +181,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, goto errout; #endif - r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->idiag_uid = from_kuid_munged(user_ns, sk_uid(sk)); r->idiag_inode = sock_i_ino(sk); memset(&inet_sockopt, 0, sizeof(inet_sockopt)); @@ -1369,8 +1369,6 @@ static int inet_diag_type2proto(int type) switch (type) { case TCPDIAG_GETSOCK: return IPPROTO_TCP; - case DCCPDIAG_GETSOCK: - return IPPROTO_DCCP; default: return 0; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 5bf163f756e95..ceeeec9b7290a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -23,11 +23,12 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/inet6_hashtables.h> #endif -#include <net/secure_seq.h> #include <net/hotdata.h> #include <net/ip.h> -#include <net/tcp.h> +#include <net/rps.h> +#include <net/secure_seq.h> #include <net/sock_reuseport.h> +#include <net/tcp.h> u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -176,7 +177,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, */ static void __inet_put_port(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; @@ -215,7 +216,7 @@ EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { - struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *table = tcp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; @@ -668,7 +669,7 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk, */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - struct 
inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; @@ -713,15 +714,15 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) } return ok; } -EXPORT_SYMBOL_GPL(inet_ehash_nolisten); +EXPORT_IPV6_MOD(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; const struct hlist_nulls_node *node; + kuid_t uid = sk_uid(sk); struct sock *sk2; - kuid_t uid = sock_i_uid(sk); sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && @@ -729,7 +730,7 @@ static int inet_reuseport_add_sock(struct sock *sk, ipv6_only_sock(sk2) == ipv6_only_sock(sk) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && - sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); @@ -740,7 +741,7 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; @@ -771,7 +772,7 @@ unlock: return err; } -EXPORT_SYMBOL(__inet_hash); +EXPORT_IPV6_MOD(__inet_hash); int inet_hash(struct sock *sk) { @@ -782,15 +783,15 @@ int inet_hash(struct sock *sk) return err; } -EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); if (sk_unhashed(sk)) return; + sock_rps_delete_flow(sk); if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; @@ -823,7 +824,7 @@ void inet_unhash(struct sock *sk) spin_unlock_bh(lock); } } -EXPORT_SYMBOL_GPL(inet_unhash); +EXPORT_IPV6_MOD(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, @@ -874,7 +875,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) @@ -902,7 +903,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family) static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); @@ -982,14 +983,14 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } -EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +EXPORT_IPV6_MOD(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } -EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); +EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. 
Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') @@ -1214,7 +1215,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } -EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { @@ -1265,7 +1265,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h) init_hashinfo_lhash2(h); return 0; } -EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { @@ -1305,7 +1304,6 @@ set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } -EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) @@ -1341,7 +1339,6 @@ free_hashinfo: err: return NULL; } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { @@ -1352,4 +1349,3 @@ void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) vfree(hashinfo->ehash); kfree(hashinfo); } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index aded4bf1bc16d..875ff923a8ed0 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -166,11 +166,10 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, spin_unlock(lock); local_bh_enable(); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { - struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); + struct inet_timewait_sock *tw = timer_container_of(tw, t, tw_timer); inet_twsk_kill(tw); } @@ -223,7 +222,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, return tw; } -EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. @@ -306,7 +304,6 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo) @@ -365,4 +362,3 @@ restart: rcu_read_unlock(); } } -EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 77f395b28ec74..b2584cce90ae1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -123,7 +123,7 @@ static bool frag_expire_skip_icmp(u32 user) static void ip_expire(struct timer_list *t) { enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; - struct inet_frag_queue *frag = from_timer(frag, t, timer); + struct inet_frag_queue *frag = timer_container_of(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; @@ -476,7 +476,7 @@ out_fail: /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { - struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; + struct net_device *dev = skb->dev ? 
: skb_dst_dev(skb); int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 26d15f907551e..f5b9004d6938b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1066,16 +1066,15 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit_batch_rtnl = ipgre_exit_batch_rtnl, + .exit_rtnl = ipgre_exit_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1752,16 +1751,15 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_tap_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops, - dev_to_kill); + ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl, + .exit_rtnl = ipgre_tap_exit_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1772,16 +1770,15 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit erspan_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit_batch_rtnl = erspan_exit_batch_rtnl, + .exit_rtnl = erspan_exit_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 30a5e9460d006..fc323994b1fa0 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -226,6 +226,12 @@ resubmit: static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); + return 0; + } + skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); @@ -319,8 +325,8 @@ static int ip_rcv_finish_core(struct net *net, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); - int err, drop_reason; struct rtable *rt; + int drop_reason; if (ip_can_use_hint(skb, iph, hint)) { drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, @@ -345,9 +351,10 @@ static int ip_rcv_finish_core(struct net *net, break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { - err = udp_v4_early_demux(skb); - if (unlikely(err)) + drop_reason = udp_v4_early_demux(skb); + if (unlikely(drop_reason)) goto drop_error; + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; /* must reload iph, skb->head 
might have changed */ iph = ip_hdr(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e18d7ec50624..84e7f8a2f50f5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -116,7 +116,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } @@ -199,7 +199,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; @@ -425,15 +425,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; + struct net_device *dev, *indev = skb->dev; + int ret_val; + rcu_read_lock(); + dev = skb_dst_dev_rcu(skb); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip_finish_output, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); + rcu_read_unlock(); + return ret_val; } EXPORT_SYMBOL(ip_output); @@ -1014,7 +1019,8 @@ static int __ip_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1221,8 +1227,7 @@ alloc_new_skb: if (WARN_ON_ONCE(copy > msg->msg_iter.count)) goto error; - err = skb_splice_from_iter(skb, &msg->msg_iter, copy, - sk->sk_allocation); + err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) goto error; copy = err; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 1024f961ec9ad..aaeb5d16f0c9a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -243,11 +243,11 @@ static struct net_device *__ip_tunnel_create(struct net *net, if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; - strscpy(name, parms->name, IFNAMSIZ); + strscpy(name, parms->name); } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; - strcpy(name, ops->kind); + strscpy(name, ops->kind); strcat(name, "%d"); } @@ -668,7 +668,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_adj_headroom(dev, headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return; tx_error: DEV_STATS_INC(dev, tx_errors); @@ -857,7 +857,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_adj_headroom(dev, max_headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return; #if IS_ENABLED(CONFIG_IPV6) @@ -1174,13 +1174,16 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net 
*itn, - struct list_head *head, - struct rtnl_link_ops *ops) +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *head) { + struct ip_tunnel_net *itn = net_generic(net, id); struct net_device *dev, *aux; int h; + ASSERT_RTNL_NET(net); + for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); @@ -1198,21 +1201,7 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, unregister_netdevice_queue(t->dev, head); } } - -void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill) -{ - struct ip_tunnel_net *itn; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - itn = net_generic(net, id); - ip_tunnel_destroy(net, itn, dev_to_kill, ops); - } -} -EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f65d2f7273813..cc9915543637d 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -49,7 +49,8 @@ EXPORT_SYMBOL(ip6tun_encaps); void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, - __u8 tos, __u8 ttl, __be16 df, bool xnet) + __u8 tos, __u8 ttl, __be16 df, bool xnet, + u16 ipcb_flags) { int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); @@ -62,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_clear_hash_if_not_l4(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + IPCB(skb)->flags = ipcb_flags; /* Push down and install the IP header. 
*/ skb_push(skb, sizeof(struct iphdr)); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 159b4473290e0..95b6bb78fcd27 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -229,7 +229,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error_icmp; } - tdev = dst->dev; + tdev = dst_dev(dst); if (tdev == dev) { dst_release(dst); @@ -259,7 +259,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, xmit: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); - skb->dev = skb_dst(skb)->dev; + skb->dev = skb_dst_dev(skb); err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) @@ -523,16 +523,15 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit vti_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit_batch_rtnl = vti_exit_batch_rtnl, + .exit_rtnl = vti_exit_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 5a4fb2539b08b..9a45aed508d19 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -54,6 +54,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info) } /* We always hold one tunnel user reference to indicate a tunnel */ +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -62,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPIP; t->id.spi = x->props.saddr.a4; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index c56b6fe6f0d77..22a7889876c1c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -274,9 +274,9 @@ static int __init ic_open_devs(void) /* wait for a carrier on at least one device */ start = jiffies; - next_msg = start + msecs_to_jiffies(20000); + next_msg = start + secs_to_jiffies(20); while (time_before(jiffies, start + - msecs_to_jiffies(carrier_timeout * 1000))) { + secs_to_jiffies(carrier_timeout))) { int wait, elapsed; rtnl_lock(); @@ -295,7 +295,7 @@ static int __init ic_open_devs(void) elapsed = jiffies_to_msecs(jiffies - start); wait = (carrier_timeout * 1000 - elapsed + 500) / 1000; pr_info("Waiting up to %d more seconds for network.\n", wait); - next_msg = jiffies + msecs_to_jiffies(20000); + next_msg = jiffies + secs_to_jiffies(20); } have_carrier: diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index bab0bf90c908d..3e03af073a1cc 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -604,16 +604,15 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipip_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - 
.exit_batch_rtnl = ipip_exit_batch_rtnl, + .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 85dc208f32e99..e86a8a862c411 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -765,7 +765,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) /* Timer process for the unresolved queue. */ static void ipmr_expire_process(struct timer_list *t) { - struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); + struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer); struct mr_mfc *c, *next; unsigned long expires; unsigned long now; @@ -901,7 +901,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0), (VIFF_TUNNEL | VIFF_REGISTER)); - err = dev_get_port_parent_id(dev, &ppid, true); + err = netif_get_port_parent_id(dev, &ppid, true); if (err == 0) { memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len); v->dev_parent_id.id_len = ppid.id_len; @@ -1853,20 +1853,19 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, /* Processing handlers for ipmr_forward, under rcu_read_lock() */ -static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - int in_vifi, struct sk_buff *skb, int vifi) +static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; - struct net_device *dev; struct rtable *rt; struct flowi4 fl4; int encap = 0; vif_dev = vif_dev_read(vif); if (!vif_dev) - goto out_free; + return -1; if (vif->flags & VIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); @@ -1874,12 +1873,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); - goto out_free; + return -1; } - if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) - goto out_free; - if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -1887,7 +1883,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, IPPROTO_IPIP, iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) - goto out_free; + return -1; encap = sizeof(struct iphdr); } else { rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, @@ -1895,11 +1891,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, IPPROTO_IPIP, iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) - goto out_free; + return -1; } - dev = rt->dst.dev; - if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. 
Alas, IPv4 does not * allow to send ICMP, so that packets will disappear @@ -1907,14 +1901,14 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, */ IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); - goto out_free; + return -1; } - encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len; + encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); - goto out_free; + return -1; } WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); @@ -1934,6 +1928,22 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); } + return 0; +} + +static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt, + int in_vifi, struct sk_buff *skb, int vifi) +{ + struct rtable *rt; + + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + + if (ipmr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; + + rt = skb_rtable(skb); + IPCB(skb)->flags |= IPSKB_FORWARDED; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally @@ -1947,7 +1957,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * result in receiving multiple packets. */ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, - net, NULL, skb, skb->dev, dev, + net, NULL, skb, skb->dev, rt->dst.dev, ipmr_forward_finish); return; @@ -1955,6 +1965,19 @@ out_free: kfree_skb(skb); } +static void ipmr_queue_output_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + if (ipmr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; + + ip_mc_output(net, NULL, skb); + return; + +out_free: + kfree_skb(skb); +} + /* Called with mrt_lock or rcu_read_lock() */ static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev) { @@ -2065,8 +2088,8 @@ forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, true_vifi, - skb2, psend); + ipmr_queue_fwd_xmit(net, mrt, true_vifi, + skb2, psend); } psend = ct; } @@ -2077,10 +2100,10 @@ last_forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, true_vifi, skb2, - psend); + ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb2, + psend); } else { - ipmr_queue_xmit(net, mrt, true_vifi, skb, psend); + ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb, psend); return; } } @@ -2214,6 +2237,110 @@ dont_forward: return 0; } +static void ip_mr_output_finish(struct net *net, struct mr_table *mrt, + struct net_device *dev, struct sk_buff *skb, + struct mfc_cache *c) +{ + int psend = -1; + int ct; + + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); + + /* Forward the frame */ + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == htonl(INADDR_ANY)) { + if (ip_hdr(skb)->ttl > + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. 
+ */ + psend = c->_c.mfc_parent; + goto last_xmit; + } + goto dont_xmit; + } + + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { + if (ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_output_xmit(net, mrt, + skb2, psend); + } + psend = ct; + } + } + +last_xmit: + if (psend != -1) { + ipmr_queue_output_xmit(net, mrt, skb, psend); + return; + } + +dont_xmit: + kfree_skb(skb); +} + +/* Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); + */ +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct mfc_cache *cache; + struct net_device *dev; + struct mr_table *mrt; + int vif; + + guard(rcu)(); + + dev = rt->dst.dev; + + if (IPCB(skb)->flags & IPSKB_FORWARDED) + goto mc_output; + if (!(IPCB(skb)->flags & IPSKB_MCROUTE)) + goto mc_output; + + skb->dev = dev; + + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) + goto mc_output; + + cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + if (!cache) { + vif = ipmr_find_vif(mrt, dev); + if (vif >= 0) + cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, + vif); + } + + /* No usable cache entry */ + if (!cache) { + vif = ipmr_find_vif(mrt, dev); + if (vif >= 0) + return ipmr_cache_unresolved(mrt, vif, skb, dev); + goto mc_output; + } + + vif = cache->_c.mfc_parent; + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) + goto mc_output; + + ip_mr_output_finish(net, mrt, dev, skb, cache); + return 0; + +mc_output: + return ip_mc_output(net, sk, skb); +} + #ifdef CONFIG_IP_PIMSM_V1 /* Handle IGMP messages of PIMv1 */ int pim_rcv_v1(struct sk_buff *skb) @@ -2501,7 +2628,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); return -EINVAL; } @@ -2510,7 +2638,6 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || @@ -2826,7 +2953,8 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); return -EINVAL; } @@ -2836,7 +2964,6 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 08bc3f2c0078b..0565f001120dc 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -20,12 +20,12 @@ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type) { + struct net_device *dev = skb_dst_dev(skb); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = 
iph->saddr; __u8 flags; - struct net_device *dev = skb_dst(skb)->dev; struct flow_keys flkeys; unsigned int hh_len; @@ -74,7 +74,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un #endif /* Change in oif may mean change in hh_len. */ - hh_len = skb_dst(skb)->dev->hard_header_len; + hh_len = skb_dst_dev(skb)->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ef8009281da5c..2c438b140e88f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -13,8 +13,8 @@ config NF_DEFRAG_IPV4 # old sockopt interface and eval loop config IP_NF_IPTABLES_LEGACY tristate "Legacy IP tables support" - default n - select NETFILTER_XTABLES + depends on NETFILTER_XTABLES_LEGACY + default m if NETFILTER_XTABLES_LEGACY help iptables is a legacy packet classifier. This is not needed if you are using iptables over nftables @@ -182,8 +182,8 @@ config IP_NF_MATCH_TTL # `filter', generic and specific targets config IP_NF_FILTER tristate "Packet filtering" - default m if NETFILTER_ADVANCED=n - select IP_NF_IPTABLES_LEGACY + default m if NETFILTER_ADVANCED=n || IP_NF_IPTABLES_LEGACY + depends on IP_NF_IPTABLES_LEGACY help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -220,10 +220,10 @@ config IP_NF_TARGET_SYNPROXY config IP_NF_NAT tristate "iptables NAT support" depends on NF_CONNTRACK + depends on IP_NF_IPTABLES_LEGACY default m if NETFILTER_ADVANCED=n select NF_NAT select NETFILTER_XT_NAT - select IP_NF_IPTABLES_LEGACY help This enables the `nat' table in iptables. This allows masquerading, port forwarding and other forms of full Network Address Port @@ -263,8 +263,8 @@ endif # IP_NF_NAT # mangle + specific targets config IP_NF_MANGLE tristate "Packet mangling" - default m if NETFILTER_ADVANCED=n - select IP_NF_IPTABLES_LEGACY + default m if NETFILTER_ADVANCED=n || IP_NF_IPTABLES_LEGACY + depends on IP_NF_IPTABLES_LEGACY help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -299,7 +299,7 @@ config IP_NF_TARGET_TTL # raw + specific targets config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' - select IP_NF_IPTABLES_LEGACY + depends on IP_NF_IPTABLES_LEGACY help This option adds a `raw' table to iptables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -313,7 +313,7 @@ config IP_NF_SECURITY tristate "Security table" depends on SECURITY depends on NETFILTER_ADVANCED - select IP_NF_IPTABLES_LEGACY + depends on IP_NF_IPTABLES_LEGACY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. @@ -325,8 +325,8 @@ endif # IP_NF_IPTABLES # ARP tables config IP_NF_ARPTABLES tristate "Legacy ARPTABLES support" - depends on NETFILTER_XTABLES - default n + depends on NETFILTER_XTABLES_LEGACY + default n help arptables is a legacy packet classifier. 
This is not needed if you are using arptables over nftables @@ -342,7 +342,7 @@ config IP_NF_ARPFILTER tristate "arptables-legacy packet filtering support" select IP_NF_ARPTABLES select NETFILTER_FAMILY_ARP - depends on NETFILTER_XTABLES + depends on NETFILTER_XTABLES_LEGACY help ARP packet filtering defines a table `filter', which has a series of rules for simple ARP packet filtering at local input and diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 3d101613f27fa..23c8deff8095a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -270,7 +270,7 @@ ipt_do_table(void *priv, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 25e1e8eb18dd5..ed08fb78cfa8c 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, struct iphdr *iph; local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->ttl; if (nf_dup_ipv4_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 9082ca17e845c..7e7c49535e3f5 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -50,7 +50,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, else addr = iph->saddr; - *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) { + *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + return; + } + + *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr); } EXPORT_SYMBOL_GPL(nft_fib4_eval_type); @@ -65,8 +70,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_proto = pkt->tprot, .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; const struct net_device *oif; const struct net_device *found; @@ -90,6 +95,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; + fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif); + iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { regs->verdict.code = NFT_BREAK; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 467151517023d..29118c43ebf5f 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -541,6 +541,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); + spin_lock_init(&nh->lock); } return nh; } @@ -984,8 +985,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, break; } - if (nhi->fib_nhc.nhc_lwtstate 
&& - lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, + if (lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; @@ -1555,12 +1555,12 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, if (nh->is_group) { struct nh_group *nhg; - nhg = rtnl_dereference(nh->nh_grp); + nhg = rcu_dereference_rtnl(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { - nhi = rtnl_dereference(nh->nh_info); + nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; @@ -2118,7 +2118,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { - struct fib6_info *f6i, *tmp; + struct fib6_info *f6i; bool do_flush = false; struct fib_info *fi; @@ -2129,13 +2129,24 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) if (do_flush) fib_flush(net); - /* ip6_del_rt removes the entry from this list hence the _safe */ - list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { + spin_lock_bh(&nh->lock); + + nh->dead = true; + + while (!list_empty(&nh->f6i_list)) { + f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list); + /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); + + spin_unlock_bh(&nh->lock); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); + + spin_lock_bh(&nh->lock); } + + spin_unlock_bh(&nh->lock); } static void __remove_nexthop(struct net *net, struct nexthop *nh, @@ -3168,8 +3179,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->nh_encap_type, - extack, false); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; @@ -3874,7 +3884,7 @@ static int nh_netdev_event(struct notifier_block *this, nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: - if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) + if (!(netif_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: @@ -4040,14 +4050,11 @@ out: } EXPORT_SYMBOL(nexthop_res_grp_activity_update); -static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit nexthop_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - flush_all_nexthops(net); + ASSERT_RTNL_NET(net); + flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) @@ -4072,7 +4079,7 @@ static int __net_init nexthop_net_init(struct net *net) static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, - .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, + .exit_rtnl = nexthop_net_exit_rtnl, }; static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c14baa6589c74..031df4c19fcc5 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, - saddr, 0, 0, sk->sk_uid); + saddr, 0, 0, sk_uid(sk)); 
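A side note on the sk_uid() conversions in the hunks above (inet_connection_sock, inet_diag, inet_hashtables, ping): the patch only shows the call sites, not the helper itself. As a hedged sketch — the body below is an assumption, not taken from this diff — the accessor presumably just returns the uid that was recorded in struct sock at creation time, instead of taking sk_callback_lock and dereferencing sk->sk_socket the way sock_i_uid() does:

    /* Assumed shape of the accessor; the real definition lives in a header
     * that is not part of this hunk. It reads the uid cached in the sock
     * itself, so no sk_callback_lock round-trip through SOCK_INODE() is
     * needed on these fast paths.
     */
    static inline kuid_t sk_uid(const struct sock *sk)
    {
            return READ_ONCE(sk->sk_uid);
    }

For the call sites that previously used sock_i_uid(), this also drops the lock acquisition; the sites that already read sk->sk_uid directly simply gain a common accessor.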
fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; @@ -1116,7 +1116,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, - from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 10cbeb76c2745..65b0d0ab00840 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -189,8 +189,10 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW), SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), + SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6aace4d55733e..1d2c89d63cc71 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -610,7 +610,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), - daddr, saddr, 0, 0, sk->sk_uid); + daddr, saddr, 0, 0, sk_uid(sk)); fl4.fl4_icmp_type = 0; fl4.fl4_icmp_code = 0; @@ -1043,7 +1043,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 753704f75b2c6..f639a2ae881ac 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -189,7 +189,11 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); +#ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) +#else +#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) +#endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) @@ -409,7 +413,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); struct neighbour *n; rcu_read_lock(); @@ -436,7 +440,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); const __be32 *pkey = daddr; if (rt->rt_gw_family == AF_INET) { @@ -552,7 +556,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) inet_test_bit(HDRINCL, sk) ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), - daddr, inet->inet_saddr, 0, 0, sk->sk_uid); + daddr, inet->inet_saddr, 0, 0, + sk_uid(sk)); rcu_read_unlock(); } @@ -712,7 +717,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, */ rt = rcu_dereference(nhc->nhc_rth_input); if (rt) - rt->dst.obsolete = DST_OBSOLETE_KILL; + WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); for_each_possible_cpu(i) { struct rtable __rcu **prt; @@ -720,7 +725,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) - rt->dst.obsolete = DST_OBSOLETE_KILL; + WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); } } @@ -792,7 +797,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow jiffies + ip_rt_gc_timeout); } if (kill_route) - rt->dst.obsolete = DST_OBSOLETE_KILL; + WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); @@ -837,9 +842,9 @@ static void ipv4_negative_advice(struct sock *sk, { struct rtable *rt = dst_rtable(dst); - if ((dst->obsolete > 0) || + if ((READ_ONCE(dst->obsolete) > 0) || (rt->rt_flags & RTCF_REDIRECTED) || - rt->dst.expires) + READ_ONCE(rt->dst.expires)) sk_dst_reset(sk); } @@ -1021,14 +1026,15 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); } if (rt->rt_pmtu == mtu && !lock && - time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) + time_before(jiffies, READ_ONCE(dst->expires) - + net->ipv4.ip_rt_mtu_expires / 2)) goto out; if (fib_lookup(net, fl4, &res, 0) == 0) { @@ -1131,7 +1137,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = dst_rtable(odst); - if (odst->obsolete && !odst->ops->check(odst, 0)) { + if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; @@ -1206,7 +1212,8 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or * DST_OBSOLETE_DEAD. 
*/ - if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) + if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK || + rt_is_expired(rt)) return NULL; return dst; } @@ -1319,7 +1326,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); @@ -1566,7 +1573,7 @@ void rt_flush_dev(struct net_device *dev) static bool rt_cache_valid(const struct rtable *rt) { return rt && - rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt); } @@ -1680,8 +1687,8 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; - new_rt->dst.input = rt->dst.input; - new_rt->dst.output = rt->dst.output; + new_rt->dst.input = READ_ONCE(rt->dst.input); + new_rt->dst.output = READ_ONCE(rt->dst.output); new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); @@ -2037,8 +2044,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net, hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; @@ -2093,7 +2104,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; - hash_keys.ports.src = fl4->fl4_sport; + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } @@ -2154,7 +2168,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); - fib_select_multipath(res, h); + fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif @@ -2574,7 +2588,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res, do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - fi = NULL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, @@ -2649,7 +2662,7 @@ add: if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; - rth->dst.output = ip_mc_output; + rth->dst.output = ip_mr_output; } } #endif @@ -2699,8 +2712,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || - ipv4_is_lbcast(fl4->saddr) || - ipv4_is_zeronet(fl4->saddr)) { + ipv4_is_lbcast(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } @@ -2967,8 +2979,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if 
(rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; - if (rt->dst.lwtstate && - lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) + if (lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && @@ -2999,7 +3010,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, } } - expires = rt->dst.expires; + expires = READ_ONCE(rt->dst.expires); if (expires) { unsigned long now = jiffies; @@ -3206,7 +3217,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; @@ -3216,7 +3228,6 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 5459a78b98095..eb0819463faed 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -454,7 +454,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); + ireq->ir_loc_addr, th->source, th->dest, + sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6edc441b37023..71a956fbfc553 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -302,8 +302,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); long sysctl_tcp_mem[3] __read_mostly; EXPORT_IPV6_MOD(sysctl_tcp_mem); -atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ -EXPORT_IPV6_MOD(tcp_memory_allocated); DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); @@ -1059,6 +1057,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { + struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; @@ -1066,11 +1065,20 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; + int sockc_err = 0; int zc = 0; long timeo; flags = msg->msg_flags; + sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) }; + if (msg->msg_controllen) { + sockc_err = sock_cmsg_send(sk, msg, &sockc); + /* Don't return error until MSG_FASTOPEN has been processed; + * that may succeed even if the cmsg is invalid. 
+ */ + } + if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; @@ -1078,7 +1086,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), + !sockc_err && sockc.dmabuf_id); if (!uarg) { err = -ENOBUFS; goto out_err; @@ -1087,12 +1096,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; + + if (!sockc_err && sockc.dmabuf_id) { + binding = net_devmem_get_binding(sk, sockc.dmabuf_id); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + binding = NULL; + goto out_err; + } + } } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } + if (!sockc_err && sockc.dmabuf_id && + (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { + err = -EINVAL; + goto out_err; + } + if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { @@ -1131,13 +1155,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)}; - if (msg->msg_controllen) { - err = sock_cmsg_send(sk, msg, &sockc); - if (unlikely(err)) { - err = -EINVAL; - goto out_err; - } + if (sockc_err) { + err = sockc_err; + goto out_err; } /* This should be in poll */ @@ -1154,12 +1174,14 @@ restart: goto do_error; while (msg_data_left(msg)) { - ssize_t copy = 0; + int copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; + trace_tcp_sendmsg_locked(sk, msg, skb, size_goal); + if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; @@ -1256,7 +1278,8 @@ new_segment: goto wait_for_space; } - err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, + binding); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; @@ -1272,8 +1295,7 @@ new_segment: if (!copy) goto wait_for_space; - err = skb_splice_from_iter(skb, &msg->msg_iter, copy, - sk->sk_allocation); + err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) { if (err == -EMSGSIZE) { tcp_mark_push(tp, skb); @@ -1337,6 +1359,8 @@ out_nopush: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); + if (binding) + net_devmem_dmabuf_binding_put(binding); return copied + copied_syn; do_error: @@ -1354,6 +1378,9 @@ out_err: sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } + if (binding) + net_devmem_dmabuf_binding_put(binding); + return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); @@ -3407,6 +3434,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; + tp->syn_fastopen_child = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; @@ -3723,6 +3751,19 @@ int tcp_set_window_clamp(struct sock *sk, int val) return 0; } +int tcp_sock_set_maxseg(struct sock *sk, int val) +{ + /* Values greater than interface MTU won't take effect. 
However + * at the point when this call is done we typically don't yet + * know which interface is going to be used + */ + if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) + return -EINVAL; + + tcp_sk(sk)->rx_opt.user_mss = val; + return 0; +} + /* * Socket option code for TCP. */ @@ -3855,15 +3896,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case TCP_MAXSEG: - /* Values greater than interface MTU won't take effect. However - * at the point when this call is done we typically don't yet - * know which interface is going to be used - */ - if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { - err = -EINVAL; - break; - } - tp->rx_opt.user_mss = val; + err = tcp_sock_set_maxseg(sk, val); break; case TCP_NODELAY: @@ -4162,6 +4195,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_SYN_DATA; if (tp->tcp_usec_ts) info->tcpi_options |= TCPI_OPT_USEC_TS; + if (tp->syn_fastopen_child) + info->tcpi_options |= TCPI_OPT_TFO_CHILD; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, @@ -5020,9 +5055,8 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32); /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); @@ -5194,7 +5228,7 @@ void __init tcp_init(void) /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); - max_rshare = min(6UL*1024*1024, limit); + max_rshare = min(32UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; @@ -5210,6 +5244,6 @@ void __init tcp_init(void) tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); - tcp_tasklet_init(); + tcp_tsq_work_init(); mptcp_init(); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 1a6b1bc542451..f1884f0c9e523 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -3,6 +3,7 @@ #include <linux/tcp.h> #include <linux/rcupdate.h> #include <net/tcp.h> +#include <net/busy_poll.h> void tcp_fastopen_init_key_once(struct net *net) { @@ -279,6 +280,8 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); + sk_mark_napi_id_set(child, skb); + /* Now finish processing the fastopen child socket. 
*/ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); @@ -401,6 +404,7 @@ fastopen: } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); + tcp_sk(child)->syn_fastopen_child = 1; return child; } NET_INC_STATS(sock_net(sk), @@ -555,6 +559,7 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct net_device *dev; struct dst_entry *dst; struct sk_buff *skb; @@ -572,7 +577,8 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); - if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) + dev = dst ? dst_dev(dst) : NULL; + if (!(dev && (dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a35018e2d0ba2..71b76e98371a6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -664,10 +664,12 @@ EXPORT_IPV6_MOD(tcp_initialize_rcv_mss); */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { - u32 new_sample = tp->rcv_rtt_est.rtt_us; - long m = sample; + u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us; + long m = sample << 3; - if (new_sample != 0) { + if (old_sample == 0 || m < old_sample) { + new_sample = m; + } else { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially * with chatty applications or bulk transfer apps which @@ -678,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * else with timestamps disabled convergence takes too * long. */ - if (!win_dep) { - m -= (new_sample >> 3); - new_sample += m; - } else { - m <<= 3; - if (m < new_sample) - new_sample = m; - } - } else { - /* No previous measure. */ - new_sample = m << 3; + if (win_dep) + return; + /* Do not use this sample if receive queue is not empty. */ + if (tp->rcv_nxt != tp->copied_seq) + return; + new_sample = old_sample - (old_sample >> 3) + sample; } tp->rcv_rtt_est.rtt_us = new_sample; @@ -712,7 +709,7 @@ new_measure: tp->rcv_rtt_est.time = tp->tcp_mstamp; } -static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) +static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta) { u32 delta, delta_us; @@ -722,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) - delta = 1; + delta = min_delta; delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); return delta_us; } @@ -740,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { - s32 delta = tcp_rtt_tsopt_us(tp); + s32 delta = tcp_rtt_tsopt_us(tp, 0); - if (delta >= 0) + if (delta > 0) tcp_rcv_rtt_update(tp, delta, 0); } } +static void tcp_rcvbuf_grow(struct sock *sk) +{ + const struct net *net = sock_net(sk); + struct tcp_sock *tp = tcp_sk(sk); + int rcvwin, rcvbuf, cap; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || + (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + return; + + /* slow start: allow the sender to double its rate. 
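The reworked receiver-side RTT sampler above keeps the estimate left-shifted by three bits, adopts a smaller sample immediately, and only lets larger samples pull the estimate up by one eighth of the difference (and, per the new check, only when the receive queue is empty). The arithmetic can be exercised in isolation; this is a model of the math only, not the kernel function:

	#include <stdint.h>
	#include <stdio.h>

	/* Estimate kept in 1/8 us units, mirroring tcp_rcv_rtt_update(). */
	static uint32_t rcv_rtt_update(uint32_t est8, uint32_t sample_us)
	{
		uint32_t m = sample_us << 3;

		if (!est8 || m < est8)
			return m;			/* adopt a lower RTT at once */
		return est8 - (est8 >> 3) + sample_us;	/* est += (m - est) / 8 */
	}

	int main(void)
	{
		uint32_t est = 0;
		uint32_t samples[] = { 500, 400, 800, 450 };

		for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			est = rcv_rtt_update(est, samples[i]);
			printf("sample %u us -> estimate %u us\n",
			       (unsigned)samples[i], (unsigned)(est >> 3));
		}
		return 0;
	}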
*/ + rcvwin = tp->rcvq_space.space << 1; + + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) + rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; + + cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); + + rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap); + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + /* Make the window clamp follow along. */ + WRITE_ONCE(tp->window_clamp, + tcp_win_from_space(sk, rcvbuf)); + } +} /* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. @@ -754,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 copied; - int time; + int time, inq, copied; trace_tcp_rcv_space_adjust(sk); @@ -766,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk) /* Number of bytes copied to user in last RTT */ copied = tp->copied_seq - tp->rcvq_space.seq; + /* Number of bytes in receive queue. */ + inq = tp->rcv_nxt - tp->copied_seq; + copied -= inq; if (copied <= tp->rcvq_space.space) goto new_measure; - /* A bit of theory : - * copied = bytes received in previous RTT, our base window - * To cope with packet losses, we need a 2x factor - * To cope with slow start, and sender growing its cwin by 100 % - * every RTT, we need a 4x factor, because the ACK we are sending - * now is for the next RTT, not the current one : - * <prev RTT . ><current RTT .. ><next RTT .... > - */ - - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - u64 rcvwin, grow; - int rcvbuf; - - /* minimal window to cope with packet losses, assuming - * steady state. Add some cushion because of small variations. - */ - rcvwin = ((u64)copied << 1) + 16 * tp->advmss; - - /* Accommodate for sender rate increase (eg. slow start) */ - grow = rcvwin * (copied - tp->rcvq_space.space); - do_div(grow, tp->rcvq_space.space); - rcvwin += (grow << 1); + trace_tcp_rcvbuf_grow(sk, time); - rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - if (rcvbuf > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - - /* Make the window clamp follow along. */ - WRITE_ONCE(tp->window_clamp, - tcp_win_from_space(sk, rcvbuf)); - } - } tp->rcvq_space.space = copied; + tcp_rcvbuf_grow(sk); + new_measure: tp->rcvq_space.seq = tp->copied_seq; tp->rcvq_space.time = tp->tcp_mstamp; @@ -1456,11 +1451,6 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; - - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (tp->lost_skb_hint && - before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) - tp->lost_cnt_hint += pcount; } /* D-SACK. 
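tcp_rcvbuf_grow() condenses the old autotuning block: double what the application consumed in the last RTT so a slow-starting sender can double its rate, add the span of any out-of-order queue, convert the window to buffer space, and clamp at tcp_rmem[2]. A rough standalone model, where the 2x window-to-buffer factor is an assumption of this sketch standing in for tcp_space_from_win():

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

	/* copied_last_rtt: bytes read by the application in the last RTT. */
	static uint32_t rcvbuf_grow(uint32_t copied_last_rtt, uint32_t ooo_span,
				    uint32_t cur_rcvbuf, uint32_t rmem_max)
	{
		uint32_t rcvwin = copied_last_rtt * 2 + ooo_span;	/* allow rate to double */
		uint32_t rcvbuf = min_u32(rcvwin * 2, rmem_max);	/* window -> buffer space */

		return rcvbuf > cur_rcvbuf ? rcvbuf : cur_rcvbuf;	/* never shrink here */
	}

	int main(void)
	{
		printf("%u\n", (unsigned)rcvbuf_grow(256 * 1024, 0, 131072,
						     32 * 1024 * 1024));
		return 0;
	}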
We can detect redundant retransmission in S|R and plain R @@ -1501,9 +1491,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); - if (skb == tp->lost_skb_hint) - tp->lost_cnt_hint += pcount; - TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; @@ -1536,10 +1523,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; - if (skb == tp->lost_skb_hint) { - tp->lost_skb_hint = prev; - tp->lost_cnt_hint -= tcp_skb_pcount(prev); - } TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; @@ -2156,12 +2139,6 @@ static inline void tcp_init_undo(struct tcp_sock *tp) tp->undo_retrans = -1; } -static bool tcp_is_rack(const struct sock *sk) -{ - return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & - TCP_RACK_LOSS_DETECTION; -} - /* If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. @@ -2187,8 +2164,7 @@ static void tcp_timeout_mark_lost(struct sock *sk) skb_rbtree_walk_from(skb) { if (is_reneg) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; - else if (tcp_is_rack(sk) && skb != head && - tcp_rack_skb_timeout(tp, skb, 0) > 0) + else if (skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0) continue; /* Don't mark recently sent ones lost yet */ tcp_mark_skb_lost(sk, skb); } @@ -2269,22 +2245,6 @@ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) return false; } -/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs - * counter when SACK is enabled (without SACK, sacked_out is used for - * that purpose). - * - * With reordering, holes may still be in flight, so RFC3517 recovery - * uses pure sacked_out (total number of SACKed segments) even though - * it violates the RFC that uses duplicate ACKs, often these are equal - * but when e.g. out-of-window ACKs or packet duplication occurs, - * they differ. Since neither occurs due to loss, TCP should really - * ignore them. - */ -static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) -{ - return tp->sacked_out + 1; -} - /* Linux NewReno/SACK/ECN state machine. * -------------------------------------- * @@ -2337,13 +2297,7 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * * If the receiver supports SACK: * - * RFC6675/3517: It is the conventional algorithm. A packet is - * considered lost if the number of higher sequence packets - * SACKed is greater than or equal the DUPACK thoreshold - * (reordering). This is implemented in tcp_mark_head_lost and - * tcp_update_scoreboard. - * - * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm + * RACK (RFC8985): RACK is a newer loss detection algorithm * (2017-) that checks timing instead of counting DUPACKs. * Essentially a packet is considered lost if it's not S/ACKed * after RTT + reordering_window, where both metrics are @@ -2358,8 +2312,8 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * - * Really tricky (and requiring careful tuning) part of algorithm - * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). 
+ * The really tricky (and requiring careful tuning) part of the algorithm + * is hidden in the RACK code in tcp_recovery.c and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, * hence, slow down forward transmission. In fact, it determines the moment * when we decide that hole is caused by loss, rather than by a reorder. @@ -2382,83 +2336,10 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) * Main question: may we further continue forward transmission * with the same cwnd? */ -static bool tcp_time_to_recover(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - - /* Trick#1: The loss is proven. */ - if (tp->lost_out) - return true; - - /* Not-A-Trick#2 : Classic rule... */ - if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering) - return true; - - return false; -} - -/* Detect loss in event "A" above by marking head of queue up as lost. - * For RFC3517 SACK, a segment is considered lost if it - * has at least tp->reordering SACKed seqments above it; "packets" refers to - * the maximum SACKed segments to pass before reaching this limit. - */ -static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int cnt; - /* Use SACK to deduce losses of new sequences sent during recovery */ - const u32 loss_high = tp->snd_nxt; - - WARN_ON(packets > tp->packets_out); - skb = tp->lost_skb_hint; - if (skb) { - /* Head already handled? */ - if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) - return; - cnt = tp->lost_cnt_hint; - } else { - skb = tcp_rtx_queue_head(sk); - cnt = 0; - } - - skb_rbtree_walk_from(skb) { - /* TODO: do this better */ - /* this is not the most efficient way to do this... */ - tp->lost_skb_hint = skb; - tp->lost_cnt_hint = cnt; - - if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) - break; - - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) - cnt += tcp_skb_pcount(skb); - - if (cnt > packets) - break; - - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) - tcp_mark_skb_lost(sk, skb); - - if (mark_head) - break; - } - tcp_verify_left_out(tp); -} - -/* Account newly detected lost packet(s) */ - -static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) +static bool tcp_time_to_recover(const struct tcp_sock *tp) { - struct tcp_sock *tp = tcp_sk(sk); - - if (tcp_is_sack(tp)) { - int sacked_upto = tp->sacked_out - tp->reordering; - if (sacked_upto >= 0) - tcp_mark_head_lost(sk, sacked_upto, 0); - else if (fast_rexmit) - tcp_mark_head_lost(sk, 1, 1); - } + /* Has loss detection marked at least one packet lost? */ + return tp->lost_out != 0; } static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) @@ -2484,20 +2365,33 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { const struct sock *sk = (const struct sock *)tp; - if (tp->retrans_stamp && - tcp_tsopt_ecr_before(tp, tp->retrans_stamp)) - return true; /* got echoed TS before first retransmission */ + /* Received an echoed timestamp before the first retransmission? */ + if (tp->retrans_stamp) + return tcp_tsopt_ecr_before(tp, tp->retrans_stamp); + + /* We set tp->retrans_stamp upon the first retransmission of a loss + * recovery episode, so normally if tp->retrans_stamp is 0 then no + * retransmission has happened yet (likely due to TSQ, which can cause + * fast retransmits to be delayed). So if snd_una advanced while + * (tp->retrans_stamp is 0 then apparently a packet was merely delayed, + * not lost. 
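With the RFC3517 head-marking and dupACK heuristics removed, the recovery trigger collapses to a one-line question: has the loss detector (RACK, or the NewReno fallback) already marked something lost? Modeled below with a stand-in struct rather than struct tcp_sock:

	#include <stdbool.h>
	#include <stdint.h>

	struct loss_state {
		uint32_t lost_out;	/* packets currently marked lost */
	};

	static bool time_to_recover(const struct loss_state *tp)
	{
		return tp->lost_out != 0;	/* recover only on proven loss */
	}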
But there are exceptions where we retransmit but then + * clear tp->retrans_stamp, so we check for those exceptions. + */ - /* Check if nothing was retransmitted (retrans_stamp==0), which may - * happen in fast recovery due to TSQ. But we ignore zero retrans_stamp - * in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear - * retrans_stamp even if we had retransmitted the SYN. + /* (1) For non-SACK connections, tcp_is_non_sack_preventing_reopen() + * clears tp->retrans_stamp when snd_una == high_seq. */ - if (!tp->retrans_stamp && /* no record of a retransmit/SYN? */ - sk->sk_state != TCP_SYN_SENT) /* not the FLAG_SYN_ACKED case? */ - return true; /* nothing was retransmitted */ + if (!tcp_is_sack(tp) && !before(tp->snd_una, tp->high_seq)) + return false; - return false; + /* (2) In TCP_SYN_SENT tcp_clean_rtx_queue() clears tp->retrans_stamp + * when setting FLAG_SYN_ACKED is set, even if the SYN was + * retransmitted. + */ + if (sk->sk_state == TCP_SYN_SENT) + return false; + + return true; /* tp->retrans_stamp is zero; no retransmit yet */ } /* Undo procedures. */ @@ -2886,8 +2780,6 @@ void tcp_simple_retransmit(struct sock *sk) tcp_mark_skb_lost(sk, skb); } - tcp_clear_retrans_hints_partial(tp); - if (!tp->lost_out) return; @@ -2995,17 +2887,8 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, *rexmit = REXMIT_LOST; } -static bool tcp_force_fast_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - return after(tcp_highest_sack_seq(tp), - tp->snd_una + tp->reordering * tp->mss_cache); -} - /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, - bool *do_lost) +static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) { struct tcp_sock *tp = tcp_sk(sk); @@ -3030,9 +2913,6 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una, tcp_undo_cwnd_reduction(sk, true); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); - } else { - /* Partial ACK arrived. Force fast retransmit. */ - *do_lost = tcp_force_fast_retransmit(sk); } return false; } @@ -3046,7 +2926,7 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) if (unlikely(tcp_is_reno(tp))) { tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); - } else if (tcp_is_rack(sk)) { + } else { u32 prior_retrans = tp->retrans_out; if (tcp_rack_mark_lost(sk)) @@ -3073,10 +2953,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int fast_rexmit = 0, flag = *ack_flag; + int flag = *ack_flag; bool ece_ack = flag & FLAG_ECE; - bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && - tcp_force_fast_retransmit(sk)); if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; @@ -3125,7 +3003,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) tcp_add_reno_sack(sk, num_dupack, ece_ack); - } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost)) + } else if (tcp_try_undo_partial(sk, prior_snd_una)) return; if (tcp_try_undo_dsack(sk)) @@ -3133,7 +3011,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tcp_identify_packet_loss(sk, ack_flag); if (icsk->icsk_ca_state != TCP_CA_Recovery) { - if (!tcp_time_to_recover(sk, flag)) + if (!tcp_time_to_recover(tp)) return; /* Undo reverts the recovery state. 
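The restructured tcp_packet_delayed() reads as a short decision list; the sketch below captures that structure with plain booleans (the parameter names are descriptive, not kernel fields):

	#include <stdbool.h>

	/* "true" means the newly ACKed data was merely delayed, not lost, so a
	 * spurious-recovery undo is permitted.
	 */
	static bool packet_delayed(bool have_retrans_stamp,
				   bool tsecr_before_retrans_stamp,
				   bool is_sack, bool snd_una_reached_high_seq,
				   bool in_syn_sent)
	{
		if (have_retrans_stamp)
			return tsecr_before_retrans_stamp;	/* echoed TS predates 1st rtx */

		/* retrans_stamp == 0 usually means nothing was retransmitted yet,
		 * but two paths clear the stamp after a retransmit; rule them out.
		 */
		if (!is_sack && snd_una_reached_high_seq)
			return false;
		if (in_syn_sent)
			return false;
		return true;
	}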
If loss is evident, * starts a new recovery (e.g. reordering then loss); @@ -3162,7 +3040,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tcp_try_undo_dsack(sk); tcp_identify_packet_loss(sk, ack_flag); - if (!tcp_time_to_recover(sk, flag)) { + if (!tcp_time_to_recover(tp)) { tcp_try_to_open(sk, flag); return; } @@ -3180,11 +3058,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* Otherwise enter Recovery state */ tcp_enter_recovery(sk, ece_ack); - fast_rexmit = 1; } - if (!tcp_is_rack(sk) && do_lost) - tcp_update_scoreboard(sk, fast_rexmit); *rexmit = REXMIT_LOST; } @@ -3226,7 +3101,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, */ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp); + seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1); rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) @@ -3440,8 +3315,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; - if (unlikely(skb == tp->lost_skb_hint)) - tp->lost_skb_hint = NULL; tcp_highest_sack_replace(sk, skb, next); tcp_rtx_queue_unlink_and_free(skb, sk); } @@ -3499,14 +3372,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (flag & FLAG_RETRANS_DATA_ACKED) flag &= ~FLAG_ORIG_SACK_ACKED; } else { - int delta; - /* Non-retransmitted hole got filled? That's reordering */ if (before(reord, prior_fack)) tcp_check_sack_reordering(sk, reord, 0); - - delta = prior_sacked - tp->sacked_out; - tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, @@ -3846,7 +3714,7 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) } /* This routine deals with acks during a TLP episode and ends an episode by - * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985 */ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) { @@ -4523,14 +4391,22 @@ static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk, * (borrowed from freebsd) */ -static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, +static enum skb_drop_reason tcp_sequence(const struct sock *sk, u32 seq, u32 end_seq) { + const struct tcp_sock *tp = tcp_sk(sk); + if (before(end_seq, tp->rcv_wup)) return SKB_DROP_REASON_TCP_OLD_SEQUENCE; - if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) - return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) { + if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) + return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + + /* Only accept this packet if receive queue is empty. 
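The extended tcp_sequence() check above distinguishes a segment that starts beyond the advertised window from one that merely ends beyond it, and accepts the latter only when the receive queue is empty. A self-contained model using the usual wrap-safe before()/after() comparisons:

	#include <stdbool.h>
	#include <stdint.h>

	static bool before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
	static bool after(uint32_t a, uint32_t b)  { return before(b, a); }

	enum verdict { ACCEPT, OLD_SEQUENCE, INVALID_SEQUENCE, INVALID_END_SEQUENCE };

	static enum verdict check_sequence(uint32_t seq, uint32_t end_seq,
					   uint32_t rcv_wup, uint32_t rcv_nxt,
					   uint32_t window, bool rcvq_empty)
	{
		if (before(end_seq, rcv_wup))
			return OLD_SEQUENCE;		/* entirely old data */
		if (after(end_seq, rcv_nxt + window)) {
			if (after(seq, rcv_nxt + window))
				return INVALID_SEQUENCE;	/* starts past the window */
			if (!rcvq_empty)
				return INVALID_END_SEQUENCE;	/* overshoots, queue busy */
		}
		return ACCEPT;
	}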
*/ + if (skb_queue_len(&sk->sk_receive_queue)) + return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE; + } return SKB_NOT_DROPPED_YET; } @@ -4977,8 +4853,9 @@ static void tcp_ofo_queue(struct sock *sk) if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { __u32 dsack = dsack_high; + if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) - dsack_high = TCP_SKB_CB(skb)->end_seq; + dsack = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } p = rb_next(p); @@ -5011,10 +4888,20 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); -static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, +/* Check if this incoming skb can be added to socket receive queues + * while satisfying sk->sk_rcvbuf limit. + */ +static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) +{ + unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize; + + return new_mem <= sk->sk_rcvbuf; +} + +static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, unsigned int size) { - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + if (!tcp_can_ingest(sk, skb) || !sk_rmem_schedule(sk, skb, size)) { if (tcp_prune_queue(sk, skb) < 0) @@ -5046,6 +4933,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) return; } + tcp_measure_rcv_mss(sk, skb); /* Disable header prediction. */ tp->pred_flags = 0; inet_csk_schedule_ack(sk); @@ -5173,6 +5061,9 @@ end: skb_condense(skb); skb_set_owner_r(skb, sk); } + /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ + if (sk->sk_socket) + tcp_rcvbuf_grow(sk); } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, @@ -5626,7 +5517,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); tp->ooo_last_skb = rb_to_skb(prev); if (!prev || goal <= 0) { - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + if (tcp_can_ingest(sk, in_skb) && !tcp_under_memory_pressure(sk)) break; goal = sk->sk_rcvbuf >> 3; @@ -5658,14 +5549,18 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); + /* Do nothing if our queues are empty. */ + if (!atomic_read(&sk->sk_rmem_alloc)) + return -1; + NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + if (!tcp_can_ingest(sk, in_skb)) tcp_clamp_window(sk); else if (tcp_under_memory_pressure(sk)) tcp_adjust_rcv_ssthresh(sk); - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + if (tcp_can_ingest(sk, in_skb)) return 0; tcp_collapse_ofo_queue(sk); @@ -5675,7 +5570,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) NULL, tp->copied_seq, tp->rcv_nxt); - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + if (tcp_can_ingest(sk, in_skb)) return 0; /* Collapsing did not help, destructive actions follow. 
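tcp_can_ingest() centralizes the recurring "does this skb still fit under sk_rcvbuf" comparison used by the queueing and pruning paths; its core is a one-line predicate:

	#include <stdbool.h>

	static bool can_ingest(unsigned int rmem_alloc, unsigned int skb_truesize,
			       unsigned int rcvbuf)
	{
		/* Would charging this skb keep the socket within its limit? */
		return rmem_alloc + skb_truesize <= rcvbuf;
	}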
@@ -5683,7 +5578,7 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) tcp_prune_ofo_queue(sk, in_skb); - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + if (tcp_can_ingest(sk, in_skb)) return 0; /* If we are really being abused, tell the caller to silently @@ -6009,7 +5904,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, step1: /* Step 1: check sequence number */ - reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." @@ -6020,6 +5915,11 @@ step1: if (!th->rst) { if (th->syn) goto syn_challenge; + + if (reason == SKB_DROP_REASON_TCP_INVALID_SEQUENCE || + reason == SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_BEYOND_WINDOW); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time)) @@ -6238,6 +6138,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (tcp_checksum_complete(skb)) goto csum_error; + if (after(TCP_SKB_CB(skb)->end_seq, + tp->rcv_nxt + tcp_receive_window(tp))) + goto validate; + if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; @@ -6293,7 +6197,7 @@ slow_path: /* * Standard slow path. */ - +validate: if (!tcp_validate_incoming(sk, skb, th, 1)) return; @@ -6873,6 +6777,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + if (req) { tcp_rcv_synrecv_state_fastopen(sk); } else { @@ -6898,9 +6805,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - if (tp->rx_opt.tstamp_ok) - tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8cce0d5489dae..84d3d556ed806 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -58,7 +58,9 @@ #include <linux/times.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sock_diag.h> +#include <net/aligned_data.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> @@ -787,7 +789,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) @@ -1703,7 +1705,6 @@ static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, @@ -2025,6 +2026,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, u32 gso_size; u64 limit; int delta; + int err; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. 
@@ -2135,21 +2137,27 @@ no_coalesce: limit = min_t(u64, limit, UINT_MAX); - if (unlikely(sk_add_backlog(sk, skb, limit))) { + err = sk_add_backlog(sk, skb, limit); + if (unlikely(err)) { bh_unlock_sock(sk); - *reason = SKB_DROP_REASON_SOCKET_BACKLOG; - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); + if (err == -ENOMEM) { + *reason = SKB_DROP_REASON_PFMEMALLOC; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); + } else { + *reason = SKB_DROP_REASON_SOCKET_BACKLOG; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); + } return true; } return false; } EXPORT_IPV6_MOD(tcp_add_backlog); -int tcp_filter(struct sock *sk, struct sk_buff *skb) +int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { struct tcphdr *th = (struct tcphdr *)skb->data; - return sk_filter_trim_cap(sk, skb, th->doff * 4); + return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); } EXPORT_IPV6_MOD(tcp_filter); @@ -2276,14 +2284,12 @@ lookup: } refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) { + if (!tcp_filter(sk, skb, &drop_reason)) { th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &drop_reason); - } else { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); @@ -2339,10 +2345,9 @@ process: nf_reset_ct(skb); - if (tcp_filter(sk, skb)) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (tcp_filter(sk, skb, &drop_reason)) goto discard_and_relse; - } + th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); @@ -2417,7 +2422,8 @@ do_time_wait: goto csum_error; } - tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, @@ -2895,7 +2901,7 @@ static void get_openreq4(const struct request_sock *req, jiffies_delta_to_clock_t(delta), req->num_timeout, from_kuid_munged(seq_user_ns(f), - sock_i_uid(req->rsk_listener)), + sk_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, @@ -2953,7 +2959,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(f), sk_uid(sk)), icsk->icsk_probes_out, sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, @@ -3013,13 +3019,17 @@ out: } #ifdef CONFIG_BPF_SYSCALL +union bpf_tcp_iter_batch_item { + struct sock *sk; + __u64 cookie; +}; + struct bpf_tcp_iter_state { struct tcp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; - struct sock **batch; - bool st_bucket_done; + union bpf_tcp_iter_batch_item *batch; }; struct bpf_iter__tcp { @@ -3042,21 +3052,32 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) { - while (iter->cur_sk < iter->end_sk) - sock_gen_put(iter->batch[iter->cur_sk++]); + union bpf_tcp_iter_batch_item *item; + unsigned int cur_sk = iter->cur_sk; + __u64 cookie; + + /* Remember the cookies of the sockets we haven't seen yet, so we can + * pick up where we left off next time around. 
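The iterator batch entries are now a union of a held socket pointer and that socket's cookie: when a batch is released, each unvisited pointer is replaced by its cookie so the next read can resume at the first socket it has not yet shown. A toy model of that conversion, with struct item standing in for struct sock and a plain refcount standing in for sock_gen_put():

	#include <stdint.h>

	struct item {
		uint64_t cookie;	/* stand-in for sock_gen_cookie() */
		int refs;		/* stand-in for the sock refcount */
	};

	union batch_slot {
		struct item *ptr;
		uint64_t cookie;
	};

	/* Convert every unvisited slot from a held pointer to a bare cookie. */
	static void batch_to_cookies(union batch_slot *batch, unsigned int cur,
				     unsigned int end)
	{
		while (cur < end) {
			struct item *it = batch[cur].ptr;

			it->refs--;			/* release the reference */
			batch[cur].cookie = it->cookie;	/* keep only the cookie */
			cur++;
		}
	}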
+ */ + while (cur_sk < iter->end_sk) { + item = &iter->batch[cur_sk++]; + cookie = sock_gen_cookie(item->sk); + sock_gen_put(item->sk); + item->cookie = cookie; + } } static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, - unsigned int new_batch_sz) + unsigned int new_batch_sz, gfp_t flags) { - struct sock **new_batch; + union bpf_tcp_iter_batch_item *new_batch; new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, - GFP_USER | __GFP_NOWARN); + flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; - bpf_iter_tcp_put_batch(iter); + memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; @@ -3064,112 +3085,234 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, return 0; } -static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, - struct sock *start_sk) +static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk, + union bpf_tcp_iter_batch_item *cookies, + int n_cookies) +{ + struct hlist_nulls_node *node; + struct sock *sk; + int i; + + for (i = 0; i < n_cookies; i++) { + sk = first_sk; + sk_nulls_for_each_from(sk, node) + if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) + return sk; + } + + return NULL; +} + +static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq) +{ + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + unsigned int find_cookie = iter->cur_sk; + unsigned int end_cookie = iter->end_sk; + int resume_bucket = st->bucket; + struct sock *sk; + + if (end_cookie && find_cookie == end_cookie) + ++st->bucket; + + sk = listening_get_first(seq); + iter->cur_sk = 0; + iter->end_sk = 0; + + if (sk && st->bucket == resume_bucket && end_cookie) { + sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], + end_cookie - find_cookie); + if (!sk) { + spin_unlock(&hinfo->lhash2[st->bucket].lock); + ++st->bucket; + sk = listening_get_first(seq); + } + } + + return sk; +} + +static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; + unsigned int find_cookie = iter->cur_sk; + unsigned int end_cookie = iter->end_sk; + int resume_bucket = st->bucket; + struct sock *sk; + + if (end_cookie && find_cookie == end_cookie) + ++st->bucket; + + sk = established_get_first(seq); + iter->cur_sk = 0; + iter->end_sk = 0; + + if (sk && st->bucket == resume_bucket && end_cookie) { + sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie], + end_cookie - find_cookie); + if (!sk) { + spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); + ++st->bucket; + sk = established_get_first(seq); + } + } + + return sk; +} + +static struct sock *bpf_iter_tcp_resume(struct seq_file *seq) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct sock *sk = NULL; + + switch (st->state) { + case TCP_SEQ_STATE_LISTENING: + sk = bpf_iter_tcp_resume_listening(seq); + if (sk) + break; + st->bucket = 0; + st->state = TCP_SEQ_STATE_ESTABLISHED; + fallthrough; + case TCP_SEQ_STATE_ESTABLISHED: + sk = bpf_iter_tcp_resume_established(seq); + break; + } + + return sk; +} + +static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, + struct sock **start_sk) +{ + struct bpf_tcp_iter_state 
*iter = seq->private; struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; - sock_hold(start_sk); - iter->batch[iter->end_sk++] = start_sk; + sock_hold(*start_sk); + iter->batch[iter->end_sk++].sk = *start_sk; - sk = sk_nulls_next(start_sk); + sk = sk_nulls_next(*start_sk); + *start_sk = NULL; sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); - iter->batch[iter->end_sk++] = sk; + iter->batch[iter->end_sk++].sk = sk; + } else if (!*start_sk) { + /* Remember where we left off. */ + *start_sk = sk; } expected++; } } - spin_unlock(&hinfo->lhash2[st->bucket].lock); return expected; } static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, - struct sock *start_sk) + struct sock **start_sk) { - struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; - struct tcp_iter_state *st = &iter->state; struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; - sock_hold(start_sk); - iter->batch[iter->end_sk++] = start_sk; + sock_hold(*start_sk); + iter->batch[iter->end_sk++].sk = *start_sk; - sk = sk_nulls_next(start_sk); + sk = sk_nulls_next(*start_sk); + *start_sk = NULL; sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); - iter->batch[iter->end_sk++] = sk; + iter->batch[iter->end_sk++].sk = sk; + } else if (!*start_sk) { + /* Remember where we left off. */ + *start_sk = sk; } expected++; } } - spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); return expected; } -static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) +static unsigned int bpf_iter_fill_batch(struct seq_file *seq, + struct sock **start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + + if (st->state == TCP_SEQ_STATE_LISTENING) + return bpf_iter_tcp_listening_batch(seq, start_sk); + else + return bpf_iter_tcp_established_batch(seq, start_sk); +} + +static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; + + if (st->state == TCP_SEQ_STATE_LISTENING) + spin_unlock(&hinfo->lhash2[st->bucket].lock); + else + spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); +} + +static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) +{ + struct bpf_tcp_iter_state *iter = seq->private; unsigned int expected; - bool resized = false; struct sock *sk; + int err; - /* The st->bucket is done. Directly advance to the next - * bucket instead of having the tcp_seek_last_pos() to skip - * one by one in the current bucket and eventually find out - * it has to advance to the next bucket. - */ - if (iter->st_bucket_done) { - st->offset = 0; - st->bucket++; - if (st->state == TCP_SEQ_STATE_LISTENING && - st->bucket > hinfo->lhash2_mask) { - st->state = TCP_SEQ_STATE_ESTABLISHED; - st->bucket = 0; - } - } + sk = bpf_iter_tcp_resume(seq); + if (!sk) + return NULL; /* Done */ -again: - /* Get a new batch */ - iter->cur_sk = 0; - iter->end_sk = 0; - iter->st_bucket_done = false; + expected = bpf_iter_fill_batch(seq, &sk); + if (likely(iter->end_sk == expected)) + goto done; - sk = tcp_seek_last_pos(seq); + /* Batch size was too small. 
*/ + bpf_iter_tcp_unlock_bucket(seq); + bpf_iter_tcp_put_batch(iter); + err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2, + GFP_USER); + if (err) + return ERR_PTR(err); + + sk = bpf_iter_tcp_resume(seq); if (!sk) return NULL; /* Done */ - if (st->state == TCP_SEQ_STATE_LISTENING) - expected = bpf_iter_tcp_listening_batch(seq, sk); - else - expected = bpf_iter_tcp_established_batch(seq, sk); - - if (iter->end_sk == expected) { - iter->st_bucket_done = true; - return sk; - } + expected = bpf_iter_fill_batch(seq, &sk); + if (likely(iter->end_sk == expected)) + goto done; - if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { - resized = true; - goto again; + /* Batch size was still too small. Hold onto the lock while we try + * again with a larger batch to make sure the current bucket's size + * does not change in the meantime. + */ + err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT); + if (err) { + bpf_iter_tcp_unlock_bucket(seq); + return ERR_PTR(err); } - return sk; + expected = bpf_iter_fill_batch(seq, &sk); + WARN_ON_ONCE(iter->end_sk != expected); +done: + bpf_iter_tcp_unlock_bucket(seq); + return iter->batch[0].sk; } static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) @@ -3199,16 +3342,11 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) * meta.seq_num is used instead. */ st->num++; - /* Move st->offset to the next sk in the bucket such that - * the future start() will resume at st->offset in - * st->bucket. See tcp_seek_last_pos(). - */ - st->offset++; - sock_gen_put(iter->batch[iter->cur_sk++]); + sock_gen_put(iter->batch[iter->cur_sk++].sk); } if (iter->cur_sk < iter->end_sk) - sk = iter->batch[iter->cur_sk]; + sk = iter->batch[iter->cur_sk].sk; else sk = bpf_iter_tcp_batch(seq); @@ -3245,9 +3383,9 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) const struct request_sock *req = v; uid = from_kuid_munged(seq_user_ns(seq), - sock_i_uid(req->rsk_listener)); + sk_uid(req->rsk_listener)); } else { - uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); } meta.seq = seq; @@ -3274,10 +3412,8 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) (void)tcp_prog_seq_show(prog, &meta, v, 0); } - if (iter->cur_sk < iter->end_sk) { + if (iter->cur_sk < iter->end_sk) bpf_iter_tcp_put_batch(iter); - iter->st_bucket_done = false; - } } static const struct seq_operations bpf_iter_tcp_seq_ops = { @@ -3390,7 +3526,7 @@ struct proto tcp_prot = { .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, - .memory_allocated = &tcp_memory_allocated, + .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, @@ -3494,8 +3630,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; - /* Default TSQ limit of 16 TSO segments */ - net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; + /* Default TSQ limit of 4 MB */ + net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
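The batch is grown in two phases: once with a sleeping allocation and 50% headroom after dropping the bucket lock, and, if the bucket still does not fit, once more with a non-sleeping allocation while the lock stays held so the bucket cannot change underneath the iterator. A toy model of that flow, with realloc() standing in for kvmalloc(GFP_USER) and kvmalloc(GFP_NOWAIT):

	#include <errno.h>
	#include <stddef.h>
	#include <stdlib.h>

	struct batch { void **slots; size_t cap; };

	static int batch_resize(struct batch *b, size_t want)
	{
		void **slots = realloc(b->slots, want * sizeof(*slots));

		if (!slots)
			return -ENOMEM;
		b->slots = slots;
		b->cap = want;
		return 0;
	}

	/* need1: bucket size seen on the first scan; need2: size seen after
	 * re-taking the lock (the bucket may have grown in between).
	 */
	static int size_batch_for_bucket(struct batch *b, size_t need1, size_t need2)
	{
		int err;

		if (need1 <= b->cap)
			return 0;			/* already fits */

		/* Pass 1: lock dropped, sleeping allocation with headroom. */
		err = batch_resize(b, need1 * 3 / 2);
		if (err)
			return err;
		if (need2 <= b->cap)
			return 0;

		/* Pass 2: lock held, non-sleeping, exact size. */
		return batch_resize(b, need2);
	}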
*/ net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; @@ -3595,7 +3731,7 @@ static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) if (err) return err; - err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); + err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); if (err) { bpf_iter_fini_seq_net(priv_data); return err; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4251670e328c8..03c068ea27b6a 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -166,11 +166,11 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, unsigned int hash) { struct tcp_metrics_block *tm; - struct net *net; bool reclaim = false; + struct net *net; spin_lock_bh(&tcp_metrics_lock); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index fb9349be36b80..2994c9222c9cb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -97,7 +97,8 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th, u32 *tw_isn) + const struct tcphdr *th, u32 *tw_isn, + enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); @@ -245,8 +246,10 @@ kill: return TCP_TW_SYN; } - if (paws_reject) - __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + if (paws_reject) { + *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); + } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. @@ -723,7 +726,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && - !inet_rtx_syn_ack(sk, req)) { + !tcp_rtx_synack(sk, req)) { unsigned long expires = jiffies; expires += reqsk_timeout(req, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index d293087b426df..be5c2294610e5 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -359,6 +359,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, flush |= skb->ip_summed != p->ip_summed; flush |= skb->csum_level != p->csum_level; flush |= NAPI_GRO_CB(p)->count >= 64; + skb_set_network_header(skb, skb_gro_receive_network_offset(skb)); if (flush || skb_gro_receive_list(p, skb)) mss = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 13295a59d22e6..caf11920a8786 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,15 +1066,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb * needs to be reallocated in a driver. 
* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * - * Since transmit from skb destructor is forbidden, we use a tasklet + * Since transmit from skb destructor is forbidden, we use a BH work item * to process all sockets that eventually need to send more skbs. - * We use one tasklet per cpu, with its own queue of sockets. + * We use one work item per cpu, with its own queue of sockets. */ -struct tsq_tasklet { - struct tasklet_struct tasklet; +struct tsq_work { + struct work_struct work; struct list_head head; /* queue of tcp sockets */ }; -static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); +static DEFINE_PER_CPU(struct tsq_work, tsq_work); static void tcp_tsq_write(struct sock *sk) { @@ -1104,14 +1104,14 @@ static void tcp_tsq_handler(struct sock *sk) bh_unlock_sock(sk); } /* - * One tasklet per cpu tries to send more skbs. - * We run in tasklet context but need to disable irqs when + * One work item per cpu tries to send more skbs. + * We run in BH context but need to disable irqs when * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ -static void tcp_tasklet_func(struct tasklet_struct *t) +static void tcp_tsq_workfn(struct work_struct *work) { - struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); + struct tsq_work *tsq = container_of(work, struct tsq_work, work); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; @@ -1181,15 +1181,15 @@ void tcp_release_cb(struct sock *sk) } EXPORT_IPV6_MOD(tcp_release_cb); -void __init tcp_tasklet_init(void) +void __init tcp_tsq_work_init(void) { int i; for_each_possible_cpu(i) { - struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + struct tsq_work *tsq = &per_cpu(tsq_work, i); INIT_LIST_HEAD(&tsq->head); - tasklet_setup(&tsq->tasklet, tcp_tasklet_func); + INIT_WORK(&tsq->work, tcp_tsq_workfn); } } @@ -1203,11 +1203,11 @@ void tcp_wfree(struct sk_buff *skb) struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nval, oval; - struct tsq_tasklet *tsq; + struct tsq_work *tsq; bool empty; /* Keep one reference on sk_wmem_alloc. 
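The shape of the tasklet-to-BH-workqueue conversion is one work item per CPU, each owning a list of pending sockets and scheduled on system_bh_wq. A kernel-style sketch of that pattern under illustrative names (deferred_tx and its helpers are not kernel symbols; the APIs used are):

	#include <linux/cpumask.h>
	#include <linux/init.h>
	#include <linux/list.h>
	#include <linux/percpu.h>
	#include <linux/workqueue.h>

	struct deferred_tx {
		struct work_struct work;
		struct list_head head;		/* queue of pending entries */
	};

	static DEFINE_PER_CPU(struct deferred_tx, deferred_tx);

	static void deferred_tx_workfn(struct work_struct *work)
	{
		struct deferred_tx *dtx = container_of(work, struct deferred_tx, work);

		/* splice dtx->head under local_irq_save() and process entries */
	}

	static int __init deferred_tx_init(void)
	{
		int cpu;

		for_each_possible_cpu(cpu) {
			struct deferred_tx *dtx = &per_cpu(deferred_tx, cpu);

			INIT_LIST_HEAD(&dtx->head);
			INIT_WORK(&dtx->work, deferred_tx_workfn);
		}
		return 0;
	}

	/* Producers then queue the current CPU's item from BH context:
	 *	queue_work(system_bh_wq, &this_cpu_ptr(&deferred_tx)->work);
	 */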
- * Will be released by sk_free() from here or tcp_tasklet_func() + * Will be released by sk_free() from here or tcp_tsq_workfn() */ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc)); @@ -1229,13 +1229,13 @@ void tcp_wfree(struct sk_buff *skb) nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); - /* queue this socket to tasklet queue */ + /* queue this socket to BH workqueue */ local_irq_save(flags); - tsq = this_cpu_ptr(&tsq_tasklet); + tsq = this_cpu_ptr(&tsq_work); empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); if (empty) - tasklet_schedule(&tsq->tasklet); + queue_work(system_bh_wq, &tsq->work); local_irq_restore(flags); return; out: @@ -1554,11 +1554,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - if (tp->lost_skb_hint && - before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - tp->lost_cnt_hint -= decr; - tcp_verify_left_out(tp); } @@ -2619,9 +2614,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit = max_t(unsigned long, 2 * skb->truesize, READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift)); - if (sk->sk_pacing_status == SK_PACING_NONE) - limit = min_t(unsigned long, limit, - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); + limit = min_t(unsigned long, limit, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && @@ -2640,7 +2634,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty or has one skb. * No need to wait for TX completion to call us back, - * after softirq/tasklet schedule. + * after softirq schedule. * This helps when TX completions are delayed too much. */ if (tcp_rtx_queue_empty_or_single_skb(sk)) @@ -3253,7 +3247,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ - tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; @@ -3337,8 +3330,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; - if (skb_still_in_host_queue(sk, skb)) - return -EBUSY; + if (skb_still_in_host_queue(sk, skb)) { + err = -EBUSY; + goto out; + } start: if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { @@ -3349,14 +3344,19 @@ start: } if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { WARN_ON_ONCE(1); - return -EINVAL; + err = -EINVAL; + goto out; + } + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) { + err = -ENOMEM; + goto out; } - if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) - return -ENOMEM; } - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) - return -EHOSTUNREACH; /* Routing failure or similar. */ + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) { + err = -EHOSTUNREACH; /* Routing failure or similar. */ + goto out; + } cur_mss = tcp_current_mss(sk); avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; @@ -3367,8 +3367,10 @@ start: * our retransmit of one segment serves as a zero window probe. 
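__tcp_retransmit_skb() now funnels every early failure through a single out: label so the retransmit tracepoint also sees the error code. The generic shape of that conversion (trace_retransmit() is a local stub, not a kernel tracepoint):

	#include <errno.h>

	static void trace_retransmit(int err) { (void)err; }	/* stub tracepoint */

	static int retransmit_one(int busy, int stale, int frag_failed)
	{
		int err = 0;

		if (busy) {
			err = -EBUSY;
			goto out;
		}
		if (stale) {
			err = -EINVAL;
			goto out;
		}
		if (frag_failed) {
			err = -ENOMEM;
			goto out;
		}
		/* ... actually transmit ... */
	out:
		trace_retransmit(err);	/* fires for success and every error path */
		return err;
	}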
*/ if (avail_wnd <= 0) { - if (TCP_SKB_CB(skb)->seq != tp->snd_una) - return -EAGAIN; + if (TCP_SKB_CB(skb)->seq != tp->snd_una) { + err = -EAGAIN; + goto out; + } avail_wnd = cur_mss; } @@ -3380,11 +3382,15 @@ start: } if (skb->len > len) { if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, - cur_mss, GFP_ATOMIC)) - return -ENOMEM; /* We'll try again later. */ + cur_mss, GFP_ATOMIC)) { + err = -ENOMEM; /* We'll try again later. */ + goto out; + } } else { - if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) - return -ENOMEM; + if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) { + err = -ENOMEM; + goto out; + } diff = tcp_skb_pcount(skb); tcp_set_skb_tso_segs(skb, cur_mss); @@ -3438,17 +3444,16 @@ start: tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); - if (likely(!err)) { - trace_tcp_retransmit_skb(sk, skb); - } else if (err != -EBUSY) { + if (unlikely(err) && err != -EBUSY) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); - } /* To avoid taking spuriously low RTT samples based on a timestamp * for a transmit that never happened, always mark EVER_RETRANS */ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; +out: + trace_tcp_retransmit_skb(sk, skb, err); return err; } @@ -4432,6 +4437,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); + req->num_retrans++; } return res; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index bba10110fbbc1..c52fd3254b6e0 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -35,7 +35,7 @@ s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb)); } -/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): +/* RACK loss detection (IETF RFC8985): * * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e4c616bbd7271..a207877270fbd 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -359,7 +359,7 @@ void tcp_delack_timer_handler(struct sock *sk) static void tcp_delack_timer(struct timer_list *t) { struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_delack_timer); + timer_container_of(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; /* Avoid taking socket spinlock if there is no ACK to send. @@ -478,7 +478,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * regular retransmit because if the child socket has been accepted * it's not good to give up too easily. */ - inet_rtx_syn_ack(sk, req); + tcp_rtx_synack(sk, req); req->num_timeout++; tcp_update_rto_stats(sk); if (!tp->retrans_stamp) @@ -726,7 +726,7 @@ void tcp_write_timer_handler(struct sock *sk) static void tcp_write_timer(struct timer_list *t) { struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_retransmit_timer); + timer_container_of(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; /* Avoid locking the socket when there is no pending event. 
*/ @@ -778,7 +778,7 @@ EXPORT_IPV6_MOD_GPL(tcp_set_keepalive); static void tcp_keepalive_timer(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2742cc7602bb5..cc3ce0f762ec2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -93,6 +93,7 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/slab.h> +#include <linux/sock_diag.h> #include <net/tcp_states.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> @@ -119,14 +120,13 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif +#include <net/rps.h> struct udp_table udp_table __read_mostly; long sysctl_udp_mem[3] __read_mostly; EXPORT_IPV6_MOD(sysctl_udp_mem); -atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; -EXPORT_IPV6_MOD(udp_memory_allocated); DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); @@ -143,8 +143,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned long *bitmap, struct sock *sk, unsigned int log) { + kuid_t uid = sk_uid(sk); struct sock *sk2; - kuid_t uid = sock_i_uid(sk); sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && @@ -156,7 +156,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(uid, sock_i_uid(sk2))) { + uid_eq(uid, sk_uid(sk2))) { if (!bitmap) return 0; } else { @@ -178,8 +178,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk) { + kuid_t uid = sk_uid(sk); struct sock *sk2; - kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); @@ -193,7 +193,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(uid, sock_i_uid(sk2))) { + uid_eq(uid, sk_uid(sk2))) { res = 0; } else { res = 1; @@ -208,7 +208,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) { struct net *net = sock_net(sk); - kuid_t uid = sock_i_uid(sk); + kuid_t uid = sk_uid(sk); struct sock *sk2; sk_for_each(sk2, &hslot->head) { @@ -218,7 +218,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) ipv6_only_sock(sk2) == ipv6_only_sock(sk) && (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); @@ -1443,7 +1443,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, - dport, inet->inet_sport, sk->sk_uid); + dport, inet->inet_sport, + sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); @@ -1942,8 +1943,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; do { spin_lock_bh(&queue->lock); - skb = __skb_try_recv_from_queue(sk, queue, 
flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); @@ -1964,8 +1965,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); @@ -2199,6 +2200,7 @@ void udp_lib_unhash(struct sock *sk) struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; + sock_rps_delete_flow(sk); hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); @@ -2345,7 +2347,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { - int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; + enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -2434,10 +2436,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop; - } udp_csum_pull_header(skb); @@ -2897,20 +2897,40 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); + udp_tunnel_cleanup_gro(sk); + } } } +typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM + udp_gro_receive_t new_gro_receive; + if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { - if (family == AF_INET) - WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv); - else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) - WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv); + if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) + new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv; + else + new_gro_receive = xfrm4_gro_udp_encap_rcv; + + if (udp_sk(sk)->gro_receive != new_gro_receive) { + /* + * With IPV6_ADDRFORM the gro callback could change + * after being set, unregister the old one, if valid. 
+ */ + if (udp_sk(sk)->gro_receive) + udp_tunnel_update_gro_rcv(sk, false); + + WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive); + udp_tunnel_update_gro_rcv(sk, true); + } } #endif } @@ -2960,6 +2980,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_ENCAP: + sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM @@ -2983,6 +3004,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, err = -ENOPROTOOPT; break; } + sockopt_release_sock(sk); break; case UDP_NO_CHECK6_TX: @@ -3000,13 +3022,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_GRO: - + sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); + sockopt_release_sock(sk); break; /* @@ -3208,7 +3231,7 @@ struct proto udp_prot = { #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, @@ -3360,7 +3383,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, sk_wmem_alloc_get(sp), udp_rqueue_get(sp), 0, 0L, 0, - from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); @@ -3390,34 +3413,55 @@ struct bpf_iter__udp { int bucket __aligned(8); }; +union bpf_udp_iter_batch_item { + struct sock *sk; + __u64 cookie; +}; + struct bpf_udp_iter_state { struct udp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; - int offset; - struct sock **batch; - bool st_bucket_done; + union bpf_udp_iter_batch_item *batch; }; static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, - unsigned int new_batch_sz); + unsigned int new_batch_sz, gfp_t flags); +static struct sock *bpf_iter_udp_resume(struct sock *first_sk, + union bpf_udp_iter_batch_item *cookies, + int n_cookies) +{ + struct sock *sk = NULL; + int i; + + for (i = 0; i < n_cookies; i++) { + sk = first_sk; + udp_portaddr_for_each_entry_from(sk) + if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) + goto done; + } +done: + return sk; +} + static struct sock *bpf_iter_udp_batch(struct seq_file *seq) { struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; + unsigned int find_cookie, end_cookie; struct net *net = seq_file_net(seq); - int resume_bucket, resume_offset; struct udp_table *udptable; unsigned int batch_sks = 0; - bool resized = false; + int resume_bucket; + int resizes = 0; struct sock *sk; + int err = 0; resume_bucket = state->bucket; - resume_offset = iter->offset; /* The current batch is done, so advance the bucket. */ - if (iter->st_bucket_done) + if (iter->cur_sk == iter->end_sk) state->bucket++; udptable = udp_get_table_seq(seq, net); @@ -3430,62 +3474,89 @@ again: * before releasing the bucket lock. This allows BPF programs that are * called in seq_show to acquire the bucket lock if needed. 
*/ + find_cookie = iter->cur_sk; + end_cookie = iter->end_sk; iter->cur_sk = 0; iter->end_sk = 0; - iter->st_bucket_done = false; batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) - continue; + goto next_bucket; - iter->offset = 0; spin_lock_bh(&hslot2->lock); - udp_portaddr_for_each_entry(sk, &hslot2->head) { + sk = hlist_entry_safe(hslot2->head.first, struct sock, + __sk_common.skc_portaddr_node); + /* Resume from the first (in iteration order) unseen socket from + * the last batch that still exists in resume_bucket. Most of + * the time this will just be where the last iteration left off + * in resume_bucket unless that socket disappeared between + * reads. + */ + if (state->bucket == resume_bucket) + sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie], + end_cookie - find_cookie); +fill_batch: + udp_portaddr_for_each_entry_from(sk) { if (seq_sk_match(seq, sk)) { - /* Resume from the last iterated socket at the - * offset in the bucket before iterator was stopped. - */ - if (state->bucket == resume_bucket && - iter->offset < resume_offset) { - ++iter->offset; - continue; - } if (iter->end_sk < iter->max_sk) { sock_hold(sk); - iter->batch[iter->end_sk++] = sk; + iter->batch[iter->end_sk++].sk = sk; } batch_sks++; } } + + /* Allocate a larger batch and try again. */ + if (unlikely(resizes <= 1 && iter->end_sk && + iter->end_sk != batch_sks)) { + resizes++; + + /* First, try with GFP_USER to maximize the chances of + * grabbing more memory. + */ + if (resizes == 1) { + spin_unlock_bh(&hslot2->lock); + err = bpf_iter_udp_realloc_batch(iter, + batch_sks * 3 / 2, + GFP_USER); + if (err) + return ERR_PTR(err); + /* Start over. */ + goto again; + } + + /* Next, hold onto the lock, so the bucket doesn't + * change while we get the rest of the sockets. + */ + err = bpf_iter_udp_realloc_batch(iter, batch_sks, + GFP_NOWAIT); + if (err) { + spin_unlock_bh(&hslot2->lock); + return ERR_PTR(err); + } + + /* Pick up where we left off. */ + sk = iter->batch[iter->end_sk - 1].sk; + sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next, + struct sock, + __sk_common.skc_portaddr_node); + batch_sks = iter->end_sk; + goto fill_batch; + } + spin_unlock_bh(&hslot2->lock); if (iter->end_sk) break; +next_bucket: + resizes = 0; } - /* All done: no batch made. */ - if (!iter->end_sk) - return NULL; - - if (iter->end_sk == batch_sks) { - /* Batching is done for the current bucket; return the first - * socket to be iterated from the batch. - */ - iter->st_bucket_done = true; - goto done; - } - if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) { - resized = true; - /* After allocating a larger batch, retry one more time to grab - * the whole bucket. - */ - goto again; - } -done: - return iter->batch[0]; + WARN_ON_ONCE(iter->end_sk != batch_sks); + return iter->end_sk ? iter->batch[0].sk : NULL; } static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -3496,16 +3567,14 @@ static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so unref the iter->cur_sk. */ - if (iter->cur_sk < iter->end_sk) { - sock_put(iter->batch[iter->cur_sk++]); - ++iter->offset; - } + if (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++].sk); /* After updating iter->cur_sk, check if there are more sockets * available in the current bucket batch. 
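The batching rework above stops tracking a positional offset per bucket and instead remembers socket cookies for the entries it has not shown yet, then resumes at the first remembered cookie that still exists in the bucket. A self-contained sketch of why a stable cookie survives concurrent removals where an offset does not (plain C over an array, all names invented):

/*
 * Illustrative only: resuming an iterator by stable cookie rather than by
 * positional offset, which is the approach bpf_iter_udp_resume() above takes.
 */
#include <stdio.h>
#include <stdint.h>

struct item { uint64_t cookie; };

/* Index of the first remembered cookie still present in the bucket,
 * or 0 (start over) if none of the remembered entries survived.
 */
static size_t resume_index(const struct item *bucket, size_t n,
                           const uint64_t *cookies, size_t n_cookies)
{
        for (size_t c = 0; c < n_cookies; c++)
                for (size_t i = 0; i < n; i++)
                        if (bucket[i].cookie == cookies[c])
                                return i;
        return 0;
}

int main(void)
{
        /* Entry 20 was already shown and has since been removed: a saved
         * offset of 2 would now skip 30, the saved cookie still finds it.
         */
        struct item bucket[] = { {10}, {30}, {40} };
        uint64_t unseen[] = { 30, 40 };

        printf("resume at index %zu\n", resume_index(bucket, 3, unseen, 2));
        return 0;
}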
*/ if (iter->cur_sk < iter->end_sk) - sk = iter->batch[iter->cur_sk]; + sk = iter->batch[iter->cur_sk].sk; else /* Prepare a new batch. */ sk = bpf_iter_udp_batch(seq); @@ -3557,7 +3626,7 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) goto unlock; } - uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket); @@ -3569,8 +3638,19 @@ unlock: static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter) { - while (iter->cur_sk < iter->end_sk) - sock_put(iter->batch[iter->cur_sk++]); + union bpf_udp_iter_batch_item *item; + unsigned int cur_sk = iter->cur_sk; + __u64 cookie; + + /* Remember the cookies of the sockets we haven't seen yet, so we can + * pick up where we left off next time around. + */ + while (cur_sk < iter->end_sk) { + item = &iter->batch[cur_sk++]; + cookie = sock_gen_cookie(item->sk); + sock_put(item->sk); + item->cookie = cookie; + } } static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) @@ -3586,10 +3666,8 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) (void)udp_prog_seq_show(prog, &meta, v, 0, 0); } - if (iter->cur_sk < iter->end_sk) { + if (iter->cur_sk < iter->end_sk) bpf_iter_udp_put_batch(iter); - iter->st_bucket_done = false; - } } static const struct seq_operations bpf_iter_udp_seq_ops = { @@ -3810,6 +3888,15 @@ fallback: static int __net_init udp_pernet_init(struct net *net) { +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + int i; + + /* No tunnel is configured */ + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); + } +#endif udp_sysctl_init(net); udp_set_table(net); @@ -3831,16 +3918,19 @@ DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, - unsigned int new_batch_sz) + unsigned int new_batch_sz, gfp_t flags) { - struct sock **new_batch; + union bpf_udp_iter_batch_item *new_batch; new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch), - GFP_USER | __GFP_NOWARN); + flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; - bpf_iter_udp_put_batch(iter); + if (flags != GFP_NOWAIT) + bpf_iter_udp_put_batch(iter); + + memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; @@ -3859,10 +3949,12 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) if (ret) return ret; - ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ); + ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); if (ret) bpf_iter_fini_seq_net(priv_data); + iter->state.bucket = -1; + return ret; } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index e1ff3a3759961..c7142213fc211 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _UDP4_IMPL_H #define _UDP4_IMPL_H +#include <net/aligned_data.h> #include <net/udp.h> #include <net/udplite.h> #include <net/protocol.h> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 9a8142ccbabe4..5128e2a5b00a1 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,6 +12,162 @@ #include <net/udp.h> #include <net/protocol.h> #include <net/inet_common.h> +#include 
<net/udp_tunnel.h> + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + +/* + * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL + * values for the udp tunnel static call. + */ +static struct sk_buff *dummy_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + +struct udp_tunnel_type_entry { + udp_tunnel_gro_rcv_t gro_receive; + refcount_t count; +}; + +#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ + IS_ENABLED(CONFIG_VXLAN) * 2 + \ + IS_ENABLED(CONFIG_NET_FOU) * 2 + \ + IS_ENABLED(CONFIG_XFRM) * 2) + +DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); +static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); +static DEFINE_MUTEX(udp_tunnel_gro_type_lock); +static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; +static unsigned int udp_tunnel_gro_type_nr; +static DEFINE_SPINLOCK(udp_tunnel_gro_lock); + +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + struct udp_sock *tup, *up = udp_sk(sk); + struct udp_tunnel_gro *udp_tunnel_gro; + + spin_lock(&udp_tunnel_gro_lock); + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; + if (add) + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); + else if (up->tunnel_list.pprev) + hlist_del_init(&up->tunnel_list); + + if (udp_tunnel_gro->list.first && + !udp_tunnel_gro->list.first->next) { + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, + tunnel_list); + + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); + } else { + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); + } + + spin_unlock(&udp_tunnel_gro_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); + +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) +{ + struct udp_tunnel_type_entry *cur = NULL; + struct udp_sock *up = udp_sk(sk); + int i, old_gro_type_nr; + + if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive) + return; + + mutex_lock(&udp_tunnel_gro_type_lock); + + /* Check if the static call is permanently disabled. */ + if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES) + goto out; + + for (i = 0; i < udp_tunnel_gro_type_nr; i++) + if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) + cur = &udp_tunnel_gro_types[i]; + + old_gro_type_nr = udp_tunnel_gro_type_nr; + if (add) { + /* + * Update the matching entry, if found, or add a new one + * if needed + */ + if (cur) { + refcount_inc(&cur->count); + goto out; + } + + if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { + pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); + /* Ensure static call will never be enabled */ + udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1; + } else { + cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; + refcount_set(&cur->count, 1); + cur->gro_receive = up->gro_receive; + } + } else { + /* + * The stack cleanups only successfully added tunnel, the + * lookup on removal should never fail. 
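udp_tunnel_update_gro_rcv() above keeps a small refcounted table of distinct gro_receive implementations and only enables the static-call fast path while exactly one implementation is registered, disabling it permanently if the table would overflow. Stripped of static calls and static branches, the bookkeeping looks roughly like the following sketch (every name is invented, not a kernel API):

/*
 * Illustrative only: enable a direct-dispatch fast path while exactly one
 * handler type is registered.
 */
#include <stdio.h>

#define MAX_TYPES 4

typedef void (*handler_t)(void);

struct type_entry { handler_t h; int refs; };

static struct type_entry types[MAX_TYPES];
static int nr_types;
static handler_t fast_path;     /* non-NULL only while nr_types == 1 */

static void type_add(handler_t h)
{
        for (int i = 0; i < nr_types; i++) {
                if (types[i].h == h) {
                        types[i].refs++;
                        goto update;
                }
        }
        if (nr_types == MAX_TYPES)
                return;         /* table full; the kernel disables the fast path for good */
        types[nr_types].h = h;
        types[nr_types].refs = 1;
        nr_types++;
update:
        fast_path = (nr_types == 1) ? types[0].h : NULL;
}

static void type_del(handler_t h)
{
        for (int i = 0; i < nr_types; i++) {
                if (types[i].h != h || --types[i].refs)
                        continue;
                types[i] = types[--nr_types];   /* keep the table gap-free */
                break;
        }
        fast_path = (nr_types == 1) ? types[0].h : NULL;
}

static void geneve_rcv(void) { puts("geneve"); }
static void vxlan_rcv(void)  { puts("vxlan"); }

int main(void)
{
        type_add(geneve_rcv);   /* one type: fast path available */
        type_add(vxlan_rcv);    /* two types: fast path off */
        type_del(vxlan_rcv);    /* back to one type */
        if (fast_path)
                fast_path();    /* prints "geneve" */
        return 0;
}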
+ */ + if (WARN_ON_ONCE(!cur)) + goto out; + + if (!refcount_dec_and_test(&cur->count)) + goto out; + + /* Avoid gaps, so that the enable tunnel has always id 0 */ + *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; + } + + if (udp_tunnel_gro_type_nr == 1) { + static_call_update(udp_tunnel_gro_rcv, + udp_tunnel_gro_types[0].gro_receive); + static_branch_enable(&udp_tunnel_static_call); + } else if (old_gro_type_nr == 1) { + static_branch_disable(&udp_tunnel_static_call); + static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); + } + +out: + mutex_unlock(&udp_tunnel_gro_type_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + if (static_branch_likely(&udp_tunnel_static_call)) { + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + return static_call(udp_tunnel_gro_rcv)(sk, head, skb); + } + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#else + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -332,6 +488,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, bool copy_dtor; __sum16 check; __be16 newlen; + int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) @@ -360,6 +517,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) return __udp_gso_segment_list(gso_skb, features, is_ipv6); + ret = __skb_linearize(gso_skb); + if (ret) + return ERR_PTR(ret); + /* Setup csum, as fraglist skips this in udp4_gro_receive. 
*/ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; gso_skb->csum_offset = offsetof(struct udphdr, check); @@ -599,6 +760,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, NAPI_GRO_CB(skb)->flush = 1; return NULL; } + skb_set_network_header(skb, skb_gro_receive_network_offset(skb)); ret = skb_gro_receive_list(p, skb); } else { skb_gro_postpull_rcsum(skb, uh, @@ -681,7 +843,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); + pp = udp_tunnel_gro_rcv(sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -694,8 +856,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, false); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, @@ -826,5 +993,6 @@ int __init udpv4_offload_init(void) .gro_complete = udp4_gro_complete, }, }; + return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 619a53eb672da..fce945f23069c 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,6 +58,15 @@ error: } EXPORT_SYMBOL(udp_sock_create4); +static bool sk_saddr_any(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#else + return !sk->sk_rcv_saddr; +#endif +} + void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -80,6 +89,12 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); + + udp_tunnel_update_gro_rcv(sk, true); + + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && + sk->sk_kern_sock) + udp_tunnel_update_gro_lookup(net, sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); @@ -119,15 +134,17 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) struct udp_tunnel_info ti; struct net_device *dev; + ASSERT_RTNL(); + ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { + udp_tunnel_nic_lock(dev); udp_tunnel_nic_add_port(dev, &ti); + udp_tunnel_nic_unlock(dev); } - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); @@ -139,22 +156,24 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) struct udp_tunnel_info ti; struct net_device *dev; + ASSERT_RTNL(); + ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { + udp_tunnel_nic_lock(dev); udp_tunnel_nic_del_port(dev, &ti); + udp_tunnel_nic_unlock(dev); } - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck) + bool xnet, bool nocheck, u16 ipcb_flags) { struct udphdr *uh; @@ -170,7 +189,8 @@ void 
udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb udp_set_csum(nocheck, skb, src, dst, skb->len); - iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); + iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet, + ipcb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index b6d2d16189c0c..ff66db48453cf 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -29,6 +29,7 @@ struct udp_tunnel_nic_table_entry { * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer + * @lock: protects all fields * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled @@ -41,6 +42,8 @@ struct udp_tunnel_nic { struct net_device *dev; + struct mutex lock; + u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; @@ -298,22 +301,11 @@ __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { - const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; - bool may_sleep; - if (!utn->need_sync) return; - /* Drivers which sleep in the callback need to update from - * the workqueue, if we come from the tunnel driver's notification. - */ - may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; - if (!may_sleep) - __udp_tunnel_nic_device_sync(dev, utn); - if (may_sleep || utn->need_replay) { - queue_work(udp_tunnel_nic_workqueue, &utn->work); - utn->work_pending = 1; - } + queue_work(udp_tunnel_nic_workqueue, &utn->work); + utn->work_pending = 1; } static bool @@ -554,12 +546,12 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) struct udp_tunnel_nic *utn; unsigned int i, j; - ASSERT_RTNL(); - utn = dev->udp_tunnel_nic; if (!utn) return; + mutex_lock(&utn->lock); + utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { @@ -569,7 +561,7 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); - /* We don't release rtnl across ops */ + /* We don't release utn lock across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; @@ -579,6 +571,8 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) } __udp_tunnel_nic_device_sync(dev, utn); + + mutex_unlock(&utn->lock); } static size_t @@ -643,6 +637,33 @@ err_cancel: return -EMSGSIZE; } +static void __udp_tunnel_nic_assert_locked(struct net_device *dev) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (utn) + lockdep_assert_held(&utn->lock); +} + +static void __udp_tunnel_nic_lock(struct net_device *dev) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (utn) + mutex_lock(&utn->lock); +} + +static void __udp_tunnel_nic_unlock(struct net_device *dev) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (utn) + mutex_unlock(&utn->lock); +} + static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, @@ -651,6 +672,9 @@ static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = 
__udp_tunnel_nic_dump_write, + .assert_locked = __udp_tunnel_nic_assert_locked, + .lock = __udp_tunnel_nic_lock, + .unlock = __udp_tunnel_nic_unlock, }; static void @@ -710,11 +734,15 @@ static void udp_tunnel_nic_device_sync_work(struct work_struct *work) container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); + mutex_lock(&utn->lock); + utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); + + mutex_unlock(&utn->lock); rtnl_unlock(); } @@ -730,6 +758,7 @@ udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); + mutex_init(&utn->lock); for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, @@ -821,8 +850,11 @@ static int udp_tunnel_nic_register(struct net_device *dev) dev_hold(dev); dev->udp_tunnel_nic = utn; - if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) { + udp_tunnel_nic_lock(dev); udp_tunnel_get_rx_info(dev); + udp_tunnel_nic_unlock(dev); + } return 0; } @@ -832,6 +864,8 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + udp_tunnel_nic_lock(dev); + /* For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ @@ -841,8 +875,10 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; - if (list_entry_is_head(node, &info->shared->devices, list)) + if (list_entry_is_head(node, &info->shared->devices, list)) { + udp_tunnel_nic_unlock(dev); return; + } list_del(&node->list); kfree(node); @@ -852,6 +888,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; + udp_tunnel_nic_unlock(dev); goto release_dev; } @@ -862,6 +899,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); + udp_tunnel_nic_unlock(dev); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. 
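The udp_tunnel_nic changes above introduce a per-device mutex plus lock/unlock helpers that quietly do nothing when a device carries no offload state, so callers can take the lock unconditionally. A small pthread sketch of that wrapper shape (offload_state, add_port and friends are invented names, not driver APIs):

/*
 * Illustrative only: per-object lock helpers that tolerate objects without
 * state, mirroring the udp_tunnel_nic lock/unlock wrappers added above.
 */
#include <pthread.h>
#include <stdio.h>

struct offload_state {
        pthread_mutex_t lock;
        int ports;
};

struct device {
        struct offload_state *state;    /* NULL when the device has no offload */
};

static void offload_lock(struct device *dev)
{
        if (dev->state)
                pthread_mutex_lock(&dev->state->lock);
}

static void offload_unlock(struct device *dev)
{
        if (dev->state)
                pthread_mutex_unlock(&dev->state->lock);
}

static void add_port(struct device *dev)
{
        offload_lock(dev);              /* no-op for devices without offload state */
        if (dev->state)
                dev->state->ports++;
        offload_unlock(dev);
}

int main(void)
{
        struct offload_state st = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct device with = { &st }, without = { NULL };

        add_port(&with);
        add_port(&without);             /* safe even though state is NULL */
        printf("ports: %d\n", st.ports);
        return 0;
}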
@@ -910,12 +948,16 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused, return NOTIFY_DONE; if (event == NETDEV_UP) { + udp_tunnel_nic_lock(dev); WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); + udp_tunnel_nic_unlock(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { + udp_tunnel_nic_lock(dev); udp_tunnel_nic_flush(dev, utn); + udp_tunnel_nic_unlock(dev); return NOTIFY_OK; } diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index af37af3ab727b..d3e621a11a1aa 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -60,7 +60,7 @@ struct proto udplite_prot = { .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 0d31a8c108d4f..f28cfd88eaf59 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -202,6 +202,9 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; + /* set the transport header to ESP */ + skb_set_transport_header(skb, offset); + NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 3cff51ba72bb0..0ae67d537499a 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -31,7 +31,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, skb->dev, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst_dev(skb), __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c6b22170dc492..f17a5dd4789fb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -239,6 +239,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ndisc_evict_nocarrier = 1, .ra_honor_pio_life = 0, .ra_honor_pio_pflag = 0, + .force_forwarding = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -303,6 +304,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ndisc_evict_nocarrier = 1, .ra_honor_pio_life = 0, .ra_honor_pio_pflag = 0, + .force_forwarding = 0, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -857,6 +859,9 @@ static void addrconf_forward_change(struct net *net, __s32 newf) idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); + /* Disabling all.forwarding sets 0 to force_forwarding for all interfaces */ + if (newf == 0) + WRITE_ONCE(idev->cnf.force_forwarding, 0); WRITE_ONCE(idev->cnf.forwarding, newf); if (changed) @@ -2229,32 +2234,29 @@ errdad: in6_ifa_put(ifp); } -/* Join to solicited addr multicast group. - * caller must hold RTNL */ +/* Join to solicited addr multicast group. 
*/ void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr) { struct in6_addr maddr; - if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + if (READ_ONCE(dev->flags) & (IFF_LOOPBACK | IFF_NOARP)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); } -/* caller must hold RTNL */ void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr) { struct in6_addr maddr; - if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + if (READ_ONCE(idev->dev->flags) & (IFF_LOOPBACK | IFF_NOARP)) return; addrconf_addr_solict_mult(addr, &maddr); __ipv6_dev_mc_dec(idev, &maddr); } -/* caller must hold RTNL */ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; @@ -2267,7 +2269,6 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp) __ipv6_dev_ac_inc(ifp->idev, &addr); } -/* caller must hold RTNL */ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) { struct in6_addr addr; @@ -3208,7 +3209,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, } } -#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) +#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) static void add_v4_addrs(struct inet6_dev *idev) { struct in6_addr addr; @@ -3367,7 +3368,7 @@ static int ipv6_generate_stable_address(struct in6_addr *address, retry: spin_lock_bh(&lock); - sha1_init(digest); + sha1_init_raw(digest); memset(&data, 0, sizeof(data)); memset(workspace, 0, sizeof(workspace)); memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); @@ -3463,6 +3464,7 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && (dev->type != ARPHRD_6LOWPAN) && + (dev->type != ARPHRD_IP6GRE) && (dev->type != ARPHRD_TUNNEL) && (dev->type != ARPHRD_NONE) && (dev->type != ARPHRD_RAWIP)) { @@ -3518,34 +3520,29 @@ static void addrconf_sit_config(struct net_device *dev) } #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) +#if IS_ENABLED(CONFIG_NET_IPGRE) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; ASSERT_RTNL(); - idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) { - pr_debug("%s: add_dev failed\n", __func__); + idev = addrconf_add_dev(dev); + if (IS_ERR(idev)) return; - } /* Generate the IPv6 link-local address using addrconf_addr_gen(), * unless we have an IPv4 GRE device not bound to an IP address and * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this * case). Such devices fall back to add_v4_addrs() instead. */ - if (!(dev->type == ARPHRD_IPGRE && *(__be32 *)dev->dev_addr == 0 && + if (!(*(__be32 *)dev->dev_addr == 0 && idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) { addrconf_addr_gen(idev, true); return; } add_v4_addrs(idev); - - if (dev->flags & IFF_POINTOPOINT) - addrconf_add_mroute(dev); } #endif @@ -3557,8 +3554,7 @@ static void addrconf_init_auto_addrs(struct net_device *dev) addrconf_sit_config(dev); break; #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) - case ARPHRD_IP6GRE: +#if IS_ENABLED(CONFIG_NET_IPGRE) case ARPHRD_IPGRE: addrconf_gre_config(dev); break; @@ -3865,7 +3861,7 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) * Do not dev_put! 
*/ if (unregister) { - idev->dead = 1; + WRITE_ONCE(idev->dead, 1); /* protected by rtnl_lock */ RCU_INIT_POINTER(dev->ip6_ptr, NULL); @@ -4017,7 +4013,7 @@ restart: static void addrconf_rs_timer(struct timer_list *t) { - struct inet6_dev *idev = from_timer(idev, t, rs_timer); + struct inet6_dev *idev = timer_container_of(idev, t, rs_timer); struct net_device *dev = idev->dev; struct in6_addr lladdr; int rtr_solicits; @@ -5349,12 +5345,12 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); return -EINVAL; @@ -5487,7 +5483,8 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, struct ifaddrmsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request"); return -EINVAL; } @@ -5496,7 +5493,6 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request"); return -EINVAL; @@ -5719,6 +5715,7 @@ static void ipv6_store_devconf(const struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_UNTRACKED_NA] = READ_ONCE(cnf->accept_untracked_na); array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft); + array[DEVCONF_FORCE_FORWARDING] = READ_ONCE(cnf->force_forwarding); } static inline size_t inet6_ifla6_size(void) @@ -6081,7 +6078,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, hdr->ifi_type = dev->type; ifindex = READ_ONCE(dev->ifindex); hdr->ifi_index = ifindex; - hdr->ifi_flags = dev_get_flags(dev); + hdr->ifi_flags = netif_get_flags(dev); hdr->ifi_change = 0; iflink = dev_get_iflink(dev); @@ -6115,7 +6112,8 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request"); return -EINVAL; } @@ -6125,7 +6123,6 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request"); @@ -6747,6 +6744,75 @@ static int addrconf_sysctl_disable_policy(const struct ctl_table *ctl, int write return ret; } +static void addrconf_force_forward_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get_rtnl_net(dev); + if (idev) { + int changed = (!idev->cnf.force_forwarding) ^ (!newf); + + WRITE_ONCE(idev->cnf.force_forwarding, newf); + if (changed) + inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + dev->ifindex, &idev->cnf); + } + } +} + +static int addrconf_sysctl_force_forwarding(const struct ctl_table *ctl, int write, + void *buffer, size_t 
*lenp, loff_t *ppos) +{ + struct inet6_dev *idev = ctl->extra1; + struct ctl_table tmp_ctl = *ctl; + struct net *net = ctl->extra2; + int *valp = ctl->data; + int new_val = *valp; + int old_val = *valp; + loff_t pos = *ppos; + int ret; + + tmp_ctl.extra1 = SYSCTL_ZERO; + tmp_ctl.extra2 = SYSCTL_ONE; + tmp_ctl.data = &new_val; + + ret = proc_douintvec_minmax(&tmp_ctl, write, buffer, lenp, ppos); + + if (write && old_val != new_val) { + if (!rtnl_net_trylock(net)) + return restart_syscall(); + + WRITE_ONCE(*valp, new_val); + + if (valp == &net->ipv6.devconf_dflt->force_forwarding) { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + } else if (valp == &net->ipv6.devconf_all->force_forwarding) { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + + addrconf_force_forward_change(net, new_val); + } else { + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORCE_FORWARDING, + idev->dev->ifindex, + &idev->cnf); + } + rtnl_net_unlock(net); + } + + if (ret) + *ppos = pos; + return ret; +} + static int minus_one = -1; static const int two_five_five = 255; static u32 ioam6_if_id_max = U16_MAX; @@ -7217,6 +7283,13 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "force_forwarding", + .data = &ipv6_devconf.force_forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_force_forwarding, + }, }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index ab054f329e12d..567efd626ab4d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -20,12 +20,6 @@ #include <linux/netlink.h> #include <linux/rtnetlink.h> -#if 0 -#define ADDRLABEL(x...) printk(x) -#else -#define ADDRLABEL(x...) do { ; } while (0) -#endif - /* * Policy Table */ @@ -150,8 +144,8 @@ u32 ipv6_addr_label(struct net *net, label = p ? 
p->label : IPV6_ADDR_LABEL_DEFAULT; rcu_read_unlock(); - ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", - __func__, addr, type, ifindex, label); + net_dbg_ratelimited("%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", __func__, addr, type, + ifindex, label); return label; } @@ -164,8 +158,8 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, struct ip6addrlbl_entry *newp; int addrtype; - ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", - __func__, prefix, prefixlen, ifindex, (unsigned int)label); + net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", __func__, + prefix, prefixlen, ifindex, (unsigned int)label); addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK); @@ -207,8 +201,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, struct hlist_node *n; int ret = 0; - ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, - replace); + net_dbg_ratelimited("%s(newp=%p, replace=%d)\n", __func__, newp, replace); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == newp->prefixlen && @@ -247,9 +240,8 @@ static int ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp; int ret = 0; - ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", - __func__, prefix, prefixlen, ifindex, (unsigned int)label, - replace); + net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n", + __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace); newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); if (IS_ERR(newp)) @@ -271,8 +263,8 @@ static int __ip6addrlbl_del(struct net *net, struct hlist_node *n; int ret = -ESRCH; - ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", - __func__, prefix, prefixlen, ifindex); + net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, + prefixlen, ifindex); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { if (p->prefixlen == prefixlen && @@ -294,8 +286,8 @@ static int ip6addrlbl_del(struct net *net, struct in6_addr prefix_buf; int ret; - ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", - __func__, prefix, prefixlen, ifindex); + net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix, + prefixlen, ifindex); ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); spin_lock(&net->ipv6.ip6addrlbl_table.lock); @@ -312,8 +304,6 @@ static int __net_init ip6addrlbl_net_init(struct net *net) int err; int i; - ADDRLABEL(KERN_DEBUG "%s\n", __func__); - spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); @@ -473,12 +463,12 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, { struct ifaddrlblmsg *ifal; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); return -EINVAL; } - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_prefixlen || ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); @@ -543,7 +533,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, struct ifaddrlblmsg *ifal; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = 
nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request"); return -EINVAL; } @@ -552,7 +543,6 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request"); return -EINVAL; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f60ec8b0f8ea4..1992621e3f3f4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -715,6 +715,7 @@ const struct proto_ops inet6_stream_ops = { #endif .set_rcvlowat = tcp_set_rcvlowat, }; +EXPORT_SYMBOL_GPL(inet6_stream_ops); const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, @@ -841,7 +842,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); @@ -881,7 +882,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, } return false; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 21e01695b48cd..f8a8e46286b8e 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -47,6 +47,9 @@ static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE]; static DEFINE_SPINLOCK(acaddr_hash_lock); +#define ac_dereference(a, idev) \ + rcu_dereference_protected(a, lockdep_is_held(&(idev)->lock)) + static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); static u32 inet6_acaddr_hash(const struct net *net, @@ -64,14 +67,12 @@ static u32 inet6_acaddr_hash(const struct net *net, int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_ac_socklist *pac = NULL; + struct net *net = sock_net(sk); + netdevice_tracker dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev; - struct ipv6_ac_socklist *pac; - struct net *net = sock_net(sk); - int ishost = !net->ipv6.devconf_all->forwarding; - int err = 0; - - ASSERT_RTNL(); + int err = 0, ishost; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -79,32 +80,43 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return -EINVAL; if (ifindex) - dev = __dev_get_by_index(net, ifindex); + dev = netdev_get_by_index(net, ifindex, &dev_tracker, GFP_KERNEL); - if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) - return -EINVAL; + if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) { + err = -EINVAL; + goto error; + } pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); - if (!pac) - return -ENOMEM; + if (!pac) { + err = -ENOMEM; + goto error; + } + pac->acl_next = NULL; pac->acl_addr = *addr; + ishost = !READ_ONCE(net->ipv6.devconf_all->forwarding); + if (ifindex == 0) { struct rt6_info *rt; + rcu_read_lock(); rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { - dev = rt->dst.dev; + dev = dst_dev(&rt->dst); + netdev_hold(dev, &dev_tracker, GFP_ATOMIC); ip6_rt_put(rt); } else if (ishost) { + rcu_read_unlock(); err = -EADDRNOTAVAIL; goto error; } else { /* router, no matching interface: just pick one */ - dev = 
__dev_get_by_flags(net, IFF_UP, - IFF_UP | IFF_LOOPBACK); + dev = netdev_get_by_flags_rcu(net, &dev_tracker, IFF_UP, + IFF_UP | IFF_LOOPBACK); } + rcu_read_unlock(); } if (!dev) { @@ -112,7 +124,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) goto error; } - idev = __in6_dev_get(dev); + idev = in6_dev_get(dev); if (!idev) { if (ifindex) err = -ENODEV; @@ -120,8 +132,9 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) err = -EADDRNOTAVAIL; goto error; } + /* reset ishost, now that we have a specific device */ - ishost = !idev->cnf.forwarding; + ishost = !READ_ONCE(idev->cnf.forwarding); pac->acl_ifindex = dev->ifindex; @@ -134,7 +147,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ishost) err = -EADDRNOTAVAIL; if (err) - goto error; + goto error_idev; } err = __ipv6_dev_ac_inc(idev, addr); @@ -144,7 +157,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) pac = NULL; } +error_idev: + in6_dev_put(idev); error: + netdev_put(dev, &dev_tracker); + if (pac) sock_kfree_s(sk, pac, sizeof(*pac)); return err; @@ -155,12 +172,10 @@ error: */ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct net_device *dev; struct ipv6_ac_socklist *pac, *prev_pac; + struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); - - ASSERT_RTNL(); + struct net_device *dev; prev_pac = NULL; for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { @@ -176,9 +191,11 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) else np->ipv6_ac_list = pac->acl_next; - dev = __dev_get_by_index(net, pac->acl_ifindex); - if (dev) + dev = dev_get_by_index(net, pac->acl_ifindex); + if (dev) { ipv6_dev_ac_dec(dev, &pac->acl_addr); + dev_put(dev); + } sock_kfree_s(sk, pac, sizeof(*pac)); return 0; @@ -187,21 +204,20 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) void __ipv6_sock_ac_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); struct net_device *dev = NULL; struct ipv6_ac_socklist *pac; - struct net *net = sock_net(sk); - int prev_index; + int prev_index = 0; - ASSERT_RTNL(); pac = np->ipv6_ac_list; np->ipv6_ac_list = NULL; - prev_index = 0; while (pac) { struct ipv6_ac_socklist *next = pac->acl_next; if (pac->acl_ifindex != prev_index) { - dev = __dev_get_by_index(net, pac->acl_ifindex); + dev_put(dev); + dev = dev_get_by_index(net, pac->acl_ifindex); prev_index = pac->acl_ifindex; } if (dev) @@ -209,6 +225,8 @@ void __ipv6_sock_ac_close(struct sock *sk) sock_kfree_s(sk, pac, sizeof(*pac)); pac = next; } + + dev_put(dev); } void ipv6_sock_ac_close(struct sock *sk) @@ -217,9 +235,8 @@ void ipv6_sock_ac_close(struct sock *sk) if (!np->ipv6_ac_list) return; - rtnl_lock(); + __ipv6_sock_ac_close(sk); - rtnl_unlock(); } static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca) @@ -319,16 +336,14 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) struct net *net; int err; - ASSERT_RTNL(); - write_lock_bh(&idev->lock); if (idev->dead) { err = -ENODEV; goto out; } - for (aca = rtnl_dereference(idev->ac_list); aca; - aca = rtnl_dereference(aca->aca_next)) { + for (aca = ac_dereference(idev->ac_list, idev); aca; + aca = ac_dereference(aca->aca_next, idev)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) { aca->aca_users++; err = 0; @@ -380,12 +395,10 @@ int 
__ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca, *prev_aca; - ASSERT_RTNL(); - write_lock_bh(&idev->lock); prev_aca = NULL; - for (aca = rtnl_dereference(idev->ac_list); aca; - aca = rtnl_dereference(aca->aca_next)) { + for (aca = ac_dereference(idev->ac_list, idev); aca; + aca = ac_dereference(aca->aca_next, idev)) { if (ipv6_addr_equal(&aca->aca_addr, addr)) break; prev_aca = aca; @@ -414,14 +427,18 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) return 0; } -/* called with rtnl_lock() */ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr) { - struct inet6_dev *idev = __in6_dev_get(dev); + struct inet6_dev *idev = in6_dev_get(dev); + int err; if (!idev) return -ENODEV; - return __ipv6_dev_ac_dec(idev, addr); + + err = __ipv6_dev_ac_dec(idev, addr); + in6_dev_put(idev); + + return err; } void ipv6_ac_destroy_dev(struct inet6_dev *idev) @@ -429,7 +446,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) struct ifacaddr6 *aca; write_lock_bh(&idev->lock); - while ((aca = rtnl_dereference(idev->ac_list)) != NULL) { + while ((aca = ac_dereference(idev->ac_list, idev)) != NULL) { rcu_assign_pointer(idev->ac_list, aca->aca_next); write_unlock_bh(&idev->lock); diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 62618a058b8fa..df1986973430c 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -32,7 +32,7 @@ #include <linux/unaligned.h> #include <linux/crc-ccitt.h> -/* Maximium size of the calipso option including +/* Maximum size of the calipso option including * the two-byte TLV header. */ #define CALIPSO_OPT_LEN_MAX (2 + 252) @@ -42,13 +42,13 @@ */ #define CALIPSO_HDR_LEN (2 + 8) -/* Maximium size of the calipso option including +/* Maximum size of the calipso option including * the two-byte TLV header and upto 3 bytes of * leading pad and 7 bytes of trailing pad. */ #define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) - /* Maximium size of u32 aligned buffer required to hold calipso + /* Maximum size of u32 aligned buffer required to hold calipso * option. Max of 3 initial pad bytes starting from buffer + 3. * i.e. the worst case is when the previous tlv finishes on 4n + 3. 
*/ @@ -1207,6 +1207,10 @@ static int calipso_req_setattr(struct request_sock *req, struct ipv6_opt_hdr *old, *new; struct sock *sk = sk_to_full_sk(req_to_sk(req)); + /* sk is NULL for SYN+ACK w/ SYN Cookie */ + if (!sk) + return -ENOMEM; + if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) old = req_inet->ipv6_opt->hopopt; else @@ -1247,6 +1251,10 @@ static void calipso_req_delattr(struct request_sock *req) struct ipv6_txoptions *txopts; struct sock *sk = sk_to_full_sk(req_to_sk(req)); + /* sk is NULL for SYN+ACK w/ SYN Cookie */ + if (!sk) + return; + if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) return; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index fff78496803da..972bf0426d599 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -53,7 +53,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; @@ -127,7 +127,7 @@ void ip6_datagram_release_cb(struct sock *sk) rcu_read_lock(); dst = __sk_dst_get(sk); - if (!dst || !dst->obsolete || + if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { rcu_read_unlock(); return; @@ -1064,7 +1064,7 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, sk_wmem_alloc_get(sp), rqueue, 0, 0L, 0, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 457de0745a338..d1ef9644f8262 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -306,7 +306,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(dev_net(dst->dev), idev, + __IP6_INC_STATS(dev_net(dst_dev(dst)), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); @@ -460,7 +460,7 @@ looped_back: return -1; } - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, @@ -621,7 +621,7 @@ looped_back: return -1; } - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, @@ -783,7 +783,7 @@ looped_back: kfree_skb(skb); return -1; } - if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { + if (!ipv6_chk_home_addr(skb_dst_dev_net(skb), addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -809,7 +809,7 @@ looped_back: return -1; } - if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3fd19a84b358d..44550957fd4e3 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -196,6 +196,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); + struct 
net_device *dev; struct dst_entry *dst; bool res = false; @@ -208,10 +209,11 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, * this lookup should be more aggressive (not longer than timeout). */ dst = ip6_route_output(net, sk, fl6); + dev = dst_dev(dst); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); - } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { + } else if (dev && (dev->flags & IFF_LOOPBACK)) { res = true; } else { struct rt6_info *rt = dst_rt6_info(dst); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 95e9146918cc6..b8d43ed4689db 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -86,7 +86,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&th->check, skb, - diff, true); + diff, true, true); } break; case NEXTHDR_UDP: @@ -97,7 +97,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&uh->check, skb, - diff, true); + diff, true, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -111,7 +111,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, - diff, true); + diff, true, true); } break; } diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 7d574f5132e2f..7bb9edc5c28c5 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -70,7 +70,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = orig_dst->dev->ifindex; + fl6.flowi6_oif = dst_dev(orig_dst)->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst), &ip6h->daddr); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index dbcf556a35bbc..333e43434dd78 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -45,7 +45,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); @@ -54,7 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -EXPORT_SYMBOL(inet6_csk_route_req); static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) @@ -80,7 +79,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); @@ -137,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? 
NULL : dst; } -EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index a84d332f952f6..9553a32000813 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -696,6 +696,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_schema *sc, u8 sclen, bool is_input) { + struct net_device *dev = skb_dst_dev(skb); struct timespec64 ts; ktime_t tstamp; u64 raw64; @@ -712,7 +713,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (is_input) byte--; - raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; + raw32 = dev_net(dev)->ipv6.sysctl.ioam6_id; *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); data += sizeof(__be32); @@ -728,10 +729,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + if (dev->flags & IFF_LOOPBACK) raw16 = IOAM6_U16_UNAVAILABLE; else - raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id); + raw16 = (__force u16)READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); @@ -783,10 +784,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct Qdisc *qdisc; __u32 qlen, backlog; - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (dev->flags & IFF_LOOPBACK) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { - queue = skb_get_tx_queue(skb_dst(skb)->dev, skb); + queue = skb_get_tx_queue(dev, skb); qdisc = rcu_dereference(queue->qdisc); qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog); @@ -807,7 +808,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (is_input) byte--; - raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; + raw64 = dev_net(dev)->ipv6.sysctl.ioam6_id_wide; *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); data += sizeof(__be64); @@ -823,10 +824,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + if (dev->flags & IFF_LOOPBACK) raw32 = IOAM6_U32_UNAVAILABLE; else - raw32 = READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide); + raw32 = READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 09065187378e6..1fe7894f14dd9 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -38,6 +38,7 @@ struct ioam6_lwt_freq { }; struct ioam6_lwt { + struct dst_entry null_dst; struct dst_cache cache; struct ioam6_lwt_freq freq; atomic_t pkt_cnt; @@ -177,6 +178,14 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, if (err) goto free_lwt; + /* This "fake" dst_entry will be stored in a dst_cache, which will call + * dst_hold() and dst_release() on it. We must ensure that dst_destroy() + * will never be called. For that, its initial refcount is 1 and +1 when + * it is stored in the cache. Then, +1/-1 each time we read the cache + * and release it. Long story short, we're fine. 
+ */ + dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); + atomic_set(&ilwt->pkt_cnt, 0); ilwt->freq.k = freq_k; ilwt->freq.n = freq_n; @@ -326,7 +335,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, if (has_tunsrc) memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc)); else - ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, + ipv6_dev_get_saddr(net, dst_dev(dst), &hdr->daddr, IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); skb_postpush_rcsum(skb, hdr, len); @@ -336,7 +345,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; struct ioam6_lwt *ilwt; int err = -EINVAL; u32 pkt_cnt; @@ -344,7 +354,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ilwt = ioam6_lwt_state(dst->lwtstate); + ilwt = ioam6_lwt_state(orig_dst->lwtstate); /* Check for insertion frequency (i.e., "k over n" insertions) */ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt); @@ -352,9 +362,20 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto out; local_bh_disable(); - cache_dst = dst_cache_get(&ilwt->cache); + dst = dst_cache_get(&ilwt->cache); local_bh_enable(); + /* This is how we notify that the destination does not change after + * transformation and that we need to use orig_dst instead of the cache + */ + if (dst == &ilwt->null_dst) { + dst_release(dst); + + dst = orig_dst; + /* keep refcount balance: dst_release() is called at the end */ + dst_hold(dst); + } + switch (ilwt->mode) { case IOAM6_IPTUNNEL_MODE_INLINE: do_inline: @@ -362,7 +383,7 @@ do_inline: if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) goto out; - err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst); + err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst); if (unlikely(err)) goto drop; @@ -372,7 +393,7 @@ do_encap: /* Encapsulation (ip6ip6) */ err = ioam6_do_encap(net, skb, &ilwt->tuninfo, ilwt->has_tunsrc, &ilwt->tunsrc, - &ilwt->tundst, cache_dst); + &ilwt->tundst, dst); if (unlikely(err)) goto drop; @@ -390,7 +411,7 @@ do_encap: goto drop; } - if (unlikely(!cache_dst)) { + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -401,20 +422,27 @@ do_encap: fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; - cache_dst = ip6_route_output(net, NULL, &fl6); - if (cache_dst->error) { - err = cache_dst->error; + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; goto drop; } - /* cache only if we don't create a dst reference loop */ - if (dst->lwtstate != cache_dst->lwtstate) { - local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); - local_bh_enable(); - } - - err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); + /* If the destination is the same after transformation (which is + * a valid use case for IOAM), then we don't want to add it to + * the cache in order to avoid a reference loop. Instead, we add + * our fake dst_entry to the cache as a way to detect this case. + * Otherwise, we add the resolved destination to the cache. 
+ */ + local_bh_disable(); + if (orig_dst->lwtstate == dst->lwtstate) + dst_cache_set_ip6(&ilwt->cache, + &ilwt->null_dst, &fl6.saddr); + else + dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); + local_bh_enable(); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } @@ -422,22 +450,26 @@ do_encap: /* avoid lwtunnel_output() reentry loop when destination is the same * after transformation (e.g., with the inline mode) */ - if (dst->lwtstate != cache_dst->lwtstate) { + if (orig_dst->lwtstate != dst->lwtstate) { skb_dst_drop(skb); - skb_dst_set(skb, cache_dst); + skb_dst_set(skb, dst); return dst_output(net, sk, skb); } out: - dst_release(cache_dst); - return dst->lwtstate->orig_output(net, sk, skb); + dst_release(dst); + return orig_dst->lwtstate->orig_output(net, sk, skb); drop: - dst_release(cache_dst); + dst_release(dst); kfree_skb(skb); return err; } static void ioam6_destroy_state(struct lwtunnel_state *lwt) { + /* Since the refcount of per-cpu dst_entry caches will never be 0 (see + * why above) when our "fake" dst_entry is used, it is not necessary to + * remove them before calling dst_cache_destroy() + */ dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bf727149fdece..02c16909f6182 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -249,40 +249,52 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *fib6_new_table(struct net *net, u32 id) { - struct fib6_table *tb; + struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); if (tb) return tb; - tb = fib6_alloc_table(net, id); - if (tb) - fib6_link_table(net, tb); + new_tb = fib6_alloc_table(net, id); + if (!new_tb) + return NULL; + + spin_lock_bh(&net->ipv6.fib_table_hash_lock); + + tb = fib6_get_table(net, id); + if (unlikely(tb)) { + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + kfree(new_tb); + return tb; + } - return tb; + fib6_link_table(net, new_tb); + + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + + return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { - struct fib6_table *tb; struct hlist_head *head; - unsigned int h; + struct fib6_table *tb; - if (id == 0) + if (!id) id = RT6_TABLE_MAIN; - h = id & (FIB6_TABLE_HASHSZ - 1); - rcu_read_lock(); - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - if (tb->tb6_id == id) { - rcu_read_unlock(); + + head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; + + /* See comment in fib6_link_table(). RCU is not required, + * but rcu_dereference_raw() is used to avoid data-race. 
+ */ + hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) + if (tb->tb6_id == id) return tb; - } - } - rcu_read_unlock(); return NULL; } @@ -433,15 +445,17 @@ struct fib6_dump_arg { static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; + unsigned int nsiblings; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; - if (rt->fib6_nsiblings) + nsiblings = READ_ONCE(rt->fib6_nsiblings); + if (nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, - rt->fib6_nsiblings, + nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, @@ -951,8 +965,7 @@ insert_above: } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, - const struct fib6_info *match, - const struct fib6_table *table) + const struct fib6_info *match) { int cpu; @@ -987,21 +1000,15 @@ static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, rcu_read_unlock(); } -struct fib6_nh_pcpu_arg { - struct fib6_info *from; - const struct fib6_table *table; -}; - static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { - struct fib6_nh_pcpu_arg *arg = _arg; + struct fib6_info *arg = _arg; - __fib6_drop_pcpu_from(nh, arg->from, arg->table); + __fib6_drop_pcpu_from(nh, arg); return 0; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void fib6_drop_pcpu_from(struct fib6_info *f6i) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. @@ -1010,18 +1017,14 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { - struct fib6_nh_pcpu_arg arg = { - .from = f6i, - .table = table - }; - - nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, - &arg); + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i); + rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; - __fib6_drop_pcpu_from(fib6_nh, f6i, table); + __fib6_drop_pcpu_from(fib6_nh, f6i); } } @@ -1032,10 +1035,16 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt); + + if (rt->nh) { + spin_lock(&rt->nh->lock); + + if (!list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); - if (rt->nh && !list_empty(&rt->nh_list)) - list_del_init(&rt->nh_list); + spin_unlock(&rt->nh->lock); + } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1069,8 +1078,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, - struct nl_info *info, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -1119,7 +1128,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) - rt->fib6_nsiblings = 0; + WRITE_ONCE(rt->fib6_nsiblings, 0); if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { @@ -1148,7 +1157,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) - 
rt->fib6_nsiblings++; + WRITE_ONCE(rt->fib6_nsiblings, + rt->fib6_nsiblings + 1); } if (iter->fib6_metric > rt->fib6_metric) @@ -1198,12 +1208,15 @@ next_iter: fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { - sibling->fib6_nsiblings++; + WRITE_ONCE(sibling->fib6_nsiblings, + sibling->fib6_nsiblings + 1); BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); + rcu_read_unlock(); } /* @@ -1243,10 +1256,13 @@ add: list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) - sibling->fib6_nsiblings--; - rt->fib6_nsiblings = 0; + WRITE_ONCE(sibling->fib6_nsiblings, + sibling->fib6_nsiblings - 1); + WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); + rcu_read_lock(); rt6_multipath_rebalance(next_sibling); + rcu_read_unlock(); return err; } } @@ -1294,10 +1310,9 @@ add: } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1310,10 +1325,9 @@ add: if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { @@ -1329,6 +1343,28 @@ add: return 0; } +static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) +{ + int err; + + spin_lock(&rt->nh->lock); + + if (rt->nh->dead) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = fib6_add_rt2node(fn, rt, info, extack, purge_list); + if (!err) + list_add(&rt->nh_list, &rt->nh->f6i_list); + } + + spin_unlock(&rt->nh->lock); + + return err; +} + static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && @@ -1383,6 +1419,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; + LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; @@ -1485,10 +1522,19 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, extack); + if (rt->nh) + err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); + else + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { - if (rt->nh) - list_add(&rt->nh_list, &rt->nh->f6i_list); + struct fib6_info *iter, *next; + + list_for_each_entry_safe(iter, next, &purge_list, purge_link) { + list_del(&iter->purge_link); + fib6_purge_rt(iter, fn, info->nl_net); + fib6_info_release(iter); + } + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) @@ -1961,8 +2007,9 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) - sibling->fib6_nsiblings--; - rt->fib6_nsiblings = 0; + WRITE_ONCE(sibling->fib6_nsiblings, + 
sibling->fib6_nsiblings - 1); + WRITE_ONCE(rt->fib6_nsiblings, 0); list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } @@ -2389,7 +2436,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) static void fib6_gc_timer_cb(struct timer_list *t) { - struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); + struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } @@ -2423,6 +2470,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib_table_hash) goto out_rt6_stats; + spin_lock_init(&net->ipv6.fib_table_hash_lock); + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 957ca98fa70f9..74d49dd6124d0 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -111,8 +111,32 @@ static u32 HASH_ADDR(const struct in6_addr *addr) #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] -/* Given src, dst and key, find appropriate for input tunnel. */ +static bool ip6gre_tunnel_match(struct ip6_tnl *t, int dev_type, int link, + int *cand_score, struct ip6_tnl **ret) +{ + int score = 0; + + if (t->dev->type != ARPHRD_IP6GRE && + t->dev->type != dev_type) + return false; + + if (t->parms.link != link) + score |= 1; + if (t->dev->type != dev_type) + score |= 2; + if (score == 0) { + *ret = t; + return true; + } + if (score < *cand_score) { + *ret = t; + *cand_score = score; + } + return false; +} + +/* Given src, dst and key, find appropriate for input tunnel. */ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, const struct in6_addr *remote, const struct in6_addr *local, __be32 key, __be16 gre_proto) @@ -127,8 +151,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, gre_proto == htons(ETH_P_ERSPAN) || gre_proto == htons(ETH_P_ERSPAN2)) ? 
ARPHRD_ETHER : ARPHRD_IP6GRE; - int score, cand_score = 4; struct net_device *ndev; + int cand_score = 4; for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || @@ -137,22 +161,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, !(t->dev->flags & IFF_UP)) continue; - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } + if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) + return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { @@ -161,22 +171,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, !(t->dev->flags & IFF_UP)) continue; - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } + if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) + return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { @@ -187,22 +183,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, !(t->dev->flags & IFF_UP)) continue; - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } + if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) + return cand; } for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { @@ -210,22 +192,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, !(t->dev->flags & IFF_UP)) continue; - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } + if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand)) + return cand; } if (cand) @@ -1085,9 +1053,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ - if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); - + if (!t->parms.collect_md && dst) { + mtu = READ_ONCE(dst_dev(dst)->mtu); + if (dst_mtu(dst) > mtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu, false); + } err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); if (err != 0) { @@ -1570,7 +1540,7 @@ static struct inet6_protocol ip6gre_protocol __read_mostly = { .flags = INET6_PROTO_FINAL, }; -static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *dev, *aux; @@ -1587,16 +1557,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; - t = rtnl_dereference(ign->tunnels[prio][h]); + t = rtnl_net_dereference(net, ign->tunnels[prio][h]); while (t) { /* If dev is in the 
same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1640,19 +1610,9 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit_batch_rtnl = ip6gre_exit_batch_rtnl, + .exit_rtnl = ip6gre_exit_rtnl_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 39da6a7ce5f12..168ec07e31cc3 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -187,7 +187,9 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ - IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; + IP6CB(skb)->iif = skb_valid_dst(skb) ? + ip6_dst_idev(skb_dst(skb))->dev->ifindex : + dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; @@ -476,6 +478,13 @@ discard: static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INDISCARDS); + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); + return 0; + } + skb_clear_delivery_time(skb); ip6_protocol_deliver_rcu(net, skb, 0, false); @@ -499,38 +508,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + struct net_device *dev = skb->dev; int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; - struct net_device *dev; bool deliver; - __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), - __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, - skb->len); + __IP6_UPD_PO_STATS(skb_dst_dev_net_rcu(skb), + __in6_dev_get_safely(dev), IPSTATS_MIB_INMCAST, + skb->len); /* skb->dev passed may be master dev for vrfs. 
*/ if (sdif) { - rcu_read_lock(); - dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + dev = dev_get_by_index_rcu(dev_net_rcu(dev), sdif); if (!dev) { - rcu_read_unlock(); kfree_skb(skb); return -ENODEV; } - } else { - dev = skb->dev; } hdr = ipv6_hdr(skb); deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL); - if (sdif) - rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* * IPv6 multicast router mode is now supported ;) */ - if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) && + if (atomic_read(&dev_net_rcu(skb->dev)->ipv6.devconf_all->mc_forwarding) && !(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { @@ -571,22 +574,21 @@ int ip6_mc_input(struct sk_buff *skb) /* unknown RA - process it normally */ } - if (deliver) + if (deliver) { skb2 = skb_clone(skb, GFP_ATOMIC); - else { + } else { skb2 = skb; skb = NULL; } - if (skb2) { + if (skb2) ip6_mr_input(skb2); - } } out: #endif - if (likely(deliver)) + if (likely(deliver)) { ip6_input(skb); - else { + } else { /* discard */ kfree_skb(skb); } diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9822163428b02..fce91183797a6 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -148,7 +148,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { - skb_reset_transport_header(skb); + if (!skb_reset_transport_header_careful(skb)) + goto out; + segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 581bc62890818..1e1410237b6ef 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -60,7 +60,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; @@ -232,8 +232,9 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; - struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + struct dst_entry *dst = skb_dst(skb); + struct net_device *dev = dst_dev(dst), *indev = skb->dev; + struct inet6_dev *idev = ip6_dst_idev(dst); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -259,7 +260,7 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP and SCTP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. 
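The ip6_forward() and ip6_output() hunks around here consistently replace open-coded dst->dev dereferences with a dst_dev() accessor and hoist the result into a local dev variable, which is then reused for the packet-too-big path, the skb_cow() headroom check and the NF_HOOK() output device. As a rough user-space sketch of that hoist-through-one-accessor pattern (the struct and function names below are invented for illustration and are not the kernel definitions):

/* sketch only: illustrates reading the device pointer once through a
 * single accessor and reusing it, as the patched ip6_forward() does */
#include <stdio.h>

struct fake_net_device {
	int ifindex;
	unsigned int flags;
};

struct fake_dst_entry {
	struct fake_net_device *dev;
	int error;
};

/* every reader goes through one helper, so later annotation changes
 * (READ_ONCE(), RCU marking, ...) stay confined to a single place */
static inline struct fake_net_device *fake_dst_dev(const struct fake_dst_entry *dst)
{
	return dst->dev;
}

static int forward_one(struct fake_dst_entry *dst)
{
	/* hoist the device pointer once instead of dereferencing dst->dev
	 * at every use site further down the function */
	struct fake_net_device *dev = fake_dst_dev(dst);

	if (dst->error)
		return -1;
	printf("forwarding via ifindex %d (flags 0x%x)\n",
	       dev->ifindex, dev->flags);
	return 0;
}

int main(void)
{
	struct fake_net_device eth0 = { .ifindex = 2, .flags = 0x1 };
	struct fake_dst_entry dst = { .dev = &eth0, .error = 0 };

	return forward_one(&dst);
}

Routing every reader through one helper also means a later change to how the pointer is fetched does not have to touch each call site again.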
@@ -271,7 +272,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); @@ -503,13 +504,15 @@ int ip6_forward(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); - struct net *net = dev_net(dst->dev); + struct net *net = dev_net(dst_dev(dst)); + struct net_device *dev; struct inet6_dev *idev; SKB_DR(reason); u32 mtu; idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); - if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) + if (!READ_ONCE(net->ipv6.devconf_all->forwarding) && + (!idev || !READ_ONCE(idev->cnf.force_forwarding))) goto error; if (skb->pkt_type != PACKET_HOST) @@ -561,7 +564,7 @@ int ip6_forward(struct sk_buff *skb) /* XXX: idev->cnf.proxy_ndp? */ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit @@ -591,12 +594,12 @@ int ip6_forward(struct sk_buff *skb) goto drop; } dst = skb_dst(skb); - + dev = dst_dev(dst); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ - if (IP6CB(skb)->iif == dst->dev->ifindex && + if (IP6CB(skb)->iif == dev->ifindex && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct inet_peer *peer; @@ -644,7 +647,7 @@ int ip6_forward(struct sk_buff *skb) if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ - skb->dev = dst->dev; + skb->dev = dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), @@ -653,7 +656,7 @@ int ip6_forward(struct sk_buff *skb) return -EMSGSIZE; } - if (skb_cow(skb, dst->dev->hard_header_len)) { + if (skb_cow(skb, dev->hard_header_len)) { __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; @@ -666,7 +669,7 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, - net, NULL, skb, skb->dev, dst->dev, + net, NULL, skb, skb->dev, dev, ip6_forward_finish); error: @@ -1093,7 +1096,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif - (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { + (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); dst = NULL; } @@ -1524,7 +1527,8 @@ emsgsize: uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1759,8 +1763,7 @@ alloc_new_skb: if (WARN_ON_ONCE(copy > msg->msg_iter.count)) goto error; - err = skb_splice_from_iter(skb, &msg->msg_iter, copy, - sk->sk_allocation); + err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) goto error; 
copy = err; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a04dd1bb4b195..3262e81223dfc 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -632,7 +632,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || - skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) + skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6) goto out; } @@ -1179,7 +1179,7 @@ route_lookup: ndst = dst; } - tdev = dst->dev; + tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); @@ -1255,7 +1255,7 @@ route_lookup: /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ - max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); @@ -1278,7 +1278,7 @@ route_lookup: ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; - ip6tunnel_xmit(NULL, skb, dev); + ip6tunnel_xmit(NULL, skb, dev, 0); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); @@ -1562,11 +1562,22 @@ static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) netdev_state_change(t->dev); } -static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) +static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, + bool strict) { - /* for default tnl0 device allow to change only the proto */ + /* For the default ip6tnl0 device, allow changing only the protocol + * (the IP6_TNL_F_CAP_PER_PACKET flag is set on ip6tnl0, and all other + * parameters are 0). + */ + if (strict && + (!ipv6_addr_any(&p->laddr) || !ipv6_addr_any(&p->raddr) || + p->flags != t->parms.flags || p->hop_limit || p->encap_limit || + p->flowinfo || p->link || p->fwmark || p->collect_md)) + return -EINVAL; + t->parms.proto = p->proto; netdev_state_change(t->dev); + return 0; } static void @@ -1680,7 +1691,7 @@ ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) - ip6_tnl0_update(t, &p1); + ip6_tnl0_update(t, &p1, false); else ip6_tnl_update(t, &p1); } @@ -2053,8 +2064,28 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; - if (dev == ip6n->fb_tnl_dev) - return -EINVAL; + if (dev == ip6n->fb_tnl_dev) { + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { + /* iproute2 always sets TUNNEL_ENCAP_FLAG_CSUM6, so + * let's ignore this flag. 
+ */ + ipencap.flags &= ~TUNNEL_ENCAP_FLAG_CSUM6; + if (memchr_inv(&ipencap, 0, sizeof(ipencap))) { + NL_SET_ERR_MSG(extack, + "Only protocol can be changed for fallback tunnel, not encap params"); + return -EINVAL; + } + } + + ip6_tnl_netlink_parms(data, &p); + if (ip6_tnl0_update(t, &p, true) < 0) { + NL_SET_ERR_MSG(extack, + "Only protocol can be changed for fallback tunnel"); + return -EINVAL; + } + + return 0; + } if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); @@ -2210,7 +2241,7 @@ static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) +static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; @@ -2222,25 +2253,27 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } @@ -2287,19 +2320,9 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, + .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index c99053189ea8a..0ff547a4bff71 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -74,13 +74,14 @@ error: } EXPORT_SYMBOL_GPL(udp_sock_create6); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck) +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -108,8 +109,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, ip6h->daddr = *daddr; ip6h->saddr = *saddr; - ip6tunnel_xmit(sk, skb, dev); - return 0; + ip6tunnel_xmit(sk, skb, dev, ip6cb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); @@ -168,7 +168,7 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); 
return ERR_PTR(-ENETUNREACH); } - if (dst->dev == dev) { /* is this necessary? */ + if (dst_dev(dst) == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); dst_release(dst); return ERR_PTR(-ELOOP); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 09ec4b0ad7dc6..ad5290be4dd61 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -497,7 +497,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) (const struct in6_addr *)&x->id.daddr)) goto tx_err_link_failure; - tdev = dst->dev; + tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); @@ -529,7 +529,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) xmit: skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); skb_dst_set(skb, dst); - skb->dev = skb_dst(skb)->dev; + skb->dev = dst_dev(dst); err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) @@ -1112,21 +1112,21 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, - struct list_head *list) +static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list) { - int h; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t; + int h; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } @@ -1170,22 +1170,9 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct vti6_net *ip6n; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - ip6n = net_generic(net, vti6_net_id); - vti6_destroy_tunnels(ip6n, dev_to_kill); - } -} - static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit_batch_rtnl = vti6_exit_batch_rtnl, + .exit_rtnl = vti6_exit_rtnl_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 3276cde5ebd70..e047a4680ab0e 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -839,7 +839,7 @@ static void ipmr_do_expire_process(struct mr_table *mrt) static void ipmr_expire_process(struct timer_list *t) { - struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); + struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); @@ -2035,8 +2035,8 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct * Processing handlers for ip6mr_forward */ -static int ip6mr_forward2(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, int vifi) +static int ip6mr_prepare_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; @@ -2046,7 +2046,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, vif_dev = vif_dev_read(vif); if (!vif_dev) - goto out_free; + return -1; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { @@ -2055,7 +2055,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, 
DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); - goto out_free; + return -1; } #endif @@ -2069,7 +2069,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); - goto out_free; + return -1; } skb_dst_drop(skb); @@ -2093,20 +2093,43 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) - goto out_free; + return -1; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; + return 0; +} + +static void ip6mr_forward2(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + struct net_device *indev = skb->dev; + + if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; IP6CB(skb)->flags |= IP6SKB_FORWARDED; - return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, - net, NULL, skb, skb->dev, vif_dev, - ip6mr_forward2_finish); + NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, + net, NULL, skb, indev, skb->dev, + ip6mr_forward2_finish); + return; + +out_free: + kfree_skb(skb); +} + +static void ip6mr_output2(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + if (ip6mr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; + + ip6_output(net, NULL, skb); + return; out_free: kfree_skb(skb); - return 0; } /* Called with rcu_read_lock() */ @@ -2221,6 +2244,56 @@ dont_forward: kfree_skb(skb); } +/* Called under rcu_read_lock() */ +static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt, + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *c) +{ + int psend = -1; + int ct; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); + + /* Forward the frame */ + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_any(&c->mf6c_mcastgrp)) { + if (ipv6_hdr(skb)->hop_limit > + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. + */ + psend = c->_c.mfc_parent; + goto last_forward; + } + goto dont_forward; + } + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { + if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ip6mr_output2(net, mrt, skb2, psend); + } + psend = ct; + } + } +last_forward: + if (psend != -1) { + ip6mr_output2(net, mrt, skb, psend); + return; + } + +dont_forward: + kfree_skb(skb); +} /* * Multicast packets for forwarding arrive here @@ -2228,21 +2301,20 @@ dont_forward: int ip6_mr_input(struct sk_buff *skb) { + struct net_device *dev = skb->dev; + struct net *net = dev_net_rcu(dev); struct mfc6_cache *cache; - struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { - .flowi6_iif = skb->dev->ifindex, + .flowi6_iif = dev->ifindex, .flowi6_mark = skb->mark, }; int err; - struct net_device *dev; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. 
*/ - dev = skb->dev; - if (netif_is_l3_master(skb->dev)) { + if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); @@ -2288,6 +2360,61 @@ int ip6_mr_input(struct sk_buff *skb) return 0; } +int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct flowi6 fl6 = (struct flowi6) { + .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_mark = skb->mark, + }; + struct mfc6_cache *cache; + struct mr_table *mrt; + int err; + int vif; + + guard(rcu)(); + + if (IP6CB(skb)->flags & IP6SKB_FORWARDED) + goto ip6_output; + if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE)) + goto ip6_output; + + err = ip6mr_fib_lookup(net, &fl6, &mrt); + if (err < 0) { + kfree_skb(skb); + return err; + } + + cache = ip6mr_cache_find(mrt, + &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); + if (!cache) { + vif = ip6mr_find_vif(mrt, dev); + if (vif >= 0) + cache = ip6mr_cache_find_any(mrt, + &ipv6_hdr(skb)->daddr, + vif); + } + + /* No usable cache entry */ + if (!cache) { + vif = ip6mr_find_vif(mrt, dev); + if (vif >= 0) + return ip6mr_cache_unresolved(mrt, vif, skb, dev); + goto ip6_output; + } + + /* Wrong interface */ + vif = cache->_c.mfc_parent; + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) + goto ip6_output; + + ip6_mr_output_finish(net, mrt, dev, skb, cache); + return 0; + +ip6_output: + return ip6_output(net, sk, skb); +} + int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 72d4858dec18a..8607569de34f3 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -71,6 +71,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -79,6 +80,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPV6; t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 1e225e6489ea1..e66ec623972e0 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -117,26 +117,6 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, return opt; } -static bool setsockopt_needs_rtnl(int optname) -{ - switch (optname) { - case IPV6_ADDRFORM: - case IPV6_ADD_MEMBERSHIP: - case IPV6_DROP_MEMBERSHIP: - case IPV6_JOIN_ANYCAST: - case IPV6_LEAVE_ANYCAST: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: - return true; - } - return false; -} - static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { @@ -395,9 +375,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); - int val, valbool; int retv = -ENOPROTOOPT; - bool needs_rtnl = setsockopt_needs_rtnl(optname); + int val, valbool; if (sockptr_is_null(optval)) val = 0; @@ -562,8 +541,7 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, return 0; } } - if (needs_rtnl) - rtnl_lock(); + sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with 
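The do_ipv6_setsockopt() hunk above removes the per-option setsockopt_needs_rtnl() test and the conditional rtnl_lock()/rtnl_unlock() pair, so every option update is serialized on the socket lock alone. A minimal user-space sketch of that shape, with invented stand-ins for the socket and its lock (this is not the kernel code path):

/* sketch only: one per-object lock for all option updates, no global lock */
#include <pthread.h>
#include <stdio.h>

struct fake_sock {
	pthread_mutex_t lock;
	int hop_limit;
	int mcast_loop;
};

static void fake_sockopt_lock(struct fake_sock *sk)
{
	pthread_mutex_lock(&sk->lock);
}

static void fake_sockopt_release(struct fake_sock *sk)
{
	pthread_mutex_unlock(&sk->lock);
}

static int fake_do_setsockopt(struct fake_sock *sk, int optname, int val)
{
	int ret = 0;

	fake_sockopt_lock(sk);	/* same lock for every option */
	switch (optname) {
	case 1:
		sk->hop_limit = val;
		break;
	case 2:
		sk->mcast_loop = !!val;
		break;
	default:
		ret = -1;
	}
	fake_sockopt_release(sk);
	return ret;
}

int main(void)
{
	struct fake_sock sk = { .lock = PTHREAD_MUTEX_INITIALIZER };

	fake_do_setsockopt(&sk, 1, 64);
	printf("hop_limit=%d\n", sk.hop_limit);
	return 0;
}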
@@ -969,8 +947,6 @@ done: unlock: sockopt_release_sock(sk); - if (needs_rtnl) - rtnl_unlock(); return retv; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 65831b4fee1fd..36ca27496b3c0 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -108,9 +108,9 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF; int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; -/* - * socket join on multicast group - */ +#define mc_assert_locked(idev) \ + lockdep_assert_held(&(idev)->mc_lock) + #define mc_dereference(e, idev) \ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock)) @@ -169,17 +169,18 @@ static int unsolicited_report_interval(struct inet6_dev *idev) return iv > 0 ? iv : 1; } +/* + * socket join on multicast group + */ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode) { - struct net_device *dev = NULL; - struct ipv6_mc_socklist *mc_lst; struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst; struct net *net = sock_net(sk); + struct net_device *dev = NULL; int err; - ASSERT_RTNL(); - if (!ipv6_addr_is_multicast(addr)) return -EINVAL; @@ -199,13 +200,18 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, if (ifindex == 0) { struct rt6_info *rt; + + rcu_read_lock(); rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { - dev = rt->dst.dev; + dev = dst_dev(&rt->dst); + dev_hold(dev); ip6_rt_put(rt); } - } else - dev = __dev_get_by_index(net, ifindex); + rcu_read_unlock(); + } else { + dev = dev_get_by_index(net, ifindex); + } if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); @@ -216,12 +222,11 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, mc_lst->sfmode = mode; RCU_INIT_POINTER(mc_lst->sflist, NULL); - /* - * now add/increase the group membership on the device - */ - + /* now add/increase the group membership on the device */ err = __ipv6_dev_mc_inc(dev, addr, mode); + dev_put(dev); + if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return err; @@ -248,14 +253,36 @@ int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, /* * socket leave on multicast group */ +static void __ipv6_sock_mc_drop(struct sock *sk, struct ipv6_mc_socklist *mc_lst) +{ + struct net *net = sock_net(sk); + struct net_device *dev; + + dev = dev_get_by_index(net, mc_lst->ifindex); + if (dev) { + struct inet6_dev *idev = in6_dev_get(dev); + + ip6_mc_leave_src(sk, mc_lst, idev); + + if (idev) { + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + in6_dev_put(idev); + } + + dev_put(dev); + } else { + ip6_mc_leave_src(sk, mc_lst, NULL); + } + + atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); + kfree_rcu(mc_lst, rcu); +} + int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) { struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_mc_socklist *mc_lst; struct ipv6_mc_socklist __rcu **lnk; - struct net *net = sock_net(sk); - - ASSERT_RTNL(); + struct ipv6_mc_socklist *mc_lst; if (!ipv6_addr_is_multicast(addr)) return -EINVAL; @@ -265,23 +292,8 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { - struct net_device *dev; - *lnk = mc_lst->next; - - dev = __dev_get_by_index(net, mc_lst->ifindex); - if (dev) { - struct inet6_dev *idev = __in6_dev_get(dev); - - ip6_mc_leave_src(sk, mc_lst, idev); - if (idev) - __ipv6_dev_mc_dec(idev, &mc_lst->addr); - } else { - ip6_mc_leave_src(sk, 
mc_lst, NULL); - } - - atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); - kfree_rcu(mc_lst, rcu); + __ipv6_sock_mc_drop(sk, mc_lst); return 0; } } @@ -290,31 +302,33 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } EXPORT_SYMBOL(ipv6_sock_mc_drop); -static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net, - const struct in6_addr *group, - int ifindex) +static struct inet6_dev *ip6_mc_find_dev(struct net *net, + const struct in6_addr *group, + int ifindex) { struct net_device *dev = NULL; - struct inet6_dev *idev = NULL; + struct inet6_dev *idev; if (ifindex == 0) { - struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); + struct rt6_info *rt; + rcu_read_lock(); + rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { - dev = rt->dst.dev; + dev = dst_dev(&rt->dst); + dev_hold(dev); ip6_rt_put(rt); } + rcu_read_unlock(); } else { - dev = __dev_get_by_index(net, ifindex); + dev = dev_get_by_index(net, ifindex); } - if (!dev) return NULL; - idev = __in6_dev_get(dev); - if (!idev) - return NULL; - if (idev->dead) - return NULL; + + idev = in6_dev_get(dev); + dev_put(dev); + return idev; } @@ -322,28 +336,10 @@ void __ipv6_sock_mc_close(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_mc_socklist *mc_lst; - struct net *net = sock_net(sk); - - ASSERT_RTNL(); while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) { - struct net_device *dev; - np->ipv6_mc_list = mc_lst->next; - - dev = __dev_get_by_index(net, mc_lst->ifindex); - if (dev) { - struct inet6_dev *idev = __in6_dev_get(dev); - - ip6_mc_leave_src(sk, mc_lst, idev); - if (idev) - __ipv6_dev_mc_dec(idev, &mc_lst->addr); - } else { - ip6_mc_leave_src(sk, mc_lst, NULL); - } - - atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); - kfree_rcu(mc_lst, rcu); + __ipv6_sock_mc_drop(sk, mc_lst); } } @@ -354,24 +350,22 @@ void ipv6_sock_mc_close(struct sock *sk) if (!rcu_access_pointer(np->ipv6_mc_list)) return; - rtnl_lock(); lock_sock(sk); __ipv6_sock_mc_close(sk); release_sock(sk); - rtnl_unlock(); } int ip6_mc_source(int add, int omode, struct sock *sk, - struct group_source_req *pgsr) + struct group_source_req *pgsr) { + struct ipv6_pinfo *inet6 = inet6_sk(sk); struct in6_addr *source, *group; + struct net *net = sock_net(sk); struct ipv6_mc_socklist *pmc; - struct inet6_dev *idev; - struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; - struct net *net = sock_net(sk); - int i, j, rv; + struct inet6_dev *idev; int leavegroup = 0; + int i, j, rv; int err; source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; @@ -380,13 +374,19 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface); + idev = ip6_mc_find_dev(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; + mutex_lock(&idev->mc_lock); + + if (idev->dead) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; - mutex_lock(&idev->mc_lock); for_each_pmc_socklock(inet6, sk, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; @@ -483,6 +483,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, ip6_mc_add_src(idev, group, omode, 1, source, 1); done: mutex_unlock(&idev->mc_lock); + in6_dev_put(idev); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; @@ -491,12 +492,12 @@ done: int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list) { - const struct in6_addr 
*group; - struct ipv6_mc_socklist *pmc; - struct inet6_dev *idev; struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); + const struct in6_addr *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; int leavegroup = 0; int i, err; @@ -508,10 +509,17 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface); + idev = ip6_mc_find_dev(net, group, gsf->gf_interface); if (!idev) return -ENODEV; + mutex_lock(&idev->mc_lock); + + if (idev->dead) { + err = -ENODEV; + goto done; + } + err = 0; if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) { @@ -544,24 +552,19 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } - mutex_lock(&idev->mc_lock); + err = ip6_mc_add_src(idev, group, gsf->gf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { - mutex_unlock(&idev->mc_lock); sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } - mutex_unlock(&idev->mc_lock); } else { newpsl = NULL; - mutex_lock(&idev->mc_lock); ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); - mutex_unlock(&idev->mc_lock); } - mutex_lock(&idev->mc_lock); psl = sock_dereference(pmc->sflist, sk); if (psl) { ip6_mc_del_src(idev, group, pmc->sfmode, @@ -571,12 +574,14 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); } + rcu_assign_pointer(pmc->sflist, newpsl); - mutex_unlock(&idev->mc_lock); kfree_rcu(psl, rcu); pmc->sfmode = gsf->gf_fmode; err = 0; done: + mutex_unlock(&idev->mc_lock); + in6_dev_put(idev); if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; @@ -597,10 +602,6 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - /* changes to the ipv6_mc_list require the socket lock and - * rtnl lock. We have the socket lock, so reading the list is safe. - */ - for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; @@ -668,12 +669,13 @@ bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, return rv; } -/* called with mc_lock */ static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + mc_assert_locked(mc->idev); + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; @@ -703,12 +705,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) mld_ifc_event(mc->idev); } -/* called with mc_lock */ static void igmp6_group_dropped(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; + mc_assert_locked(mc->idev); + if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; @@ -729,14 +732,13 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) refcount_dec(&mc->mca_refcnt); } -/* - * deleted ifmcaddr6 manipulation - * called with mc_lock - */ +/* deleted ifmcaddr6 manipulation */ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ifmcaddr6 *pmc; + mc_assert_locked(idev); + /* this is an "ifmcaddr6" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. 
Using the same structure @@ -770,54 +772,54 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) rcu_assign_pointer(idev->mc_tomb, pmc); } -/* called with mc_lock */ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ip6_sf_list *psf, *sources, *tomb; struct in6_addr *pmca = &im->mca_addr; struct ifmcaddr6 *pmc, *pmc_prev; + mc_assert_locked(idev); + pmc_prev = NULL; for_each_mc_tomb(idev, pmc) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; } - if (pmc) { - if (pmc_prev) - rcu_assign_pointer(pmc_prev->next, pmc->next); - else - rcu_assign_pointer(idev->mc_tomb, pmc->next); - } - - if (pmc) { - im->idev = pmc->idev; - if (im->mca_sfmode == MCAST_INCLUDE) { - tomb = rcu_replace_pointer(im->mca_tomb, - mc_dereference(pmc->mca_tomb, pmc->idev), - lockdep_is_held(&im->idev->mc_lock)); - rcu_assign_pointer(pmc->mca_tomb, tomb); - - sources = rcu_replace_pointer(im->mca_sources, - mc_dereference(pmc->mca_sources, pmc->idev), - lockdep_is_held(&im->idev->mc_lock)); - rcu_assign_pointer(pmc->mca_sources, sources); - for_each_psf_mclock(im, psf) - psf->sf_crcount = idev->mc_qrv; - } else { - im->mca_crcount = idev->mc_qrv; - } - in6_dev_put(pmc->idev); - ip6_mc_clear_src(pmc); - kfree_rcu(pmc, rcu); + if (!pmc) + return; + if (pmc_prev) + rcu_assign_pointer(pmc_prev->next, pmc->next); + else + rcu_assign_pointer(idev->mc_tomb, pmc->next); + + im->idev = pmc->idev; + if (im->mca_sfmode == MCAST_INCLUDE) { + tomb = rcu_replace_pointer(im->mca_tomb, + mc_dereference(pmc->mca_tomb, pmc->idev), + lockdep_is_held(&im->idev->mc_lock)); + rcu_assign_pointer(pmc->mca_tomb, tomb); + + sources = rcu_replace_pointer(im->mca_sources, + mc_dereference(pmc->mca_sources, pmc->idev), + lockdep_is_held(&im->idev->mc_lock)); + rcu_assign_pointer(pmc->mca_sources, sources); + for_each_psf_mclock(im, psf) + psf->sf_crcount = idev->mc_qrv; + } else { + im->mca_crcount = idev->mc_qrv; } + ip6_mc_clear_src(pmc); + in6_dev_put(pmc->idev); + kfree_rcu(pmc, rcu); } -/* called with mc_lock */ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; + mc_assert_locked(idev); + pmc = mc_dereference(idev->mc_tomb, idev); RCU_INIT_POINTER(idev->mc_tomb, NULL); @@ -843,29 +845,18 @@ static void mld_clear_delrec(struct inet6_dev *idev) static void mld_clear_query(struct inet6_dev *idev) { - struct sk_buff *skb; - spin_lock_bh(&idev->mc_query_lock); - while ((skb = __skb_dequeue(&idev->mc_query_queue))) - kfree_skb(skb); + __skb_queue_purge(&idev->mc_query_queue); spin_unlock_bh(&idev->mc_query_lock); } static void mld_clear_report(struct inet6_dev *idev) { - struct sk_buff *skb; - spin_lock_bh(&idev->mc_report_lock); - while ((skb = __skb_dequeue(&idev->mc_report_queue))) - kfree_skb(skb); + __skb_queue_purge(&idev->mc_report_queue); spin_unlock_bh(&idev->mc_report_lock); } -static void mca_get(struct ifmcaddr6 *mc) -{ - refcount_inc(&mc->mca_refcnt); -} - static void ma_put(struct ifmcaddr6 *mc) { if (refcount_dec_and_test(&mc->mca_refcnt)) { @@ -874,13 +865,14 @@ static void ma_put(struct ifmcaddr6 *mc) } } -/* called with mc_lock */ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; + mc_assert_locked(idev); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; @@ -945,23 +937,22 @@ error: static int __ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr, unsigned int mode) { - struct ifmcaddr6 *mc; struct inet6_dev *idev; 
- - ASSERT_RTNL(); + struct ifmcaddr6 *mc; /* we need to take a reference on idev */ idev = in6_dev_get(dev); - if (!idev) return -EINVAL; - if (idev->dead) { + mutex_lock(&idev->mc_lock); + + if (READ_ONCE(idev->dead)) { + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENODEV; } - mutex_lock(&idev->mc_lock); for_each_mc_mclock(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; @@ -982,13 +973,11 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, rcu_assign_pointer(mc->next, idev->mc_list); rcu_assign_pointer(idev->mc_list, mc); - mca_get(mc); - mld_del_delrec(idev, mc); igmp6_group_added(mc); inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); - ma_put(mc); + return 0; } @@ -1005,9 +994,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifmcaddr6 *ma, __rcu **map; - ASSERT_RTNL(); - mutex_lock(&idev->mc_lock); + for (map = &idev->mc_list; (ma = mc_dereference(*map, idev)); map = &ma->next) { @@ -1038,13 +1026,12 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) struct inet6_dev *idev; int err; - ASSERT_RTNL(); - - idev = __in6_dev_get(dev); + idev = in6_dev_get(dev); if (!idev) - err = -ENODEV; - else - err = __ipv6_dev_mc_dec(idev, addr); + return -ENODEV; + + err = __ipv6_dev_mc_dec(idev, addr); + in6_dev_put(idev); return err; } @@ -1091,46 +1078,51 @@ unlock: return rv; } -/* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = get_random_u32_below(idev->mc_maxdelay); + mc_assert_locked(idev); + idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) in6_dev_hold(idev); } -/* called with mc_lock */ static void mld_gq_stop_work(struct inet6_dev *idev) { + mc_assert_locked(idev); + idev->mc_gq_running = 0; if (cancel_delayed_work(&idev->mc_gq_work)) __in6_dev_put(idev); } -/* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); + mc_assert_locked(idev); + if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); } -/* called with mc_lock */ static void mld_ifc_stop_work(struct inet6_dev *idev) { + mc_assert_locked(idev); + idev->mc_ifc_count = 0; if (cancel_delayed_work(&idev->mc_ifc_work)) __in6_dev_put(idev); } -/* called with mc_lock */ static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = get_random_u32_below(delay); + mc_assert_locked(idev); + if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); } @@ -1155,14 +1147,13 @@ static void mld_report_stop_work(struct inet6_dev *idev) __in6_dev_put(idev); } -/* - * IGMP handling (alias multicast ICMPv6 messages) - * called with mc_lock - */ +/* IGMP handling (alias multicast ICMPv6 messages) */ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; + mc_assert_locked(ma->idev); + /* Do not start work for these addresses */ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) @@ -1181,15 +1172,15 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) ma->mca_flags |= MAF_TIMER_RUNNING; } -/* mark EXCLUDE-mode sources - * called with mc_lock - */ +/* mark EXCLUDE-mode sources */ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; + 
mc_assert_locked(pmc->idev); + scount = 0; for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) @@ -1212,13 +1203,14 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, return true; } -/* called with mc_lock */ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { struct ip6_sf_list *psf; int i, scount; + mc_assert_locked(pmc->idev); + if (pmc->mca_sfmode == MCAST_EXCLUDE) return mld_xmarksources(pmc, nsrcs, srcs); @@ -1913,7 +1905,6 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) -/* called with mc_lock */ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) @@ -1927,6 +1918,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, struct mld2_report *pmr; unsigned int mtu; + mc_assert_locked(idev); + if (pmc->mca_flags & MAF_NOREPORT) return skb; @@ -2045,12 +2038,13 @@ empty_source: return skb; } -/* called with mc_lock */ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) { struct sk_buff *skb = NULL; int type; + mc_assert_locked(idev); + if (!pmc) { for_each_mc_mclock(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) @@ -2072,10 +2066,7 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) mld_sendpack(skb); } -/* - * remove zero-count source records from a source filter list - * called with mc_lock - */ +/* remove zero-count source records from a source filter list */ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev) { struct ip6_sf_list *psf_prev, *psf_next, *psf; @@ -2099,7 +2090,6 @@ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *i } } -/* called with mc_lock */ static void mld_send_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; @@ -2263,13 +2253,14 @@ err_out: goto out; } -/* called with mc_lock */ static void mld_send_initial_cr(struct inet6_dev *idev) { - struct sk_buff *skb; struct ifmcaddr6 *pmc; + struct sk_buff *skb; int type; + mc_assert_locked(idev); + if (mld_in_v1_mode(idev)) return; @@ -2316,13 +2307,14 @@ static void mld_dad_work(struct work_struct *work) in6_dev_put(idev); } -/* called with mc_lock */ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, - const struct in6_addr *psfsrc) + const struct in6_addr *psfsrc) { struct ip6_sf_list *psf, *psf_prev; int rv = 0; + mc_assert_locked(pmc->idev); + psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) @@ -2359,7 +2351,6 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, return rv; } -/* called with mc_lock */ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) @@ -2371,6 +2362,8 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; + mc_assert_locked(idev); + for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; @@ -2412,15 +2405,14 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, return err; } -/* - * Add multicast single-source filter to the interface list - * called with mc_lock - */ +/* Add multicast single-source filter to the interface list */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, - const struct in6_addr *psfsrc) + const struct in6_addr *psfsrc) { 
struct ip6_sf_list *psf, *psf_prev; + mc_assert_locked(pmc->idev); + psf_prev = NULL; for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) @@ -2443,11 +2435,12 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, return 0; } -/* called with mc_lock */ static void sf_markstate(struct ifmcaddr6 *pmc) { - struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + struct ip6_sf_list *psf; + + mc_assert_locked(pmc->idev); for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { @@ -2460,14 +2453,15 @@ static void sf_markstate(struct ifmcaddr6 *pmc) } } -/* called with mc_lock */ static int sf_setstate(struct ifmcaddr6 *pmc) { - struct ip6_sf_list *psf, *dpsf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + struct ip6_sf_list *psf, *dpsf; int qrv = pmc->idev->mc_qrv; int new_in, rv; + mc_assert_locked(pmc->idev); + rv = 0; for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { @@ -2526,10 +2520,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) return rv; } -/* - * Add multicast source filter list to the interface list - * called with mc_lock - */ +/* Add multicast source filter list to the interface list */ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) @@ -2541,6 +2532,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; + mc_assert_locked(idev); + for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; @@ -2588,11 +2581,12 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, return err; } -/* called with mc_lock */ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; + mc_assert_locked(pmc->idev); + for (psf = mc_dereference(pmc->mca_tomb, pmc->idev); psf; psf = nextpsf) { @@ -2613,11 +2607,12 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } -/* called with mc_lock */ static void igmp6_join_group(struct ifmcaddr6 *ma) { unsigned long delay; + mc_assert_locked(ma->idev); + if (ma->mca_flags & MAF_NOREPORT) return; @@ -2664,9 +2659,10 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, return err; } -/* called with mc_lock */ static void igmp6_leave_group(struct ifmcaddr6 *ma) { + mc_assert_locked(ma->idev); + if (mld_in_v1_mode(ma->idev)) { if (ma->mca_flags & MAF_LAST_REPORTER) { igmp6_send(&ma->mca_addr, ma->idev->dev, @@ -2711,9 +2707,10 @@ static void mld_ifc_work(struct work_struct *work) in6_dev_put(idev); } -/* called with mc_lock */ static void mld_ifc_event(struct inet6_dev *idev) { + mc_assert_locked(idev); + if (mld_in_v1_mode(idev)) return; @@ -2868,8 +2865,6 @@ static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) { struct ifmcaddr6 *pmc; - ASSERT_RTNL(); - mutex_lock(&idev->mc_lock); if (mld_in_v1_mode(idev)) { for_each_mc_mclock(idev, pmc) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ecb5c4b8518fd..7d5abb3158ec9 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -243,9 +243,8 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { - ND_PRINTK(2, warn, - "%s: duplicated ND6 option found: type=%d\n", - __func__, nd_opt->nd_opt_type); + net_dbg_ratelimited("%s: duplicated ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); } else { 
ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } @@ -275,11 +274,8 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, * to accommodate future extension to the * protocol. */ - ND_PRINTK(2, notice, - "%s: ignored unsupported option; type=%d, len=%d\n", - __func__, - nd_opt->nd_opt_type, - nd_opt->nd_opt_len); + net_dbg_ratelimited("%s: ignored unsupported option; type=%d, len=%d\n", + __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } next_opt: opt_len -= l; @@ -377,24 +373,25 @@ static int ndisc_constructor(struct neighbour *neigh) static int pndisc_constructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; - struct in6_addr maddr; struct net_device *dev = n->dev; + struct in6_addr maddr; - if (!dev || !__in6_dev_get(dev)) + if (!dev) return -EINVAL; + addrconf_addr_solict_mult(addr, &maddr); - ipv6_dev_mc_inc(dev, &maddr); - return 0; + return ipv6_dev_mc_inc(dev, &maddr); } static void pndisc_destructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; - struct in6_addr maddr; struct net_device *dev = n->dev; + struct in6_addr maddr; - if (!dev || !__in6_dev_get(dev)) + if (!dev) return; + addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); } @@ -473,6 +470,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, { struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); + struct net_device *dev; struct inet6_dev *idev; struct net *net; struct sock *sk; @@ -507,11 +505,12 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - idev = __in6_dev_get(dst->dev); + dev = dst_dev(dst); + idev = __in6_dev_get(dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, sk, skb, NULL, dst->dev, + net, sk, skb, NULL, dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); @@ -751,9 +750,8 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) { - ND_PRINTK(1, dbg, - "%s: trying to ucast probe in NUD_INVALID: %pI6\n", - __func__, target); + net_dbg_ratelimited("%s: trying to ucast probe in NUD_INVALID: %pI6\n", + __func__, target); } ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { @@ -770,11 +768,9 @@ static int pndisc_is_router(const void *pkey, struct pneigh_entry *n; int ret = -1; - read_lock_bh(&nd_tbl.lock); - n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) - ret = !!(n->flags & NTF_ROUTER); - read_unlock_bh(&nd_tbl.lock); + ret = !!(READ_ONCE(n->flags) & NTF_ROUTER); return ret; } @@ -811,7 +807,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK(2, warn, "NS: multicast target address\n"); + net_dbg_ratelimited("NS: multicast target address\n"); return reason; } @@ -820,7 +816,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) * DAD has to be destined for solicited node multicast address. 
*/ if (dad && !ipv6_addr_is_solict_mult(daddr)) { - ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); + net_dbg_ratelimited("NS: bad DAD packet (wrong destination)\n"); return reason; } @@ -830,8 +826,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { - ND_PRINTK(2, warn, - "NS: invalid link-layer address length\n"); + net_dbg_ratelimited("NS: invalid link-layer address length\n"); return reason; } @@ -841,8 +836,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) * in the message. */ if (dad) { - ND_PRINTK(2, warn, - "NS: bad DAD packet (link-layer address option)\n"); + net_dbg_ratelimited("NS: bad DAD packet (link-layer address option)\n"); return reason; } } @@ -859,10 +853,8 @@ have_ifp: if (nonce != 0 && ifp->dad_nonce == nonce) { u8 *np = (u8 *)&nonce; /* Matching nonce if looped back */ - ND_PRINTK(2, notice, - "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", - ifp->idev->dev->name, - &ifp->addr, np); + net_dbg_ratelimited("%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", + ifp->idev->dev->name, &ifp->addr, np); goto out; } /* @@ -1013,13 +1005,13 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK(2, warn, "NA: target address is multicast\n"); + net_dbg_ratelimited("NA: target address is multicast\n"); return reason; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { - ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); + net_dbg_ratelimited("NA: solicited NA is multicasted\n"); return reason; } @@ -1038,8 +1030,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { - ND_PRINTK(2, warn, - "NA: invalid link-layer address length\n"); + net_dbg_ratelimited("NA: invalid link-layer address length\n"); return reason; } } @@ -1060,9 +1051,9 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) unsolicited advertisement. 
*/ if (skb->pkt_type != PACKET_LOOPBACK) - ND_PRINTK(1, warn, - "NA: %pM advertised our address %pI6c on %s!\n", - eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); + net_warn_ratelimited("NA: %pM advertised our address %pI6c on %s!\n", + eth_hdr(skb)->h_source, &ifp->addr, + ifp->idev->dev->name); in6_ifa_put(ifp); return reason; } @@ -1107,7 +1098,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && READ_ONCE(net->ipv6.devconf_all->forwarding) && READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + pneigh_lookup(&nd_tbl, net, &msg->target, dev)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } @@ -1149,7 +1140,7 @@ static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) idev = __in6_dev_get(skb->dev); if (!idev) { - ND_PRINTK(1, err, "RS: can't find in6 device\n"); + net_err_ratelimited("RS: can't find in6 device\n"); return reason; } @@ -1257,11 +1248,9 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); - ND_PRINTK(2, info, - "RA: %s, dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, dev: %s\n", __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, "RA: source address is not link-local\n"); + net_dbg_ratelimited("RA: source address is not link-local\n"); return reason; } if (optlen < 0) @@ -1269,15 +1258,14 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { - ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); + net_dbg_ratelimited("RA: from host or unauthorized router\n"); return reason; } #endif in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { - ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", - skb->dev->name); + net_err_ratelimited("RA: can't find inet6 device for %s\n", skb->dev->name); return reason; } @@ -1285,18 +1273,16 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ipv6_accept_ra(in6_dev)) { - ND_PRINTK(2, info, - "RA: %s, did not accept ra for dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, did not accept ra for dev: %s\n", __func__, + skb->dev->name); goto skip_linkparms; } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { - ND_PRINTK(2, info, - "RA: %s, nodetype is NODEFAULT, dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__, + skb->dev->name); goto skip_linkparms; } #endif @@ -1325,18 +1311,16 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) send_ifinfo_notify = true; if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) { - ND_PRINTK(2, info, - "RA: %s, defrtr is false for dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, defrtr is false for dev: %s\n", __func__, + skb->dev->name); goto skip_defrtr; } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); if (lifetime != 0 && lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) { - ND_PRINTK(2, info, - "RA: router lifetime (%ds) is too short: %s\n", - lifetime, skb->dev->name); + net_dbg_ratelimited("RA: router lifetime (%ds) is too short: %s\n", lifetime, + 
skb->dev->name); goto skip_defrtr; } @@ -1346,9 +1330,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) net = dev_net(in6_dev->dev); if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { - ND_PRINTK(2, info, - "RA from local address detected on dev: %s: default router ignored\n", - skb->dev->name); + net_dbg_ratelimited("RA from local address detected on dev: %s: default router ignored\n", + skb->dev->name); goto skip_defrtr; } @@ -1366,9 +1349,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { - ND_PRINTK(0, err, - "RA: %s got default router without neighbour\n", - __func__); + net_err_ratelimited("RA: %s got default router without neighbour\n", + __func__); fib6_info_release(rt); return reason; } @@ -1381,10 +1363,10 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) rt = NULL; } - ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", - rt, lifetime, defrtr_usr_metric, skb->dev->name); + net_dbg_ratelimited("RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime, + defrtr_usr_metric, skb->dev->name); if (!rt && lifetime) { - ND_PRINTK(3, info, "RA: adding default router\n"); + net_dbg_ratelimited("RA: adding default router\n"); if (neigh) neigh_release(neigh); @@ -1393,9 +1375,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) skb->dev, pref, defrtr_usr_metric, lifetime); if (!rt) { - ND_PRINTK(0, err, - "RA: %s failed to add default route\n", - __func__); + net_err_ratelimited("RA: %s failed to add default route\n", __func__); return reason; } @@ -1403,9 +1383,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { - ND_PRINTK(0, err, - "RA: %s got default router without neighbour\n", - __func__); + net_err_ratelimited("RA: %s got default router without neighbour\n", + __func__); fib6_info_release(rt); return reason; } @@ -1436,7 +1415,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) fib6_metric_set(rt, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); } else { - ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); + net_dbg_ratelimited("RA: Got route advertisement with lower hop_limit than minimum\n"); } } @@ -1492,8 +1471,7 @@ skip_linkparms: lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { - ND_PRINTK(2, warn, - "RA: invalid link-layer address length\n"); + net_dbg_ratelimited("RA: invalid link-layer address length\n"); goto out; } } @@ -1507,9 +1485,8 @@ skip_linkparms: } if (!ipv6_accept_ra(in6_dev)) { - ND_PRINTK(2, info, - "RA: %s, accept_ra is false for dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, accept_ra is false for dev: %s\n", __func__, + skb->dev->name); goto out; } @@ -1517,9 +1494,8 @@ skip_linkparms: if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { - ND_PRINTK(2, info, - "RA from local address detected on dev: %s: router info ignored.\n", - skb->dev->name); + net_dbg_ratelimited("RA from local address detected on dev: %s: router info ignored.\n", + skb->dev->name); goto skip_routeinfo; } @@ -1555,9 +1531,8 @@ skip_routeinfo: #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ if 
(skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { - ND_PRINTK(2, info, - "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", + __func__, skb->dev->name); goto out; } #endif @@ -1586,7 +1561,7 @@ skip_routeinfo: } if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { - ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); + net_dbg_ratelimited("RA: invalid mtu: %d\n", mtu); } else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) { WRITE_ONCE(in6_dev->cnf.mtu6, mtu); fib6_metric_set(rt, RTAX_MTU, mtu); @@ -1605,7 +1580,7 @@ skip_routeinfo: } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { - ND_PRINTK(2, warn, "RA: invalid RA options\n"); + net_dbg_ratelimited("RA: invalid RA options\n"); } out: /* Send a notify if RA changed managed/otherconf flags or @@ -1633,15 +1608,13 @@ static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: case NDISC_NODETYPE_NODEFAULT: - ND_PRINTK(2, warn, - "Redirect: from host or unauthorized router\n"); + net_dbg_ratelimited("Redirect: from host or unauthorized router\n"); return reason; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, - "Redirect: source address is not link-local\n"); + net_dbg_ratelimited("Redirect: source address is not link-local\n"); return reason; } @@ -1702,15 +1675,13 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) } if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { - ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", - dev->name); + net_dbg_ratelimited("Redirect: no link-local address on %s\n", dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, - "Redirect: target address is not link-local unicast\n"); + net_dbg_ratelimited("Redirect: target address is not link-local unicast\n"); return; } @@ -1729,8 +1700,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) { - ND_PRINTK(2, warn, - "Redirect: destination is not a neighbour\n"); + net_dbg_ratelimited("Redirect: destination is not a neighbour\n"); goto release; } @@ -1743,8 +1713,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) if (dev->addr_len) { struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); if (!neigh) { - ND_PRINTK(2, warn, - "Redirect: no neigh for target address\n"); + net_dbg_ratelimited("Redirect: no neigh for target address\n"); goto release; } @@ -1845,14 +1814,12 @@ enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { - ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", - ipv6_hdr(skb)->hop_limit); + net_dbg_ratelimited("NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit); return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { - ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", - msg->icmph.icmp6_code); + net_dbg_ratelimited("NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } @@ -2003,9 +1970,8 @@ static int __net_init ndisc_net_init(struct net *net) err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - ND_PRINTK(0, err, - 
"NDISC: Failed to initialize the control socket (err %d)\n", - err); + net_err_ratelimited("NDISC: Failed to initialize the control socket (err %d)\n", + err); return err; } diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 581ce055bf520..45f9105f9ac1e 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -24,7 +24,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff { const struct ipv6hdr *iph = ipv6_hdr(skb); struct sock *sk = sk_to_full_sk(sk_partial); - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst_dev(skb); struct flow_keys flkeys; unsigned int hh_len; struct dst_entry *dst; @@ -72,7 +72,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff #endif /* Change in oif may mean change in hh_len. */ - hh_len = skb_dst(skb)->dev->hard_header_len; + hh_len = skb_dst_dev(skb)->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) @@ -164,20 +164,20 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct ip6_fraglist_iter iter; struct sk_buff *frag2; - if (first_len - hlen > mtu || - skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + if (first_len - hlen > mtu) goto blackhole; - if (skb_cloned(skb)) + if (skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag2) { - if (frag2->len > mtu || - skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + if (frag2->len > mtu) goto blackhole; /* Partially cloned skb? */ - if (skb_shared(frag2)) + if (skb_shared(frag2) || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path; } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index e087a8e97ba78..276860f65baae 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -9,9 +9,8 @@ menu "IPv6: Netfilter Configuration" # old sockopt interface and eval loop config IP6_NF_IPTABLES_LEGACY tristate "Legacy IP6 tables support" - depends on INET && IPV6 - select NETFILTER_XTABLES - default n + depends on INET && IPV6 && NETFILTER_XTABLES_LEGACY + default m if NETFILTER_XTABLES_LEGACY help ip6tables is a legacy packet classifier. This is not needed if you are using iptables over nftables @@ -196,8 +195,8 @@ config IP6_NF_TARGET_HL config IP6_NF_FILTER tristate "Packet filtering" - default m if NETFILTER_ADVANCED=n - select IP6_NF_IPTABLES_LEGACY + default m if NETFILTER_ADVANCED=n || IP6_NF_IPTABLES_LEGACY + depends on IP6_NF_IPTABLES_LEGACY tristate help Packet filtering defines a table `filter', which has a series of @@ -233,8 +232,8 @@ config IP6_NF_TARGET_SYNPROXY config IP6_NF_MANGLE tristate "Packet mangling" - default m if NETFILTER_ADVANCED=n - select IP6_NF_IPTABLES_LEGACY + default m if NETFILTER_ADVANCED=n || IP6_NF_IPTABLES_LEGACY + depends on IP6_NF_IPTABLES_LEGACY help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -244,7 +243,7 @@ config IP6_NF_MANGLE config IP6_NF_RAW tristate 'raw table support (required for TRACE)' - select IP6_NF_IPTABLES_LEGACY + depends on IP6_NF_IPTABLES_LEGACY help This option adds a `raw' table to ip6tables. 
This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -258,7 +257,7 @@ config IP6_NF_SECURITY tristate "Security table" depends on SECURITY depends on NETFILTER_ADVANCED - select IP6_NF_IPTABLES_LEGACY + depends on IP6_NF_IPTABLES_LEGACY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. @@ -269,8 +268,8 @@ config IP6_NF_NAT tristate "ip6tables NAT support" depends on NF_CONNTRACK depends on NETFILTER_ADVANCED + depends on IP6_NF_IPTABLES_LEGACY select NF_NAT - select IP6_NF_IPTABLES_LEGACY select NETFILTER_XT_NAT help This enables the `nat' table in ip6tables. This allows masquerading, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d5602950ae72..d585ac3c11133 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d6bd8f7079bb7..64ab23ff559bb 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -133,7 +133,7 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static void nf_ct_frag6_expire(struct timer_list *t) { - struct inet_frag_queue *frag = from_timer(frag, t, timer); + struct inet_frag_queue *frag = timer_container_of(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 0c39c77fe8a8a..6da3102b7c1b3 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -38,7 +38,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, } skb_dst_drop(skb); skb_dst_set(skb, dst); - skb->dev = dst->dev; + skb->dev = dst_dev(dst); skb->protocol = htons(ETH_P_IPV6); return true; @@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->hop_limit; } if (nf_dup_ipv6_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip6_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 9ae2b2725bf99..838295fa32e36 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -300,7 +300,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, skb_dst_set(oldskb, dst); } - fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); + fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst_dev(oldskb)); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6)); dst = ip6_route_output(net, NULL, &fl6); diff --git 
a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 7fd9d7b21cd42..421036a3605b4 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -50,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, fl6->flowi6_mark = pkt->skb->mark; fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; + fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); return lookup_flags; } @@ -73,8 +74,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); - nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) @@ -158,6 +157,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, { const struct nft_fib *priv = nft_expr_priv(expr); int noff = skb_network_offset(pkt->skb); + const struct net_device *found = NULL; const struct net_device *oif = NULL; u32 *dest = &regs->data[priv->dreg]; struct ipv6hdr *iph, _iph; @@ -165,7 +165,6 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; struct rt6_info *rt; int lookup_flags; @@ -203,11 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev && - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) - goto put_rt_err; + if (!oif) { + found = rt->rt6i_idev->dev; + } else { + if (oif == rt->rt6i_idev->dev || + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) + found = oif; + } - nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); + nft_fib_store_result(dest, priv, found); put_rt_err: ip6_rt_put(rt); } diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 806d4b5dd1e60..d21fe27fe21e3 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -105,7 +105,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); if (hoplimit == 0) { - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); struct inet6_dev *idev; rcu_read_lock(); @@ -141,7 +141,7 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 84d90dd8b3f0f..82b0492923d45 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -142,7 +142,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.saddr = np->saddr; fl6.daddr = *daddr; fl6.flowi6_mark = ipc6.sockc.mark; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index fda640ebd53f8..4c3f8245c40f1 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -777,7 +777,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = ipc6.sockc.mark; - fl6.flowi6_uid =
sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 49740898bc137..25ec8001898df 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -73,7 +73,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, static void ip6_frag_expire(struct timer_list *t) { - struct inet_frag_queue *frag = from_timer(frag, t, timer); + struct inet_frag_queue *frag = timer_container_of(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); @@ -104,11 +104,11 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) return container_of(q, struct frag_queue, q); } -static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, +static int ip6_frag_queue(struct net *net, + struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, u32 *prob_offset, int *refs) { - struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; struct sk_buff *prev_tail; struct net_device *dev; @@ -324,10 +324,10 @@ out_fail: static int ipv6_frag_rcv(struct sk_buff *skb) { + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct net *net = skb_dst_dev_net(skb); struct frag_hdr *fhdr; struct frag_queue *fq; - const struct ipv6hdr *hdr = ipv6_hdr(skb); - struct net *net = dev_net(skb_dst(skb)->dev); u8 nexthdr; int iif; @@ -384,7 +384,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) spin_lock(&fq->q.lock); fq->iif = iif; - ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, + ret = ip6_frag_queue(net, fq, skb, fhdr, IP6CB(skb)->nhoff, &prob_offset, &refs); spin_unlock(&fq->q.lock); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 96f1621e2381c..3299cfa12e21c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -228,13 +228,13 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), - dst->dev, skb, daddr); + dst_dev(dst), skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) @@ -391,9 +391,8 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) static bool __rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) - return time_after(jiffies, rt->dst.expires); - else - return false; + return time_after(jiffies, READ_ONCE(rt->dst.expires)); + return false; } static bool rt6_check_expired(const struct rt6_info *rt) @@ -403,10 +402,10 @@ static bool rt6_check_expired(const struct rt6_info *rt) from = rcu_dereference(rt->from); if (rt->rt6i_flags & RTF_EXPIRES) { - if (time_after(jiffies, rt->dst.expires)) + if (time_after(jiffies, READ_ONCE(rt->dst.expires))) return true; } else if (from) { - return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || + return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK || fib6_check_expired(from); } return false; @@ -1145,6 +1144,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) rt->dst.input = ip6_input; } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; + rt->dst.output = ip6_mr_output; } else { rt->dst.input = ip6_forward; } @@ -1820,11 +1820,13 @@ static int 
rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) void rt6_flush_exceptions(struct fib6_info *f6i) { - if (f6i->nh) - nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, - f6i); - else + if (f6i->nh) { + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); + rcu_read_unlock(); + } else { fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); + } } /* Find cached rt in the hash table inside passed in rt @@ -2131,12 +2133,13 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, * expired, independently from their aging, as per RFC 8201 section 4 */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) + + gc_args->timeout)) { pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } - } else if (time_after(jiffies, rt->dst.expires)) { + } else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) { pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; @@ -2492,8 +2495,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; @@ -2547,7 +2554,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.ports.src = fl6->fl6_sport; + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } @@ -2767,11 +2777,10 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && - rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && fib6_check(from, cookie)) return &rt->dst; - else - return NULL; + return NULL; } INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, @@ -2861,7 +2870,7 @@ static void rt6_update_expires(struct rt6_info *rt0, int timeout) rcu_read_lock(); from = rcu_dereference(rt0->from); if (from) - rt0->dst.expires = from->expires; + WRITE_ONCE(rt0->dst.expires, from->expires); rcu_read_unlock(); } @@ -2934,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst->dev, + .dev = dst_dev(dst), .gw = &rt6->rt6i_gateway, }; @@ -3001,10 +3010,10 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) oif = l3mdev_master_ifindex(skb->dev); ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), - sk->sk_uid); + sk_uid(sk)); dst = __sk_dst_get(sk); - if (!dst || !dst->obsolete || + if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) return; @@ -3223,13 +3232,13 @@ void 
ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, - READ_ONCE(sk->sk_mark), sk->sk_uid); + READ_ONCE(sk->sk_mark), sk_uid(sk)); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); unsigned int mtu = dst_mtu(dst); struct net *net; @@ -3728,62 +3737,62 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) } } -static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, - gfp_t gfp_flags, - struct netlink_ext_ack *extack) +static int fib6_config_validate(struct fib6_config *cfg, + struct netlink_ext_ack *extack) { - struct net *net = cfg->fc_nlinfo.nl_net; - struct fib6_info *rt = NULL; - struct nexthop *nh = NULL; - struct fib6_table *table; - struct fib6_nh *fib6_nh; - int err = -EINVAL; - int addr_type; - /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); - goto out; + goto errout; } /* RTF_CACHE is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_CACHE) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); - goto out; + goto errout; } if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); - goto out; + goto errout; } if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); - goto out; + goto errout; } + +#ifdef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len > 128) { NL_SET_ERR_MSG(extack, "Invalid source address length"); - goto out; + goto errout; + } + + if (cfg->fc_nh_id && cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto errout; } -#ifndef CONFIG_IPV6_SUBTREES +#else if (cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Specifying source address requires IPV6_SUBTREES to be enabled"); - goto out; + goto errout; } #endif - if (cfg->fc_nh_id) { - nh = nexthop_find_by_id(net, cfg->fc_nh_id); - if (!nh) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - goto out; - } - err = fib6_check_nexthop(nh, cfg, extack); - if (err) - goto out; - } + return 0; +errout: + return -EINVAL; +} + +static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, + gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + struct fib6_table *table; + struct fib6_info *rt; + int err; - err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); @@ -3794,22 +3803,22 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, } else { table = fib6_new_table(net, cfg->fc_table); } + if (!table) { + err = -ENOBUFS; + goto err; + } - if (!table) - goto out; - - err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags, !nh); - if (!rt) - goto out; + rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); + if (!rt) { + err = -ENOMEM; + goto err; + } rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); - /* Do not leave garbage there. 
*/ - rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - goto out_free; + goto free; } if (cfg->fc_flags & RTF_ADDRCONF) @@ -3817,12 +3826,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + - clock_t_to_jiffies(cfg->fc_expires)); + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; - rt->fib6_protocol = cfg->fc_protocol; + rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; @@ -3835,23 +3844,54 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - if (nh) { - if (rt->fib6_src.plen) { - NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + return rt; +free: + kfree(rt); +err: + return ERR_PTR(err); +} + +static int ip6_route_info_create_nh(struct fib6_info *rt, + struct fib6_config *cfg, + gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + struct fib6_nh *fib6_nh; + int err; + + if (cfg->fc_nh_id) { + struct nexthop *nh; + + rcu_read_lock(); + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { err = -EINVAL; + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto out_free; } + + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out_free; + if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -ENOENT; goto out_free; } + rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); + + rcu_read_unlock(); } else { + int addr_type; + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) - goto out; + goto out_release; fib6_nh = rt->fib6_nh; @@ -3870,21 +3910,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; - goto out; + goto out_release; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; - } else - rt->fib6_prefsrc.plen = 0; + } - return rt; -out: + return 0; +out_release: fib6_info_release(rt); - return ERR_PTR(err); + return err; out_free: + rcu_read_unlock(); ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); - return ERR_PTR(err); + return err; } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, @@ -3893,10 +3933,18 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct fib6_info *rt; int err; + err = fib6_config_validate(cfg, extack); + if (err) + return err; + rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); + err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); + if (err) + return err; + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); @@ -4125,9 +4173,9 @@ static int ip6_route_del(struct fib6_config *cfg, if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); - return __ip6_del_rt(rt, &cfg->fc_nlinfo); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + break; } if (cfg->fc_nh_id) continue; @@ -4142,13 +4190,13 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) - return __ip6_del_rt(rt, &cfg->fc_nlinfo); - - return __ip6_del_rt_siblings(rt, cfg); + err = 
__ip6_del_rt(rt, &cfg->fc_nlinfo); + else + err = __ip6_del_rt_siblings(rt, cfg); + break; } } rcu_read_unlock(); @@ -4253,7 +4301,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst->dev, + .dev = dst_dev(dst), .gw = &rt->rt6i_gateway, }; @@ -4517,7 +4565,6 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) rtmsg_to_fib6_config(net, rtmsg, &cfg); - rtnl_lock(); switch (cmd) { case SIOCADDRT: /* Only do the default setting of fc_metric in route adding */ @@ -4529,7 +4576,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) err = ip6_route_del(&cfg, NULL); break; } - rtnl_unlock(); + return err; } @@ -4540,13 +4587,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct inet6_dev *idev; SKB_DR(reason); int type; if (netif_is_l3_master(skb->dev) || - dst->dev == net->loopback_dev) + dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else idev = ip6_dst_idev(dst); @@ -4583,7 +4631,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->dev = skb_dst(skb)->dev; + skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } @@ -4594,7 +4642,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->dev = skb_dst(skb)->dev; + skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } @@ -4619,6 +4667,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; + int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; @@ -4629,14 +4678,19 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); - if (!IS_ERR(f6i)) { - f6i->dst_nocount = true; + if (IS_ERR(f6i)) + return f6i; - if (!anycast && - (READ_ONCE(net->ipv6.devconf_all->disable_policy) || - READ_ONCE(idev->cnf.disable_policy))) - f6i->dst_nopolicy = true; - } + err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); + if (err) + return ERR_PTR(err); + + f6i->dst_nocount = true; + + if (!anycast && + (READ_ONCE(net->ipv6.devconf_all->disable_policy) || + READ_ONCE(idev->cnf.disable_policy))) + f6i->dst_nopolicy = true; return f6i; } @@ -5051,12 +5105,60 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; +static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, + struct netlink_ext_ack *extack, + bool newroute) +{ + struct rtnexthop *rtnh; + int remaining; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + if (!rtnh_ok(rtnh, remaining)) { + NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + + do { + bool has_gateway = cfg->fc_flags & RTF_GATEWAY; + int attrlen = rtnh_attrlen(rtnh); + + if (attrlen > 0) { + struct nlattr *nla, *attrs; + + attrs = rtnh_attrs(rtnh); + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + if (nla_len(nla) < 
sizeof(cfg->fc_gateway)) { + NL_SET_ERR_MSG(extack, + "Invalid IPv6 address in RTA_GATEWAY"); + return -EINVAL; + } + + has_gateway = true; + } + } + + if (newroute && (cfg->fc_nh_id || !has_gateway)) { + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + return -EINVAL; + } + + rtnh = rtnh_next(rtnh, &remaining); + } while (rtnh_ok(rtnh, remaining)); + + return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); +} + static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rtmsg *rtm; + bool newroute = nlh->nlmsg_type == RTM_NEWROUTE; struct nlattr *tb[RTA_MAX+1]; + struct rtmsg *rtm; unsigned int pref; int err; @@ -5165,9 +5267,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); - err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, - cfg->fc_mp_len, - extack, true); + err = rtm_to_fib6_multipath_config(cfg, extack, newroute); if (err < 0) goto errout; } @@ -5186,8 +5286,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack, true); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; } @@ -5209,29 +5308,28 @@ errout: struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; - struct list_head next; + struct list_head list; }; -static int ip6_route_info_append(struct net *net, - struct list_head *rt6_nh_list, +static int ip6_route_info_append(struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; - int err = -EEXIST; - list_for_each_entry(nh, rt6_nh_list, next) { + list_for_each_entry(nh, rt6_nh_list, list) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) - return err; + return -EEXIST; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; + nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); - list_add_tail(&nh->next, rt6_nh_list); + list_add_tail(&nh->list, rt6_nh_list); return 0; } @@ -5249,7 +5347,8 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, */ rcu_read_lock(); - if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { + if ((nlflags & NLM_F_APPEND) && rt_last && + READ_ONCE(rt_last->fib6_nsiblings)) { rt = list_first_or_null_rcu(&rt_last->fib6_siblings, struct fib6_info, fib6_siblings); @@ -5287,37 +5386,30 @@ out: return should_notify; } -static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, - struct netlink_ext_ack *extack) -{ - if (nla_len(nla) < sizeof(*gw)) { - NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); - return -EINVAL; - } - - *gw = nla_get_in6_addr(nla); - - return 0; -} - static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + struct rt6_nh *nh, *nh_safe; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct fib6_info *rt; + LIST_HEAD(rt6_nh_list); struct rt6_nh *err_nh; - struct rt6_nh *nh, *nh_safe; + struct fib6_info *rt; __u16 nlflags; int remaining; int attrlen; - int err = 1; + int replace; int nhn = 0; - int replace = (cfg->fc_nlinfo.nlh && - (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); - 
LIST_HEAD(rt6_nh_list); + int err; + + err = fib6_config_validate(cfg, extack); + if (err) + return err; + + replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) @@ -5340,18 +5432,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) - goto cleanup; - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } - r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); - /* RTA_ENCAP_TYPE length checked in - * lwtunnel_valid_encap_type_attr - */ + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); @@ -5364,18 +5449,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt = NULL; goto cleanup; } - if (!rt6_qualify_for_ecmp(rt)) { - err = -EINVAL; - NL_SET_ERR_MSG(extack, - "Device only routes can not be added for IPv6 using the multipath API."); - fib6_info_release(rt); + + err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack); + if (err) { + rt = NULL; goto cleanup; } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(info->nl_net, &rt6_nh_list, - rt, &r_cfg); + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; @@ -5384,12 +5467,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } - if (list_empty(&rt6_nh_list)) { - NL_SET_ERR_MSG(extack, - "Invalid nexthop configuration - no valid nexthops"); - return -EINVAL; - } - /* for add and replace send one notification with all nexthops. 
* Skip the notification in fib6_add_rt2node and send one with * the full route when done @@ -5402,7 +5479,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, info->skip_notify_kernel = 1; err_nh = NULL; - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { err = __ip6_ins_rt(nh->fib6_info, info, extack); if (err) { @@ -5470,16 +5547,16 @@ add_errout: ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: - list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); - list_del(&nh->next); + list_del(&nh->list); kfree(nh); } @@ -5511,21 +5588,15 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) { - last_err = err; - goto next_rtnh; - } - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } + err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; -next_rtnh: rtnh = rtnh_next(rtnh, &remaining); } @@ -5542,15 +5613,20 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (cfg.fc_nh_id && - !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - return -EINVAL; + if (cfg.fc_nh_id) { + rcu_read_lock(); + err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id); + rcu_read_unlock(); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } } - if (cfg.fc_mp) + if (cfg.fc_mp) { return ip6_route_multipath_del(&cfg, extack); - else { + } else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, extack); } @@ -5596,32 +5672,34 @@ static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) static size_t rt6_nlmsg_size(struct fib6_info *f6i) { + struct fib6_info *sibling; + struct fib6_nh *nh; int nexthop_len; if (f6i->nh) { nexthop_len = nla_total_size(4); /* RTA_NH_ID */ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); - } else { - struct fib6_nh *nh = f6i->fib6_nh; - struct fib6_info *sibling; - - nexthop_len = 0; - if (f6i->fib6_nsiblings) { - rt6_nh_nlmsg_size(nh, &nexthop_len); - - rcu_read_lock(); + goto common; + } - list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, - fib6_siblings) { - rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); - } + rcu_read_lock(); +retry: + nh = f6i->fib6_nh; + nexthop_len = 0; + if (READ_ONCE(f6i->fib6_nsiblings)) { + rt6_nh_nlmsg_size(nh, &nexthop_len); - rcu_read_unlock(); + list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, + fib6_siblings) { + rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); + if (!READ_ONCE(f6i->fib6_nsiblings)) + goto retry; } - nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } - + rcu_read_unlock(); + nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); +common: return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ + nla_total_size(16) /* RTA_DST */ @@ -5770,17 +5848,19 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, * each as a nexthop within RTA_MULTIPATH. 
*/ if (rt6) { + struct net_device *dev; + if (rt6_flags & RTF_GATEWAY && nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) goto nla_put_failure; - if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) + dev = dst_dev(dst); + if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; - if (dst->lwtstate && - lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) + if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; - } else if (rt->fib6_nsiblings) { + } else if (READ_ONCE(rt->fib6_nsiblings)) { struct fib6_info *sibling; struct nlattr *mp; @@ -5830,7 +5910,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, } if (rt6_flags & RTF_EXPIRES) { - expires = dst ? dst->expires : rt->expires; + expires = dst ? READ_ONCE(dst->expires) : rt->expires; expires -= jiffies; } @@ -5882,16 +5962,21 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, if (f6i->fib6_nh->fib_nh_dev == dev) return true; - if (f6i->fib6_nsiblings) { - struct fib6_info *sibling, *next_sibling; + if (READ_ONCE(f6i->fib6_nsiblings)) { + const struct fib6_info *sibling; - list_for_each_entry_safe(sibling, next_sibling, - &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh->fib_nh_dev == dev) + rcu_read_lock(); + list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, + fib6_siblings) { + if (sibling->fib6_nh->fib_nh_dev == dev) { + rcu_read_unlock(); return true; + } + if (!READ_ONCE(f6i->fib6_nsiblings)) + break; } + rcu_read_unlock(); } - return false; } @@ -6030,7 +6115,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -6040,7 +6126,6 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || @@ -6247,30 +6332,41 @@ errout: void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int nlm_flags) { - struct sk_buff *skb; struct net *net = info->nl_net; + struct sk_buff *skb; + size_t sz; u32 seq; int err; err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC); + rcu_read_lock(); + sz = rt6_nlmsg_size(rt); +retry: + skb = nlmsg_new(sz, GFP_ATOMIC); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, event, info->portid, seq, nlm_flags); if (err < 0) { - /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); kfree_skb(skb); + /* -EMSGSIZE implies needed space grew under us. 
*/ + if (err == -EMSGSIZE) { + sz = max(rt6_nlmsg_size(rt), sz << 1); + goto retry; + } goto errout; } + + rcu_read_unlock(); + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, GFP_ATOMIC); return; errout: + rcu_read_unlock(); rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } @@ -6725,8 +6821,7 @@ void __init ip6_route_init_special_entries(void) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) -BTF_ID_LIST(btf_fib6_info_id) -BTF_ID(struct, fib6_info) +BTF_ID_LIST_SINGLE(btf_fib6_info_id, struct, fib6_info) static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, @@ -6760,9 +6855,9 @@ static void bpf_iter_unregister(void) static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, - .doit = inet6_rtm_newroute}, + .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, - .doit = inet6_rtm_delroute}, + .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 7c05ac846646f..c7942cf655671 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -129,13 +129,13 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, struct dst_entry *cache_dst) { struct ipv6_rpl_sr_hdr *isrh, *csrh; - const struct ipv6hdr *oldhdr; + struct ipv6hdr oldhdr; struct ipv6hdr *hdr; unsigned char *buf; size_t hdrlen; int err; - oldhdr = ipv6_hdr(skb); + memcpy(&oldhdr, ipv6_hdr(skb), sizeof(oldhdr)); buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC); if (!buf) @@ -147,7 +147,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, memcpy(isrh, srh, sizeof(*isrh)); memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1], (srh->segments_left - 1) * 16); - isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr->daddr; + isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr.daddr; ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0], isrh->segments_left - 1); @@ -169,7 +169,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, skb_mac_header_rebuild(skb); hdr = ipv6_hdr(skb); - memmove(hdr, oldhdr, sizeof(*hdr)); + memmove(hdr, &oldhdr, sizeof(*hdr)); isrh = (void *)hdr + sizeof(*hdr); memcpy(isrh, csrh, hdrlen); @@ -242,7 +242,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } @@ -297,7 +297,7 @@ static int rpl_input(struct sk_buff *skb) local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } else { diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index bbf5b84a70fca..f78ecb6ad8383 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -40,7 +40,14 @@ #include <net/seg6_hmac.h> #include <linux/random.h> -static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); +struct hmac_storage { + local_lock_t bh_lock; + char hmac_ring[SEG6_HMAC_RING_SIZE]; +}; + 
+static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -187,7 +194,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, */ local_bh_disable(); - ring = this_cpu_ptr(hmac_ring); + local_lock_nested_bh(&hmac_storage.bh_lock); + ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ @@ -212,6 +220,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, dgsize = __do_hmac(hinfo, ring, plen, tmp_out, SEG6_HMAC_MAX_DIGESTSIZE); + local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); if (dgsize < 0) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 51583461ae29b..3e1b9991131a2 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -128,7 +128,8 @@ static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; struct ipv6_sr_hdr *isrh; int hdrlen, tot_len, err; @@ -181,7 +182,7 @@ static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { @@ -212,7 +213,8 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; int hdrlen = ipv6_optlen(osrh); int red_tlv_offset, tlv_offset; @@ -270,7 +272,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, if (skip_srh) { hdr->nexthdr = proto; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); goto out; } @@ -306,7 +308,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, srcaddr: isrh->nexthdr = proto; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (unlikely(!skip_srh && sr_has_hmac(isrh))) { @@ -362,7 +364,7 @@ static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net(skb); err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) @@ -507,7 +509,7 @@ static int seg6_input_core(struct net *net, struct sock *sk, local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } else { @@ -518,7 +520,7 @@ static int seg6_input_core(struct net *net, struct sock *sk, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, seg6_input_finish); + skb_dst_dev(skb), seg6_input_finish); return seg6_input_finish(dev_net(skb->dev), NULL, skb); drop: @@ -528,7 +530,7 @@ drop: static int seg6_input_nf(struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device 
*dev = skb_dst_dev(skb); struct net *net = dev_net(skb->dev); switch (skb->protocol) { @@ -593,7 +595,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } @@ -603,7 +605,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, - NULL, skb_dst(skb)->dev, dst_output); + NULL, dst_dev(dst), dst_output); return dst_output(net, sk, skb); drop: @@ -614,7 +616,7 @@ drop: static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst_dev(skb); switch (skb->protocol) { case htons(ETH_P_IP): diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ac1dbd492c22d..2b41e4c0dddd1 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -270,7 +270,7 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr) static int seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, - u32 tbl_id, bool local_delivery) + u32 tbl_id, bool local_delivery, int oif) { struct net *net = dev_net(skb->dev); struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -282,6 +282,7 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_iif = skb->dev->ifindex; + fl6.flowi6_oif = oif; fl6.daddr = nhaddr ? *nhaddr : hdr->daddr; fl6.saddr = hdr->saddr; fl6.flowlabel = ip6_flowinfo(hdr); @@ -291,17 +292,19 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (nhaddr) fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; - if (!tbl_id) { + if (!tbl_id && !oif) { dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags); - } else { + } else if (tbl_id) { struct fib6_table *table; table = fib6_get_table(net, tbl_id); if (!table) goto out; - rt = ip6_pol_route(net, table, 0, &fl6, skb, flags); + rt = ip6_pol_route(net, table, oif, &fl6, skb, flags); dst = &rt->dst; + } else { + dst = ip6_route_output(net, NULL, &fl6); } /* we want to discard traffic destined for local packet processing, @@ -310,7 +313,7 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (!local_delivery) dev_flags |= IFF_LOOPBACK; - if (dst && (dst->dev->flags & dev_flags) && !dst->error) { + if (dst && (dst_dev(dst)->flags & dev_flags) && !dst->error) { dst_release(dst); dst = NULL; } @@ -330,7 +333,7 @@ out: int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, u32 tbl_id) { - return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false); + return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false, 0); } static __u8 seg6_flv_lcblock_octects(const struct seg6_flavors_info *finfo) @@ -418,7 +421,7 @@ static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) static int input_action_end_x_finish(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - seg6_lookup_nexthop(skb, &slwt->nh6, 0); + seg6_lookup_any_nexthop(skb, &slwt->nh6, 0, false, slwt->oif); return dst_input(skb); } @@ -1277,7 +1280,7 @@ static int input_action_end_dt6(struct sk_buff *skb, /* note: this time we do not need to specify the table because the VRF * takes care of selecting the correct table. 
*/ - seg6_lookup_any_nexthop(skb, NULL, 0, true); + seg6_lookup_any_nexthop(skb, NULL, 0, true, 0); return dst_input(skb); @@ -1285,7 +1288,7 @@ legacy_mode: #endif skb_set_transport_header(skb, sizeof(struct ipv6hdr)); - seg6_lookup_any_nexthop(skb, NULL, slwt->table, true); + seg6_lookup_any_nexthop(skb, NULL, slwt->table, true, 0); return dst_input(skb); @@ -1477,7 +1480,8 @@ static struct seg6_action_desc seg6_action_table[] = { .action = SEG6_LOCAL_ACTION_END_X, .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6), .optattrs = SEG6_F_LOCAL_COUNTERS | - SEG6_F_LOCAL_FLAVORS, + SEG6_F_LOCAL_FLAVORS | + SEG6_F_ATTR(SEG6_LOCAL_OIF), .input = input_action_end_x, }, { @@ -1644,10 +1648,8 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 }, - [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, + [SEG6_LOCAL_NH4] = NLA_POLICY_EXACT_LEN(sizeof(struct in_addr)), + [SEG6_LOCAL_NH6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, @@ -2085,7 +2087,7 @@ struct nla_policy seg6_local_flavors_policy[SEG6_LOCAL_FLV_MAX + 1] = { static int seg6_chk_next_csid_cfg(__u8 block_len, __u8 func_len) { /* Locator-Block and Locator-Node Function cannot exceed 128 bits - * (i.e. C-SID container lenghts). + * (i.e. C-SID container length). */ if (next_csid_chk_cntr_bits(block_len, func_len)) return -EINVAL; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 9a0f32acb7500..12496ba1b7d4d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1035,7 +1035,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, IPPROTO_IPV6); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return NETDEV_TX_OK; tx_error_icmp: @@ -1804,8 +1804,7 @@ static struct xfrm_tunnel mplsip_handler __read_mostly = { }; #endif -static void __net_exit sit_destroy_tunnels(struct net *net, - struct list_head *head) +static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; @@ -1820,15 +1819,15 @@ static void __net_exit sit_destroy_tunnels(struct net *net, for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; - t = rtnl_dereference(sitn->tunnels[prio][h]); + t = rtnl_net_dereference(net, sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. 
*/ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1881,19 +1880,9 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - sit_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit_batch_rtnl = sit_exit_batch_rtnl, + .exit_rtnl = sit_exit_rtnl_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 9d83eadd308b0..f0ee1a9097716 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -236,7 +236,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b03c223eda4fa..7577e7eb2c97b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -41,6 +41,7 @@ #include <linux/random.h> #include <linux/indirect_call_wrapper.h> +#include <net/aligned_data.h> #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> @@ -267,7 +268,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sk->sk_uid; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; + fl6.flowi6_uid = sk_uid(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); @@ -833,7 +836,6 @@ static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), - .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, @@ -866,7 +868,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { - struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); + struct net *net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); unsigned int tot_len = sizeof(struct tcphdr); struct sock *ctl_sk = net->ipv6.tcp_sk; const struct tcphdr *th = tcp_hdr(skb); @@ -1041,7 +1043,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, if (!sk && !ipv6_unicast_destination(skb)) return; - net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); + net = sk ? 
sock_net(sk) : skb_dst_dev_net_rcu(skb); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; @@ -1832,14 +1834,12 @@ lookup: } refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) { + if (!tcp_filter(sk, skb, &drop_reason)) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &drop_reason); - } else { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); @@ -1895,10 +1895,9 @@ process: nf_reset_ct(skb); - if (tcp_filter(sk, skb)) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (tcp_filter(sk, skb, &drop_reason)) goto discard_and_relse; - } + th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); @@ -1970,7 +1969,8 @@ do_time_wait: goto csum_error; } - tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); switch (tw_status) { case TCP_TW_SYN: { @@ -2165,7 +2165,7 @@ static void get_openreq6(struct seq_file *seq, jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), - sock_i_uid(req->rsk_listener)), + sk_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -2231,7 +2231,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, @@ -2354,7 +2354,7 @@ struct proto tcpv6_prot = { .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, - .memory_allocated = &tcp_memory_allocated, + .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 024458ef163c9..6a68f77da44b5 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/ip6_tunnel.h> +#include <net/udp_tunnel.h> #include <net/xfrm.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> @@ -749,7 +750,8 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) { if (tunnel) { ip6_redirect(skb, sock_net(sk), inet6_iif(skb), - READ_ONCE(sk->sk_mark), sk->sk_uid); + READ_ONCE(sk->sk_mark), + sk_uid(sk)); } else { ip6_sk_redirect(skb, sk); } @@ -892,10 +894,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop; - } udp_csum_pull_header(skb); @@ -1619,7 +1619,7 @@ do_udp_sendmsg: if (!fl6->flowi6_oif) fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); if (msg->msg_controllen) { opt = &opt_space; @@ -1825,6 +1825,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } @@ -1922,7 +1923,7 @@ struct proto udpv6_prot 
= { .psock_update_sk_prot = udp_bpf_update_proto, #endif - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 0590f566379d7..8a406be25a3a6 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _UDP6_IMPL_H #define _UDP6_IMPL_H +#include <net/aligned_data.h> #include <net/udp.h> #include <net/udplite.h> #include <net/protocol.h> diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 404212dfc99ab..d8445ac1b2e43 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,8 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index a60bec9b14f14..2cec542437f74 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -59,7 +59,7 @@ struct proto udplitev6_prot = { .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 841c81abaaf4f..9005fc156a20e 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -202,6 +202,9 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; + /* set the transport header to ESP */ + skb_set_transport_header(skb, offset); + NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index b3d5d1f266eee..512bdaf136997 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -106,7 +106,7 @@ skip_frag: int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, skb->dev, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst_dev(skb), __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index bf140ef781c1f..5120a763da0d9 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,8 +334,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); xfrm_flush_gc(); - xfrm_state_flush(net, 0, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 83070a2e44857..473a7847d80bb 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -24,6 +24,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel_stat.h> +#include <linux/export.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/spinlock.h> diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 24aec295a51cf..a4971e6fa9433 100644 
--- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -19,6 +19,7 @@ #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/socket.h> +#include <linux/splice.h> #include <linux/uaccess.h> #include <linux/workqueue.h> #include <linux/syscalls.h> @@ -835,8 +836,7 @@ start: if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; - err = skb_splice_from_iter(skb, &msg->msg_iter, copy, - sk->sk_allocation); + err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) { if (err == -EMSGSIZE) goto wait_for_memory; @@ -1030,6 +1030,11 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, ssize_t copied; struct sk_buff *skb; + if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) + flags = MSG_DONTWAIT; + else + flags = 0; + /* Only support splice for SOCKSEQPACKET */ skb = skb_recv_datagram(sk, flags, &err); diff --git a/net/key/af_key.c b/net/key/af_key.c index c56bb4f451e6d..2ebde03522459 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1766,7 +1766,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_m if (proto == 0) return -EINVAL; - err = xfrm_state_flush(net, proto, true, false); + err = xfrm_state_flush(net, proto, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ @@ -2630,7 +2630,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL, 0, NULL); + kma ? &k : NULL, net, NULL, 0, NULL, NULL); out: return err; @@ -3788,7 +3788,7 @@ static int pfkey_seq_show(struct seq_file *f, void *v) refcount_read(&s->sk_refcnt), sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - from_kuid_munged(seq_user_ns(f), sock_i_uid(s)), + from_kuid_munged(seq_user_ns(f), sk_uid(s)), sock_i_ino(s) ); return 0; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index b98d13584c81f..ea232f338dcb6 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -545,7 +545,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = READ_ONCE(sk->sk_mark); - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); ipcm6_init_sk(&ipc6, sk); diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 5b3f3b444d197..9fde6cf20f10b 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -74,7 +74,7 @@ int lapb_t1timer_running(struct lapb_cb *lapb) static void lapb_t2timer_expiry(struct timer_list *t) { - struct lapb_cb *lapb = from_timer(lapb, t, t2timer); + struct lapb_cb *lapb = timer_container_of(lapb, t, t2timer); spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */ @@ -94,7 +94,7 @@ out: static void lapb_t1timer_expiry(struct timer_list *t) { - struct lapb_cb *lapb = from_timer(lapb, t, t1timer); + struct lapb_cb *lapb = timer_container_of(lapb, t, t1timer); spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */ diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index cc77ec5769d82..5958a80fe14cf 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -210,7 +210,7 @@ static int llc_ui_release(struct socket *sock) dprintk("%s: closing local(%02X) remote(%02X)\n", __func__, llc->laddr.lsap, llc->daddr.lsap); if (!llc_send_disc(sk)) - llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); + llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo)); if (!sock_flag(sk, SOCK_ZAPPED)) { struct llc_sap *sap = llc->sap; @@ -455,7 
+455,7 @@ static int llc_ui_shutdown(struct socket *sock, int how) goto out; rc = llc_send_disc(sk); if (!rc) - rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); + rc = llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo)); /* Wake up anyone sleeping in poll */ sk->sk_state_change(sk); out: @@ -712,7 +712,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, goto out; /* wait for a connection to arrive. */ if (skb_queue_empty(&sk->sk_receive_queue)) { - rc = llc_wait_data(sk, sk->sk_rcvtimeo); + rc = llc_wait_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out; } diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 7e8fc710c5908..0779daa8aa8f3 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -1335,28 +1335,31 @@ static void llc_conn_tmr_common_cb(struct sock *sk, u8 type) void llc_conn_pf_cycle_tmr_cb(struct timer_list *t) { - struct llc_sock *llc = from_timer(llc, t, pf_cycle_timer.timer); + struct llc_sock *llc = timer_container_of(llc, t, + pf_cycle_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_P_TMR); } void llc_conn_busy_tmr_cb(struct timer_list *t) { - struct llc_sock *llc = from_timer(llc, t, busy_state_timer.timer); + struct llc_sock *llc = timer_container_of(llc, t, + busy_state_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_BUSY_TMR); } void llc_conn_ack_tmr_cb(struct timer_list *t) { - struct llc_sock *llc = from_timer(llc, t, ack_timer.timer); + struct llc_sock *llc = timer_container_of(llc, t, ack_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_ACK_TMR); } void llc_conn_rej_tmr_cb(struct timer_list *t) { - struct llc_sock *llc = from_timer(llc, t, rej_sent_timer.timer); + struct llc_sock *llc = timer_container_of(llc, t, + rej_sent_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_REJ_TMR); } diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index 07e9abb5978a7..aa81c67b24a15 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -151,7 +151,7 @@ static int llc_seq_socket_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk) - llc->copied_seq, sk->sk_state, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), llc->link); out: return 0; diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 85612234742ac..e38f46ffebfa7 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -143,7 +143,8 @@ EXPORT_SYMBOL(ieee80211_stop_rx_ba_session); */ static void sta_rx_agg_session_timer_expired(struct timer_list *t) { - struct tid_ampdu_rx *tid_rx = from_timer(tid_rx, t, session_timer); + struct tid_ampdu_rx *tid_rx = timer_container_of(tid_rx, t, + session_timer); struct sta_info *sta = tid_rx->sta; u8 tid = tid_rx->tid; unsigned long timeout; @@ -163,7 +164,8 @@ static void sta_rx_agg_session_timer_expired(struct timer_list *t) static void sta_rx_agg_reorder_timer_expired(struct timer_list *t) { - struct tid_ampdu_rx *tid_rx = from_timer(tid_rx, t, reorder_timer); + struct tid_ampdu_rx *tid_rx = timer_container_of(tid_rx, t, + reorder_timer); rcu_read_lock(); ieee80211_release_reorder_timeout(tid_rx->sta, tid_rx->tid); @@ -297,7 +299,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, if (!sta->sta.valid_links && !sta->sta.deflink.ht_cap.ht_supported && - !sta->sta.deflink.he_cap.has_he) { + !sta->sta.deflink.he_cap.has_he && + !sta->sta.deflink.s1g_cap.s1g) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o HT\n", sta->sta.addr, tid); @@ -325,7 
+328,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, /* XXX: check own ht delayed BA capability?? */ if (((ba_policy != 1) && (sta->sta.valid_links || - !(sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || + !(sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA) || + !(sta->sta.deflink.s1g_cap.cap[3] & S1G_CAP3_HT_DELAYED_BA))) || (buf_size > max_buf_size)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 8dc8c3c96b969..d981b0fc57bf0 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -422,7 +422,8 @@ int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, */ static void sta_addba_resp_timer_expired(struct timer_list *t) { - struct tid_ampdu_tx *tid_tx = from_timer(tid_tx, t, addba_resp_timer); + struct tid_ampdu_tx *tid_tx = timer_container_of(tid_tx, t, + addba_resp_timer); struct sta_info *sta = tid_tx->sta; u8 tid = tid_tx->tid; @@ -574,7 +575,8 @@ EXPORT_SYMBOL(ieee80211_refresh_tx_agg_session_timer); */ static void sta_tx_agg_session_timer_expired(struct timer_list *t) { - struct tid_ampdu_tx *tid_tx = from_timer(tid_tx, t, session_timer); + struct tid_ampdu_tx *tid_tx = timer_container_of(tid_tx, t, + session_timer); struct sta_info *sta = tid_tx->sta; u8 tid = tid_tx->tid; unsigned long timeout; @@ -614,7 +616,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, !pubsta->deflink.ht_cap.ht_supported && !pubsta->deflink.vht_cap.vht_supported && !pubsta->deflink.he_cap.has_he && - !pubsta->deflink.eht_cap.has_eht) + !pubsta->deflink.eht_cap.has_eht && + !pubsta->deflink.s1g_cap.s1g) return -EINVAL; if (WARN_ON_ONCE(!local->ops->ampdu_action)) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9f683f838431d..2ed07fa121ab7 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -146,8 +146,8 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf) { struct ieee80211_sub_if_data *tx_sdata; + struct ieee80211_bss_conf *old; - sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; @@ -156,16 +156,29 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev) return -EINVAL; + old = sdata_dereference(link_conf->tx_bss_conf, sdata); + if (old) + return -EALREADY; + tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev); if (!tx_sdata) return -EINVAL; if (tx_sdata == sdata) { - sdata->vif.mbssid_tx_vif = &sdata->vif; + rcu_assign_pointer(link_conf->tx_bss_conf, link_conf); } else { - sdata->vif.mbssid_tx_vif = &tx_sdata->vif; + struct ieee80211_bss_conf *tx_bss_conf; + + tx_bss_conf = sdata_dereference(tx_sdata->vif.link_conf[params->tx_link_id], + sdata); + if (rcu_access_pointer(tx_bss_conf->tx_bss_conf) != tx_bss_conf) + return -EINVAL; + + rcu_assign_pointer(link_conf->tx_bss_conf, tx_bss_conf); + link_conf->nontransmitted = true; link_conf->bssid_index = params->index; + link_conf->bssid_indicator = tx_bss_conf->bssid_indicator; } if (params->ema) link_conf->ema_ap = true; @@ -873,6 +886,13 @@ static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev, ret = 0; memcpy(mac, sta->sta.addr, ETH_ALEN); sta_set_sinfo(sta, sinfo, true); + + /* Add accumulated removed link data to sinfo data for + * consistency for MLO + */ + if (sinfo->valid_links) + 
sta_set_accumulated_removed_links_sinfo(sta, sinfo); + } return ret; @@ -900,6 +920,12 @@ static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, if (sta) { ret = 0; sta_set_sinfo(sta, sinfo, true); + + /* Add accumulated removed link data to sinfo data for + * consistency for MLO + */ + if (sinfo->valid_links) + sta_set_accumulated_removed_links_sinfo(sta, sinfo); } return ret; @@ -1045,6 +1071,47 @@ ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata, return 0; } +static int +ieee80211_set_s1g_short_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_data *link, + struct cfg80211_s1g_short_beacon *params) +{ + struct s1g_short_beacon_data *new; + struct s1g_short_beacon_data *old = + sdata_dereference(link->u.ap.s1g_short_beacon, sdata); + size_t new_len = + sizeof(*new) + params->short_head_len + params->short_tail_len; + + if (!params->update) + return 0; + + if (!params->short_head) + return -EINVAL; + + new = kzalloc(new_len, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* Memory layout: | struct | head | tail | */ + new->short_head = (u8 *)new + sizeof(*new); + new->short_head_len = params->short_head_len; + memcpy(new->short_head, params->short_head, params->short_head_len); + + if (params->short_tail) { + new->short_tail = new->short_head + params->short_head_len; + new->short_tail_len = params->short_tail_len; + memcpy(new->short_tail, params->short_tail, + params->short_tail_len); + } + + rcu_assign_pointer(link->u.ap.s1g_short_beacon, new); + + if (old) + kfree_rcu(old, rcu_head); + + return 0; +} + static int ieee80211_set_ftm_responder_params( struct ieee80211_sub_if_data *sdata, const u8 *lci, size_t lci_len, @@ -1109,13 +1176,13 @@ ieee80211_copy_rnr_beacon(u8 *pos, struct cfg80211_rnr_elems *dst, { int i, offset = 0; + dst->cnt = src->cnt; for (i = 0; i < src->cnt; i++) { memcpy(pos + offset, src->elem[i].data, src->elem[i].len); dst->elem[i].len = src->elem[i].len; dst->elem[i].data = pos + offset; offset += dst->elem[i].len; } - dst->cnt = src->cnt; return offset; } @@ -1206,8 +1273,11 @@ ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, ieee80211_copy_rnr_beacon(pos, new->rnr_ies, rnr); } /* update bssid_indicator */ - link_conf->bssid_indicator = - ilog2(__roundup_pow_of_two(mbssid->cnt + 1)); + if (new->mbssid_ies->cnt && new->mbssid_ies->elem[0].len > 2) + link_conf->bssid_indicator = + *(new->mbssid_ies->elem[0].data + 2); + else + link_conf->bssid_indicator = 0; } if (csa) { @@ -1278,9 +1348,9 @@ static u8 ieee80211_num_beaconing_links(struct ieee80211_sub_if_data *sdata) sdata->vif.type != NL80211_IFTYPE_P2P_GO) return num; - if (!sdata->vif.valid_links) - return num; - + /* non-MLO mode of operation also uses link_id 0 in sdata so it is + * safe to directly proceed with the below loop + */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) @@ -1312,6 +1382,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_link_data *link; struct ieee80211_bss_conf *link_conf; struct ieee80211_chan_req chanreq = { .oper = params->chandef }; + u64 tsf; lockdep_assert_wiphy(local->hw.wiphy); @@ -1409,6 +1480,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, (IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ); + link_conf->eht_disable_mcs15 = + 
u8_get_bits(params->eht_oper->params, + IEEE80211_EHT_OPER_MCS15_DISABLE); } else { link_conf->eht_su_beamformer = false; link_conf->eht_su_beamformee = false; @@ -1461,8 +1535,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, link_conf->twt_responder = params->twt_responder; link_conf->he_obss_pd = params->he_obss_pd; link_conf->he_bss_color = params->beacon.he_bss_color; - sdata->vif.cfg.s1g = params->chandef.chan->band == - NL80211_BAND_S1GHZ; + link_conf->s1g_long_beacon_period = params->s1g_long_beacon_period; + sdata->vif.cfg.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ; sdata->vif.cfg.ssid_len = params->ssid_len; if (params->ssid_len) @@ -1509,6 +1583,13 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (err < 0) goto error; + if (sdata->vif.cfg.s1g) { + err = ieee80211_set_s1g_short_beacon(sdata, link, + ¶ms->s1g_short_beacon); + if (err < 0) + goto error; + } + err = drv_start_ap(sdata->local, sdata, link_conf); if (err) { old = sdata_dereference(link->u.ap.beacon, sdata); @@ -1523,7 +1604,12 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, goto error; } - ieee80211_recalc_dtim(local, sdata); + tsf = drv_get_tsf(local, sdata); + ieee80211_recalc_dtim(sdata, tsf); + + if (link->u.ap.s1g_short_beacon) + ieee80211_recalc_sb_count(sdata, tsf); + ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID); ieee80211_link_info_change_notify(sdata, link, changed); @@ -1587,6 +1673,13 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, if (err < 0) return err; + if (link->u.ap.s1g_short_beacon) { + err = ieee80211_set_s1g_short_beacon(sdata, link, + ¶ms->s1g_short_beacon); + if (err < 0) + return err; + } + if (beacon->he_bss_color_valid && beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) { link_conf->he_bss_color.enabled = beacon->he_bss_color.enabled; @@ -1618,6 +1711,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, struct probe_resp *old_probe_resp; struct fils_discovery_data *old_fils_discovery; struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp; + struct s1g_short_beacon_data *old_s1g_short_beacon; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); @@ -1636,6 +1730,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, old_unsol_bcast_probe_resp = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); + old_s1g_short_beacon = + sdata_dereference(link->u.ap.s1g_short_beacon, sdata); /* abort any running channel switch or color change */ link_conf->csa_active = false; @@ -1658,6 +1754,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, RCU_INIT_POINTER(link->u.ap.probe_resp, NULL); RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL); + RCU_INIT_POINTER(link->u.ap.s1g_short_beacon, NULL); kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); @@ -1665,11 +1762,12 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, kfree_rcu(old_fils_discovery, rcu_head); if (old_unsol_bcast_probe_resp) kfree_rcu(old_unsol_bcast_probe_resp, rcu_head); + if (old_s1g_short_beacon) + kfree_rcu(old_s1g_short_beacon, rcu_head); kfree(link_conf->ftmr_params); link_conf->ftmr_params = NULL; - sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; 
link_conf->ema_ap = false; @@ -1683,9 +1781,13 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, ieee80211_free_key_list(local, &keys); } + ieee80211_stop_mbssid(sdata); + RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL); + link_conf->enable_beacon = false; sdata->beacon_rate_set = false; sdata->vif.cfg.ssid_len = 0; + sdata->vif.cfg.s1g = false; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BEACON_ENABLED); @@ -1861,6 +1963,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, params->vht_capa || params->he_capa || params->eht_capa || + params->s1g_capa || params->opmode_notif_used; switch (mode) { @@ -1939,9 +2042,27 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, params->eht_capa_len, link_sta); + if (params->s1g_capa) + ieee80211_s1g_cap_to_sta_s1g_cap(sdata, params->s1g_capa, + link_sta); + ieee80211_sta_init_nss(link_sta); if (params->opmode_notif_used) { + enum nl80211_chan_width width = link->conf->chanreq.oper.width; + + switch (width) { + case NL80211_CHAN_WIDTH_20: + case NL80211_CHAN_WIDTH_40: + case NL80211_CHAN_WIDTH_80: + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_80P80: + case NL80211_CHAN_WIDTH_320: /* not VHT, allowed for HE/EHT */ + break; + default: + return -EINVAL; + } + /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it */ @@ -2066,6 +2187,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->listen_interval >= 0) sta->listen_interval = params->listen_interval; + if (params->eml_cap_present) + sta->sta.eml_cap = params->eml_cap; + ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_STA_MODIFY, ¶ms->link_sta_params); if (ret) @@ -2904,7 +3028,7 @@ static int ieee80211_scan(struct wiphy *wiphy, * the frames sent while scanning on other channel will be * lost) */ - if (sdata->deflink.u.ap.beacon && + if (ieee80211_num_beaconing_links(sdata) && (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || !(req->flags & NL80211_SCAN_FLAG_AP))) return -EOPNOTSUPP; @@ -3008,7 +3132,8 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int ieee80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct ieee80211_local *local = wiphy_priv(wiphy); int err; @@ -3016,7 +3141,8 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { ieee80211_check_fast_xmit_all(local); - err = drv_set_frag_threshold(local, wiphy->frag_threshold); + err = drv_set_frag_threshold(local, radio_idx, + wiphy->frag_threshold); if (err) { ieee80211_check_fast_xmit_all(local); @@ -3030,14 +3156,23 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ? 
wiphy->coverage_class : -1; - err = drv_set_coverage_class(local, coverage_class); + err = drv_set_coverage_class(local, radio_idx, + coverage_class); if (err) return err; } if (changed & WIPHY_PARAM_RTS_THRESHOLD) { - err = drv_set_rts_threshold(local, wiphy->rts_threshold); + u32 rts_threshold; + + if ((radio_idx == -1) || (radio_idx >= wiphy->n_radio)) + rts_threshold = wiphy->rts_threshold; + else + rts_threshold = + wiphy->radio_cfg[radio_idx].rts_threshold; + + err = drv_set_rts_threshold(local, radio_idx, rts_threshold); if (err) return err; @@ -3055,18 +3190,19 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) } if (changed & (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); + ieee80211_hw_config(local, radio_idx, + IEEE80211_CONF_CHANGE_RETRY_LIMITS); if (changed & (WIPHY_PARAM_TXQ_LIMIT | WIPHY_PARAM_TXQ_MEMORY_LIMIT | WIPHY_PARAM_TXQ_QUANTUM)) - ieee80211_txq_set_params(local); + ieee80211_txq_set_params(local, radio_idx); return 0; } static int ieee80211_set_tx_power(struct wiphy *wiphy, - struct wireless_dev *wdev, + struct wireless_dev *wdev, int radio_idx, enum nl80211_tx_power_setting type, int mbm) { struct ieee80211_local *local = wiphy_priv(wiphy); @@ -3194,6 +3330,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, static int ieee80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int link_id, int *dbm) { @@ -3372,7 +3509,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, } if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); @@ -3529,6 +3666,56 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, return 0; } +static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy, + struct ieee80211_local *local, + struct cfg80211_chan_def *chandef) +{ + struct cfg80211_scan_request *scan_req; + int chan_radio_idx, req_radio_idx; + struct ieee80211_roc_work *roc; + + if (list_empty(&local->roc_list) && !local->scanning) + return false; + + if (wiphy->n_radio < 2) + return true; + + req_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chandef->chan); + if (req_radio_idx < 0) + return true; + + if (local->scanning) { + scan_req = wiphy_dereference(wiphy, local->scan_req); + /* + * Scan is going on but info is not there. Should not happen + * but if it does, let's not take risk and assume we can't use + * the hw hence return true + */ + if (WARN_ON_ONCE(!scan_req)) + return true; + + return ieee80211_is_radio_idx_in_scan_req(wiphy, scan_req, + req_radio_idx); + } + + list_for_each_entry(roc, &local->roc_list, list) { + chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, + roc->chan); + /* + * The roc work is added but chan_radio_idx is invalid. + * Should not happen but if it does, let's not take + * risk and return true. 
+ */ + if (chan_radio_idx < 0) + return true; + + if (chan_radio_idx == req_radio_idx) + return true; + } + + return false; +} + static int ieee80211_start_radar_detection(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef, @@ -3542,7 +3729,7 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, lockdep_assert_wiphy(local->hw.wiphy); - if (!list_empty(&local->roc_list) || local->scanning) + if (ieee80211_is_scan_ongoing(wiphy, local, chandef)) return -EBUSY; link_data = sdata_dereference(sdata->link[link_id], sdata); @@ -3700,6 +3887,7 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *tx_bss_conf; struct ieee80211_link_data *link_data; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) @@ -3713,25 +3901,24 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) return; } - /* TODO: MBSSID with MLO changes */ - if (vif->mbssid_tx_vif == vif) { + tx_bss_conf = rcu_dereference(link_data->conf->tx_bss_conf); + if (tx_bss_conf == link_data->conf) { /* Trigger ieee80211_csa_finish() on the non-transmitting * interfaces when channel switch is received on * transmitting interface */ - struct ieee80211_sub_if_data *iter; - - list_for_each_entry_rcu(iter, &local->interfaces, list) { - if (!ieee80211_sdata_running(iter)) - continue; + struct ieee80211_link_data *iter; - if (iter == sdata || iter->vif.mbssid_tx_vif != vif) + for_each_sdata_link_rcu(local, iter) { + if (iter->sdata == sdata || + rcu_access_pointer(iter->conf->tx_bss_conf) != tx_bss_conf) continue; - wiphy_work_queue(iter->local->hw.wiphy, - &iter->deflink.csa.finalize_work); + wiphy_work_queue(iter->sdata->local->hw.wiphy, + &iter->csa.finalize_work); } } + wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work); rcu_read_unlock(); @@ -4034,7 +4221,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, lockdep_assert_wiphy(local->hw.wiphy); - if (!list_empty(&local->roc_list) || local->scanning) + if (ieee80211_is_scan_ongoing(wiphy, local, ¶ms->chandef)) return -EBUSY; if (sdata->wdev.links[link_id].cac_started) @@ -4068,6 +4255,12 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } + err = ieee80211_set_unsol_bcast_probe_resp(sdata, + ¶ms->unsol_bcast_probe_resp, + link_data, link_conf, &changed); + if (err) + goto out; + chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; @@ -4218,7 +4411,8 @@ ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, ieee80211_configure_filter(local); } -static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) +static int ieee80211_set_antenna(struct wiphy *wiphy, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); int ret; @@ -4234,11 +4428,12 @@ static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) return 0; } -static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) +static int ieee80211_get_antenna(struct wiphy *wiphy, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); - return drv_get_antenna(local, tx_ant, rx_ant); + return drv_get_antenna(local, radio_idx, tx_ant, rx_ant); } static int ieee80211_set_rekey_data(struct wiphy *wiphy, @@ -4833,17 +5028,19 @@ ieee80211_color_change_bss_config_notify(struct 
ieee80211_link_data *link, ieee80211_link_info_change_notify(sdata, link, changed); - if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) { - struct ieee80211_sub_if_data *child; + if (!link->conf->nontransmitted && + rcu_access_pointer(link->conf->tx_bss_conf)) { + struct ieee80211_link_data *tmp; - list_for_each_entry(child, &sdata->local->interfaces, list) { - if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) { - child->vif.bss_conf.he_bss_color.color = color; - child->vif.bss_conf.he_bss_color.enabled = enable; - ieee80211_link_info_change_notify(child, - &child->deflink, - BSS_CHANGED_HE_BSS_COLOR); - } + for_each_sdata_link(sdata->local, tmp) { + if (tmp->sdata == sdata || + rcu_access_pointer(tmp->conf->tx_bss_conf) != link->conf) + continue; + + tmp->conf->he_bss_color.color = color; + tmp->conf->he_bss_color.enabled = enable; + ieee80211_link_info_change_notify(tmp->sdata, tmp, + BSS_CHANGED_HE_BSS_COLOR); } } } @@ -5005,6 +5202,12 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, goto out; } + err = ieee80211_set_unsol_bcast_probe_resp(sdata, + ¶ms->unsol_bcast_probe_resp, + link, link_conf, &changed); + if (err) + goto out; + err = ieee80211_set_color_change_beacon(link, params, &changed); if (err) goto out; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index c3bfac58151f6..c9cea0e7ac169 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -644,15 +644,39 @@ ieee80211_find_chanctx(struct ieee80211_local *local, return NULL; } -bool ieee80211_is_radar_required(struct ieee80211_local *local) +bool ieee80211_is_radar_required(struct ieee80211_local *local, + struct cfg80211_scan_request *req) { + struct wiphy *wiphy = local->hw.wiphy; struct ieee80211_link_data *link; + struct ieee80211_channel *chan; + int radio_idx; lockdep_assert_wiphy(local->hw.wiphy); + if (!req) + return false; + for_each_sdata_link(local, link) { - if (link->radar_required) - return true; + if (link->radar_required) { + if (wiphy->n_radio < 2) + return true; + + chan = link->conf->chanreq.oper.chan; + radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); + /* + * The radio index (radio_idx) is expected to be valid, + * as it's derived from a channel tied to a link. If + * it's invalid (i.e., negative), return true to avoid + * potential issues with radar-sensitive operations. 
+ */ + if (radio_idx < 0) + return true; + + if (ieee80211_is_radio_idx_in_scan_req(wiphy, req, + radio_idx)) + return true; + } } return false; @@ -720,7 +744,7 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local, /* turn idle off *before* setting channel -- some drivers need that */ changed = ieee80211_idle_off(local); if (changed) - ieee80211_hw_config(local, changed); + ieee80211_hw_config(local, -1, changed); err = drv_add_chanctx(local, ctx); if (err) { @@ -886,7 +910,7 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, conf = rcu_dereference_protected(link->conf->chanctx_conf, lockdep_is_held(&local->hw.wiphy->mtx)); - if (conf) { + if (conf && !local->in_reconfig) { curr_ctx = container_of(conf, struct ieee80211_chanctx, conf); drv_unassign_vif_chanctx(local, sdata, link->conf, curr_ctx); @@ -906,8 +930,9 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, /* succeeded, so commit it to the data structures */ conf = &new_ctx->conf; - list_add(&link->assigned_chanctx_list, - &new_ctx->assigned_links); + if (!local->in_reconfig) + list_add(&link->assigned_chanctx_list, + &new_ctx->assigned_links); } } else { ret = 0; @@ -1084,7 +1109,7 @@ void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link, __ieee80211_link_copy_chanctx_to_vlans(link, clear); } -int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) +void ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chanctx *ctx = link->reserved_chanctx; @@ -1092,7 +1117,7 @@ int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) lockdep_assert_wiphy(sdata->local->hw.wiphy); if (WARN_ON(!ctx)) - return -EINVAL; + return; list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; @@ -1100,7 +1125,7 @@ int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) { if (WARN_ON(!ctx->replace_ctx)) - return -EINVAL; + return; WARN_ON(ctx->replace_ctx->replace_state != IEEE80211_CHANCTX_WILL_BE_REPLACED); @@ -1116,8 +1141,6 @@ int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) ieee80211_free_chanctx(sdata->local, ctx, false); } } - - return 0; } static struct ieee80211_chanctx * @@ -1381,6 +1404,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) goto out; } + link->radar_required = link->reserved_radar_required; list_move(&link->assigned_chanctx_list, &new_ctx->assigned_links); rcu_assign_pointer(link_conf->chanctx_conf, &new_ctx->conf); @@ -1909,7 +1933,8 @@ int _ieee80211_link_use_channel(struct ieee80211_link_data *link, if (ret < 0) goto out; - __ieee80211_link_release_channel(link, false); + if (!local->in_reconfig) + __ieee80211_link_release_channel(link, false); ctx = ieee80211_find_chanctx(local, link, chanreq, mode); /* Note: context is now reserved */ @@ -2131,6 +2156,9 @@ void ieee80211_link_release_channel(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + return; + lockdep_assert_wiphy(sdata->local->hw.wiphy); if (rcu_access_pointer(link->conf->chanctx_conf)) diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h index 5b81998cb0c98..ef7c1a68d88da 100644 --- a/net/mac80211/debug.h +++ b/net/mac80211/debug.h @@ -1,10 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* 
* Portions - * Copyright (C) 2022 - 2024 Intel Corporation + * Copyright (C) 2022 - 2025 Intel Corporation */ #ifndef __MAC80211_DEBUG_H #define __MAC80211_DEBUG_H +#include <linux/once_lite.h> #include <net/cfg80211.h> #ifdef CONFIG_MAC80211_OCB_DEBUG @@ -152,6 +153,8 @@ do { \ else \ _sdata_err((link)->sdata, fmt, ##__VA_ARGS__); \ } while (0) +#define link_err_once(link, fmt, ...) \ + DO_ONCE_LITE(link_err, link, fmt, ##__VA_ARGS__) #define link_id_info(sdata, link_id, fmt, ...) \ do { \ if (ieee80211_vif_is_mld(&sdata->vif)) \ diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 69e03630f64c9..e8b78ec682da8 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -4,7 +4,7 @@ * * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018 - 2019, 2021-2024 Intel Corporation + * Copyright (C) 2018 - 2019, 2021-2025 Intel Corporation */ #include <linux/debugfs.h> @@ -490,7 +490,6 @@ static const char *hw_flag_names[] = { FLAG(DETECTS_COLOR_COLLISION), FLAG(MLO_MCAST_MULTI_LINK_TX), FLAG(DISALLOW_PUNCTURING), - FLAG(DISALLOW_PUNCTURING_5GHZ), FLAG(HANDLES_QUIET_CSA), FLAG(STRICT), #undef FLAG diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 54c479910d054..1dac78271045a 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -704,7 +704,7 @@ static ssize_t ieee80211_if_parse_tsf( } } - ieee80211_recalc_dtim(local, sdata); + ieee80211_recalc_dtim(sdata, drv_get_tsf(local, sdata)); return buflen; } IEEE80211_IF_FILE_RW(tsf); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index a8948f4d983e5..49061bd4151bc 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -152,12 +152,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, p += scnprintf(p, bufsz + buf - p, - "target %uus interval %uus ecn %s\n", - codel_time_to_us(sta->cparams.target), - codel_time_to_us(sta->cparams.interval), - sta->cparams.ecn ? 
"yes" : "no"); - p += scnprintf(p, - bufsz + buf - p, "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 35349a7f16cb4..ba9fba1659265 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015 Intel Deutschland GmbH - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation */ #include <net/mac80211.h> #include "ieee80211_i.h" @@ -515,6 +515,9 @@ int drv_set_key(struct ieee80211_local *local, !(sdata->vif.active_links & BIT(key->link_id)))) return -ENOLINK; + if (fips_enabled) + return -EOPNOTSUPP; + trace_drv_set_key(local, cmd, sdata, sta, key); ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key); trace_drv_return_int(local, ret); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 307587c8a0037..181bcb34b795f 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -8,6 +8,7 @@ #ifndef __MAC80211_DRIVER_OPS #define __MAC80211_DRIVER_OPS +#include <linux/fips.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" @@ -143,15 +144,16 @@ int drv_change_interface(struct ieee80211_local *local, void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); -static inline int drv_config(struct ieee80211_local *local, u32 changed) +static inline int drv_config(struct ieee80211_local *local, int radio_idx, + u32 changed) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_config(local, changed); - ret = local->ops->config(&local->hw, changed); + trace_drv_config(local, radio_idx, changed); + ret = local->ops->config(&local->hw, radio_idx, changed); trace_drv_return_int(local, ret); return ret; } @@ -387,45 +389,47 @@ static inline void drv_get_key_seq(struct ieee80211_local *local, } static inline int drv_set_frag_threshold(struct ieee80211_local *local, - u32 value) + int radio_idx, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_frag_threshold(local, value); + trace_drv_set_frag_threshold(local, radio_idx, value); if (local->ops->set_frag_threshold) - ret = local->ops->set_frag_threshold(&local->hw, value); + ret = local->ops->set_frag_threshold(&local->hw, radio_idx, + value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_rts_threshold(struct ieee80211_local *local, - u32 value) + int radio_idx, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_rts_threshold(local, value); + trace_drv_set_rts_threshold(local, radio_idx, value); if (local->ops->set_rts_threshold) - ret = local->ops->set_rts_threshold(&local->hw, value); + ret = local->ops->set_rts_threshold(&local->hw, radio_idx, + value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_coverage_class(struct ieee80211_local *local, - s16 value) + int radio_idx, s16 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_coverage_class(local, value); + trace_drv_set_coverage_class(local, radio_idx, value); if (local->ops->set_coverage_class) - local->ops->set_coverage_class(&local->hw, value); + local->ops->set_coverage_class(&local->hw, radio_idx, value); else ret = -EOPNOTSUPP; @@ -631,6 +635,25 @@ static inline void 
drv_sta_statistics(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_link_sta_statistics(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, + struct link_station_info *link_sinfo) +{ + might_sleep(); + lockdep_assert_wiphy(local->hw.wiphy); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_link_sta_statistics(local, sdata, link_sta); + if (local->ops->link_sta_statistics) + local->ops->link_sta_statistics(&local->hw, &sdata->vif, + link_sta, link_sinfo); + trace_drv_return_void(local); +} + int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_link_data *link, u16 ac, const struct ieee80211_tx_queue_params *params); @@ -753,20 +776,21 @@ static inline int drv_set_antenna(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->set_antenna) - ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant); + ret = local->ops->set_antenna(&local->hw, -1, tx_ant, rx_ant); trace_drv_set_antenna(local, tx_ant, rx_ant, ret); return ret; } -static inline int drv_get_antenna(struct ieee80211_local *local, +static inline int drv_get_antenna(struct ieee80211_local *local, int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->get_antenna) - ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant); - trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret); + ret = local->ops->get_antenna(&local->hw, radio_idx, + tx_ant, rx_ant); + trace_drv_get_antenna(local, radio_idx, *tx_ant, *rx_ant, ret); return ret; } @@ -879,6 +903,9 @@ static inline void drv_set_rekey_data(struct ieee80211_local *local, if (!check_sdata_in_driver(sdata)) return; + if (fips_enabled) + return; + trace_drv_set_rekey_data(local, sdata, data); if (local->ops->set_rekey_data) local->ops->set_rekey_data(&local->hw, &sdata->vif, data); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 32390d8a9d753..1c82a28b03de4 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH - * Copyright(c) 2020-2024 Intel Corporation + * Copyright(c) 2020-2025 Intel Corporation */ #include <linux/ieee80211.h> @@ -603,3 +603,41 @@ out: } /* this might change ... 
don't want non-open drivers using it */ EXPORT_SYMBOL_GPL(ieee80211_request_smps); + +void ieee80211_ht_handle_chanwidth_notif(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct link_sta_info *link_sta, + u8 chanwidth, enum nl80211_band band) +{ + enum ieee80211_sta_rx_bandwidth max_bw, new_bw; + struct ieee80211_supported_band *sband; + struct sta_opmode_info sta_opmode = {}; + + lockdep_assert_wiphy(local->hw.wiphy); + + if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ) + max_bw = IEEE80211_STA_RX_BW_20; + else + max_bw = ieee80211_sta_cap_rx_bw(link_sta); + + /* set cur_max_bandwidth and recalc sta bw */ + link_sta->cur_max_bandwidth = max_bw; + new_bw = ieee80211_sta_cur_vht_bw(link_sta); + + if (link_sta->pub->bandwidth == new_bw) + return; + + link_sta->pub->bandwidth = new_bw; + sband = local->hw.wiphy->bands[band]; + sta_opmode.bw = + ieee80211_sta_rx_bw_to_chan_width(link_sta); + sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED; + + rate_control_rate_update(local, sband, link_sta, + IEEE80211_RC_BW_CHANGED); + cfg80211_sta_opmode_change_notify(sdata->dev, + sta->addr, + &sta_opmode, + GFP_KERNEL); +} diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 4246d168374ff..6e36b09fe97f8 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -48,7 +48,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; u8 *pos; struct ieee80211_supported_band *sband; - u32 rate_flags, rates = 0, rates_added = 0; + u32 rates = 0, rates_added = 0; struct beacon_data *presp; int frame_len; @@ -90,14 +90,11 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, pos += ifibss->ssid_len; sband = local->hw.wiphy->bands[chandef->chan->band]; - rate_flags = ieee80211_chandef_rate_flags(chandef); rates_n = 0; if (have_higher_than_11mbit) *have_higher_than_11mbit = false; for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (sband->bitrates[i].bitrate > 110 && have_higher_than_11mbit) *have_higher_than_11mbit = true; @@ -395,7 +392,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, const struct cfg80211_bss_ies *ies; enum nl80211_channel_type chan_type; u64 tsf; - u32 rate_flags; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -429,7 +425,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, } sband = sdata->local->hw.wiphy->bands[cbss->channel->band]; - rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef); basic_rates = 0; @@ -439,9 +434,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, for (j = 0; j < sband->n_bitrates; j++) { int brate; - if ((rate_flags & sband->bitrates[j].flags) - != rate_flags) - continue; brate = DIV_ROUND_UP(sband->bitrates[j].bitrate, 5); if (brate == rate) { @@ -643,7 +635,7 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { - unsigned long last_active = ieee80211_sta_last_active(sta); + unsigned long last_active = ieee80211_sta_last_active(sta, -1); if (sta->sdata == sdata && time_is_after_jiffies(last_active + @@ -1236,7 +1228,7 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { - unsigned long last_active = ieee80211_sta_last_active(sta); + unsigned long last_active = 
ieee80211_sta_last_active(sta, -1); if (sdata != sta->sdata) continue; @@ -1681,7 +1673,7 @@ void ieee80211_ibss_work(struct ieee80211_sub_if_data *sdata) static void ieee80211_ibss_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - from_timer(sdata, t, u.ibss.timer); + timer_container_of(sdata, t, u.ibss.timer); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } @@ -1717,12 +1709,9 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params) { u64 changed = 0; - u32 rate_flags; - struct ieee80211_supported_band *sband; enum ieee80211_chanctx_mode chanmode; struct ieee80211_local *local = sdata->local; int radar_detect_width = 0; - int i; int ret; lockdep_assert_wiphy(local->hw.wiphy); @@ -1765,12 +1754,6 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->u.ibss.last_scan_completed = jiffies; /* fix basic_rates if channel does not support these rates */ - rate_flags = ieee80211_chandef_rate_flags(¶ms->chandef); - sband = local->hw.wiphy->bands[params->chandef.chan->band]; - for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - sdata->u.ibss.basic_rates &= ~BIT(i); - } memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate, sizeof(params->mcast_rate)); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index fb05f3cd37ec4..8afa2404eaa8e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -296,6 +296,14 @@ struct unsol_bcast_probe_resp_data { u8 data[]; }; +struct s1g_short_beacon_data { + struct rcu_head rcu_head; + u8 *short_head; + u8 *short_tail; + int short_head_len; + int short_tail_len; +}; + struct ps_data { /* yes, this looks ugly, but guarantees that we can later use * bitmap_empty :) @@ -306,6 +314,7 @@ struct ps_data { atomic_t num_sta_ps; /* number of stations in PS mode */ int dtim_count; bool dtim_bc_mc; + int sb_count; /* num short beacons til next long beacon */ }; struct ieee80211_if_ap { @@ -1042,6 +1051,7 @@ struct ieee80211_link_data_ap { struct probe_resp __rcu *probe_resp; struct fils_discovery_data __rcu *fils_discovery; struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp; + struct s1g_short_beacon_data __rcu *s1g_short_beacon; /* to be used after channel switch. */ struct cfg80211_beacon_data *next_beacon; @@ -1226,6 +1236,45 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) if ((_link = wiphy_dereference((_local)->hw.wiphy, \ ___sdata->link[___link_id]))) +/* + * for_each_sdata_link_rcu() must be used under RCU read lock. + */ +#define for_each_sdata_link_rcu(_local, _link) \ + /* outer loop just to define the variables ... */ \ + for (struct ieee80211_sub_if_data *___sdata = NULL; \ + !___sdata; \ + ___sdata = (void *)~0 /* always stop */) \ + list_for_each_entry_rcu(___sdata, &(_local)->interfaces, list) \ + if (ieee80211_sdata_running(___sdata)) \ + for (int ___link_id = 0; \ + ___link_id < ARRAY_SIZE((___sdata)->link); \ + ___link_id++) \ + if ((_link = rcu_dereference((___sdata)->link[___link_id]))) + +#define for_each_link_data(sdata, __link) \ + /* outer loop just to define the variable ... 
*/ \ + for (struct ieee80211_sub_if_data *__sdata = (sdata); __sdata; \ + __sdata = NULL /* always stop */) \ + for (int __link_id = 0; \ + __link_id < ARRAY_SIZE((__sdata)->link); __link_id++) \ + if ((!(__sdata)->vif.valid_links || \ + (__sdata)->vif.valid_links & BIT(__link_id)) && \ + ((__link) = sdata_dereference((__sdata)->link[__link_id], \ + (__sdata)))) + +/* + * for_each_link_data_rcu should be used under RCU read lock. + */ +#define for_each_link_data_rcu(sdata, __link) \ + /* outer loop just to define the variable ... */ \ + for (struct ieee80211_sub_if_data *__sdata = (sdata); __sdata; \ + __sdata = NULL /* always stop */) \ + for (int __link_id = 0; \ + __link_id < ARRAY_SIZE((__sdata)->link); __link_id++) \ + if ((!(__sdata)->vif.valid_links || \ + (__sdata)->vif.valid_links & BIT(__link_id)) && \ + ((__link) = rcu_dereference((__sdata)->link[__link_id]))) \ + static inline int ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems, struct cfg80211_rnr_elems *rnr_elems, @@ -1394,8 +1443,6 @@ struct ieee80211_local { bool rx_mcast_action_reg; unsigned int filter_flags; /* FIF_* */ - bool wiphy_ciphers_allocated; - struct cfg80211_chan_def dflt_chandef; bool emulate_chanctx; @@ -1863,7 +1910,8 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset); -int ieee80211_hw_config(struct ieee80211_local *local, u32 changed); +int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx, + u32 changed); int ieee80211_hw_conf_chan(struct ieee80211_local *local); void ieee80211_hw_conf_init(struct ieee80211_local *local); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); @@ -2078,6 +2126,9 @@ static inline void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata ieee80211_vif_set_links(sdata, 0, 0); } +void ieee80211_apvlan_link_setup(struct ieee80211_sub_if_data *sdata); +void ieee80211_apvlan_link_clear(struct ieee80211_sub_if_data *sdata); + /* tx handling */ void ieee80211_clear_tx_pending(struct ieee80211_local *local); void ieee80211_tx_pending(struct tasklet_struct *t); @@ -2193,6 +2244,12 @@ u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs); enum nl80211_smps_mode ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps); +void ieee80211_ht_handle_chanwidth_notif(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct link_sta_info *link_sta, + u8 chanwidth, enum nl80211_band band); + /* VHT */ void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, @@ -2257,6 +2314,9 @@ void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_s1g_cap *s1g_cap_ie, + struct link_sta_info *link_sta); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, @@ -2530,7 +2590,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) } int ieee80211_txq_setup_flows(struct ieee80211_local *local); -void ieee80211_txq_set_params(struct ieee80211_local *local); +void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx); void ieee80211_txq_teardown_flows(struct ieee80211_local *local); void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info 
*sta, @@ -2613,7 +2673,7 @@ void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, /* element building in SKBs */ int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, - u32 basic_rates, u32 rate_flags, u32 masked_rates, + u32 basic_rates, u32 masked_rates, u8 element_id); int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, @@ -2626,6 +2686,8 @@ int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn); +int ieee80211_put_reg_conn(struct sk_buff *skb, + enum ieee80211_channel_flags flags); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, @@ -2679,7 +2741,7 @@ ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, bool radar_required); int __must_check ieee80211_link_use_reserved_context(struct ieee80211_link_data *link); -int ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link); +void ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link); int __must_check ieee80211_link_change_chanreq(struct ieee80211_link_data *link, @@ -2700,7 +2762,11 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_link_data *rsvd_for, bool check_reserved); -bool ieee80211_is_radar_required(struct ieee80211_local *local); +bool ieee80211_is_radar_required(struct ieee80211_local *local, + struct cfg80211_scan_request *req); +bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, + struct cfg80211_scan_request *scan_req, + int radio_idx); void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work); void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, @@ -2709,9 +2775,8 @@ void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct wiphy_work *work); int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings); - -void ieee80211_recalc_dtim(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata); +void ieee80211_recalc_sb_count(struct ieee80211_sub_if_data *sdata, u64 tsf); +void ieee80211_recalc_dtim(struct ieee80211_sub_if_data *sdata, u64 tsf); int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, enum ieee80211_chanctx_mode chanmode, @@ -2795,6 +2860,8 @@ int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); +void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata); + #if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST) #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 969b3e2c496af..07ba68f7cd817 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -146,7 +146,7 @@ void ieee80211_recalc_idle(struct ieee80211_local *local) { u32 change = __ieee80211_recalc_idle(local, false); if (change) - ieee80211_hw_config(local, change); + ieee80211_hw_config(local, -1, change); } static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr, @@ -485,6 +485,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do case NL80211_IFTYPE_MONITOR: list_del_rcu(&sdata->u.mntr.list); break; + case 
NL80211_IFTYPE_AP_VLAN: + ieee80211_apvlan_link_clear(sdata); + break; default: break; } @@ -723,36 +726,65 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do /* do after stop to avoid reconfiguring when we stop anyway */ ieee80211_configure_filter(local); - ieee80211_hw_config(local, hw_reconf_flags); + ieee80211_hw_config(local, -1, hw_reconf_flags); if (local->virt_monitors == local->open_count) ieee80211_add_virtual_monitor(local); } -static void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) +void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_sub_if_data *tx_sdata, *non_tx_sdata, *tmp_sdata; - struct ieee80211_vif *tx_vif = sdata->vif.mbssid_tx_vif; + struct ieee80211_sub_if_data *tx_sdata; + struct ieee80211_bss_conf *link_conf, *tx_bss_conf; + struct ieee80211_link_data *tx_link, *link; + unsigned int link_id; - if (!tx_vif) - return; + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + /* Check if any of the links of current sdata is an MBSSID. */ + for_each_vif_active_link(&sdata->vif, link_conf, link_id) { + tx_bss_conf = sdata_dereference(link_conf->tx_bss_conf, sdata); + if (!tx_bss_conf) + continue; - tx_sdata = vif_to_sdata(tx_vif); - sdata->vif.mbssid_tx_vif = NULL; + tx_sdata = vif_to_sdata(tx_bss_conf->vif); + RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL); - list_for_each_entry_safe(non_tx_sdata, tmp_sdata, - &tx_sdata->local->interfaces, list) { - if (non_tx_sdata != sdata && non_tx_sdata != tx_sdata && - non_tx_sdata->vif.mbssid_tx_vif == tx_vif && - ieee80211_sdata_running(non_tx_sdata)) { - non_tx_sdata->vif.mbssid_tx_vif = NULL; - dev_close(non_tx_sdata->wdev.netdev); + /* If we are not tx sdata, reset tx sdata's tx_bss_conf to avoid recursion + * while closing tx sdata at the end of outer loop below. + */ + if (sdata != tx_sdata) { + tx_link = sdata_dereference(tx_sdata->link[tx_bss_conf->link_id], + tx_sdata); + if (!tx_link) + continue; + + RCU_INIT_POINTER(tx_link->conf->tx_bss_conf, NULL); } - } - if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) { - tx_sdata->vif.mbssid_tx_vif = NULL; - dev_close(tx_sdata->wdev.netdev); + /* loop through sdatas to find if any of their links + * belong to same MBSSID set as the one getting deleted. + */ + for_each_sdata_link(tx_sdata->local, link) { + struct ieee80211_sub_if_data *link_sdata = link->sdata; + + if (link_sdata == sdata || link_sdata == tx_sdata || + rcu_access_pointer(link->conf->tx_bss_conf) != tx_bss_conf) + continue; + + RCU_INIT_POINTER(link->conf->tx_bss_conf, NULL); + + /* Remove all links of matching MLD until dynamic link + * removal can be supported. 
+ */ + cfg80211_stop_iface(link_sdata->wdev.wiphy, &link_sdata->wdev, + GFP_KERNEL); + } + + /* If we are not tx sdata, remove links of tx sdata and proceed */ + if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) + cfg80211_stop_iface(tx_sdata->wdev.wiphy, + &tx_sdata->wdev, GFP_KERNEL); } } @@ -760,21 +792,25 @@ static int ieee80211_stop(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - /* close dependent VLAN and MBSSID interfaces before locking wiphy */ + /* close dependent VLAN interfaces before locking wiphy */ if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_sub_if_data *vlan, *tmpsdata; list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, u.vlan.list) dev_close(vlan->dev); - - ieee80211_stop_mbssid(sdata); } guard(wiphy)(sdata->local->hw.wiphy); wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->activate_links_work); + /* Close the dependent MBSSID interfaces with wiphy lock as we may be + * terminating its partner links too in case of MLD. + */ + if (sdata->vif.type == NL80211_IFTYPE_AP) + ieee80211_stop_mbssid(sdata); + ieee80211_do_stop(sdata, true); return 0; @@ -1114,6 +1150,8 @@ static void ieee80211_sdata_init(struct ieee80211_local *local, { sdata->local = local; + INIT_LIST_HEAD(&sdata->key_list); + /* * Initialize the default link, so we can use link_id 0 for non-MLD, * and that continues to work for non-MLD-aware drivers that use just @@ -1268,6 +1306,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->crypto_tx_tailroom_needed_cnt += master->crypto_tx_tailroom_needed_cnt; + ieee80211_apvlan_link_setup(sdata); + break; } case NL80211_IFTYPE_AP: @@ -1324,7 +1364,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_AP_VLAN: /* no need to tell driver, but set carrier and chanctx */ if (sdata->bss->active) { - ieee80211_link_vlan_copy_chanctx(&sdata->deflink); + struct ieee80211_link_data *link; + + for_each_link_data(sdata, link) { + ieee80211_link_vlan_copy_chanctx(link); + } + netif_carrier_on(dev); ieee80211_set_vif_encap_ops(sdata); } else { @@ -1448,7 +1493,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (local->open_count == 1) ieee80211_hw_conf_init(local); else if (hw_reconf_flags) - ieee80211_hw_config(local, hw_reconf_flags); + ieee80211_hw_config(local, -1, hw_reconf_flags); ieee80211_recalc_ps(local); @@ -1512,6 +1557,35 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_HT) { + switch (mgmt->u.action.u.ht_smps.action) { + case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { + u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; + struct ieee80211_rx_status *status; + struct link_sta_info *link_sta; + struct sta_info *sta; + + sta = sta_info_get_bss(sdata, mgmt->sa); + if (!sta) + break; + + status = IEEE80211_SKB_RXCB(skb); + if (!status->link_valid) + link_sta = &sta->deflink; + else + link_sta = rcu_dereference_protected(sta->link[status->link_id], + lockdep_is_held(&local->hw.wiphy->mtx)); + if (link_sta) + ieee80211_ht_handle_chanwidth_notif(local, sdata, sta, + link_sta, chanwidth, + status->band); + break; + } + default: + WARN_ON(1); + break; + } + } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_VHT) { switch (mgmt->u.action.u.vht_group_notif.action_code) { case WLAN_VHT_ACTION_OPMODE_NOTIF: { @@ -2167,8 +2241,6 @@ int 
ieee80211_if_add(struct ieee80211_local *local, const char *name, ieee80211_init_frag_cache(&sdata->frags); - INIT_LIST_HEAD(&sdata->key_list); - wiphy_delayed_work_init(&sdata->dec_tailroom_needed_wk, ieee80211_delayed_tailroom_dec); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index dcf8643a0baa5..b14e9cd9713ff 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -6,7 +6,7 @@ * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright 2018-2020, 2022-2024 Intel Corporation + * Copyright 2018-2020, 2022-2025 Intel Corporation */ #include <crypto/utils.h> @@ -510,7 +510,8 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, } else { if (!new->local->wowlan) ret = ieee80211_key_enable_hw_accel(new); - else + else if (link_id < 0 || !sdata->vif.active_links || + BIT(link_id) & sdata->vif.active_links) new->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; } @@ -1353,38 +1354,14 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, } EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq); -void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) -{ - struct ieee80211_key *key; - - key = container_of(keyconf, struct ieee80211_key, conf); - - lockdep_assert_wiphy(key->local->hw.wiphy); - - /* - * if key was uploaded, we assume the driver will/has remove(d) - * it, so adjust bookkeeping accordingly - */ - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { - key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; - - if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | - IEEE80211_KEY_FLAG_PUT_MIC_SPACE | - IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) - increment_tailroom_need_count(key->sdata); - } - - ieee80211_key_free(key, false); -} -EXPORT_SYMBOL_GPL(ieee80211_remove_key); - struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf, + u8 idx, u8 *key_data, u8 key_len, int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; + struct ieee80211_key *prev_key; struct ieee80211_key *key; int err; struct ieee80211_link_data *link_data = @@ -1400,8 +1377,37 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return ERR_PTR(-EINVAL); - key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx, - keyconf->keylen, keyconf->key, + if (WARN_ON(idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + + NUM_DEFAULT_BEACON_KEYS)) + return ERR_PTR(-EINVAL); + + prev_key = wiphy_dereference(local->hw.wiphy, + link_data->gtk[idx]); + if (!prev_key) { + if (idx < NUM_DEFAULT_KEYS) { + for (int i = 0; i < NUM_DEFAULT_KEYS; i++) { + if (i == idx) + continue; + prev_key = wiphy_dereference(local->hw.wiphy, + link_data->gtk[i]); + if (prev_key) + break; + } + } else { + /* For IGTK we have 4 and 5 and for BIGTK - 6 and 7 */ + prev_key = wiphy_dereference(local->hw.wiphy, + link_data->gtk[idx ^ 1]); + } + } + + if (WARN_ON(!prev_key)) + return ERR_PTR(-EINVAL); + + if (WARN_ON(key_len < prev_key->conf.keylen)) + return ERR_PTR(-EINVAL); + + key = ieee80211_key_alloc(prev_key->conf.cipher, idx, + prev_key->conf.keylen, key_data, 0, NULL); if (IS_ERR(key)) return ERR_CAST(key); diff --git a/net/mac80211/led.c b/net/mac80211/led.c index 885fa6aa3fc1a..fabbffdd3ac26 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -257,7 +257,8 @@ static unsigned long tpt_trig_traffic(struct ieee80211_local *local, static 
void tpt_trig_timer(struct timer_list *t) { - struct tpt_led_trigger *tpt_trig = from_timer(tpt_trig, t, timer); + struct tpt_led_trigger *tpt_trig = timer_container_of(tpt_trig, t, + timer); struct ieee80211_local *local = tpt_trig->local; unsigned long on, off, tpt; int i; diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 58a76bcd6ae68..d71eabe5abf8d 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -2,7 +2,7 @@ /* * MLO link handling * - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> @@ -12,6 +12,71 @@ #include "key.h" #include "debugfs_netdev.h" +static void ieee80211_update_apvlan_links(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_sub_if_data *vlan; + struct ieee80211_link_data *link; + u16 ap_bss_links = sdata->vif.valid_links; + u16 new_links, vlan_links; + unsigned long add; + + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { + int link_id; + + if (!vlan) + continue; + + /* No support for 4addr with MLO yet */ + if (vlan->wdev.use_4addr) + return; + + vlan_links = vlan->vif.valid_links; + + new_links = ap_bss_links; + + add = new_links & ~vlan_links; + if (!add) + continue; + + ieee80211_vif_set_links(vlan, add, 0); + + for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { + link = sdata_dereference(vlan->link[link_id], vlan); + ieee80211_link_vlan_copy_chanctx(link); + } + } +} + +void ieee80211_apvlan_link_setup(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_sub_if_data *ap_bss = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + u16 new_links = ap_bss->vif.valid_links; + unsigned long add; + int link_id; + + if (!ap_bss->vif.valid_links) + return; + + add = new_links; + for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { + sdata->wdev.valid_links |= BIT(link_id); + ether_addr_copy(sdata->wdev.links[link_id].addr, + ap_bss->wdev.links[link_id].addr); + } + + ieee80211_vif_set_links(sdata, new_links, 0); +} + +void ieee80211_apvlan_link_clear(struct ieee80211_sub_if_data *sdata) +{ + if (!sdata->wdev.valid_links) + return; + + sdata->wdev.valid_links = 0; + ieee80211_vif_clear_links(sdata); +} + void ieee80211_link_setup(struct ieee80211_link_data *link) { if (link->sdata->vif.type == NL80211_IFTYPE_STATION) @@ -28,8 +93,16 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, if (link_id < 0) link_id = 0; - rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf); - rcu_assign_pointer(sdata->link[link_id], link); + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + struct ieee80211_sub_if_data *ap_bss; + struct ieee80211_bss_conf *ap_bss_conf; + + ap_bss = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + ap_bss_conf = sdata_dereference(ap_bss->vif.link_conf[link_id], + ap_bss); + memcpy(link_conf, ap_bss_conf, sizeof(*link_conf)); + } link->sdata = sdata; link->link_id = link_id; @@ -54,6 +127,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, if (!deflink) { switch (sdata->vif.type) { case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: ether_addr_copy(link_conf->addr, sdata->wdev.links[link_id].addr); link_conf->bssid = link_conf->addr; @@ -68,6 +142,9 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_link_debugfs_add(link); } + + rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf); + rcu_assign_pointer(sdata->link[link_id], link); } void ieee80211_link_stop(struct ieee80211_link_data *link) @@ -177,6 
+254,7 @@ static void ieee80211_set_vif_links_bitmaps(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: /* in an AP all links are always active */ sdata->vif.active_links = valid_links; @@ -278,14 +356,25 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata, ieee80211_set_vif_links_bitmaps(sdata, new_links, dormant_links); /* tell the driver */ - ret = drv_change_vif_links(sdata->local, sdata, - old_links & old_active, - new_links & sdata->vif.active_links, - old); + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN) + ret = drv_change_vif_links(sdata->local, sdata, + old_links & old_active, + new_links & sdata->vif.active_links, + old); if (!new_links) ieee80211_debugfs_recreate_netdev(sdata, false); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + ieee80211_update_apvlan_links(sdata); } + /* + * Ignore errors if we are only removing links as removal should + * always succeed + */ + if (!new_links) + ret = 0; + if (ret) { /* restore config */ memcpy(sdata->link, old_data, sizeof(old_data)); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 6b6de43d9420a..9c8f18b258a68 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include <net/mac80211.h> @@ -190,7 +190,8 @@ static u32 ieee80211_calc_hw_conf_chan(struct ieee80211_local *local, return changed; } -int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) +int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx, + u32 changed) { int ret = 0; @@ -201,7 +202,7 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) IEEE80211_CONF_CHANGE_SMPS)); if (changed && local->open_count) { - ret = drv_config(local, changed); + ret = drv_config(local, radio_idx, changed); /* * Goal: * HW reconfiguration should never fail, the driver has told @@ -235,7 +236,7 @@ static int _ieee80211_hw_conf_chan(struct ieee80211_local *local, if (!changed) return 0; - return drv_config(local, changed); + return drv_config(local, -1, changed); } int ieee80211_hw_conf_chan(struct ieee80211_local *local) @@ -269,7 +270,7 @@ void ieee80211_hw_conf_init(struct ieee80211_local *local) ctx ? 
&ctx->conf : NULL); } - WARN_ON(drv_config(local, changed)); + WARN_ON(drv_config(local, -1, changed)); } int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, @@ -407,9 +408,20 @@ void ieee80211_link_info_change_notify(struct ieee80211_sub_if_data *sdata, WARN_ON_ONCE(changed & BSS_CHANGED_VIF_CFG_FLAGS); - if (!changed || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + if (!changed) return; + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + return; + case NL80211_IFTYPE_MONITOR: + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + return; + break; + default: + break; + } + if (!check_sdata_in_driver(sdata)) return; @@ -1024,12 +1036,9 @@ EXPORT_SYMBOL(ieee80211_alloc_hw_nm); static int ieee80211_init_cipher_suites(struct ieee80211_local *local) { - bool have_wep = !fips_enabled; /* FIPS does not permit the use of RC4 */ bool have_mfp = ieee80211_hw_check(&local->hw, MFP_CAPABLE); - int r = 0, w = 0; - u32 *suites; static const u32 cipher_suites[] = { - /* keep WEP first, it may be removed below */ + /* keep WEP and TKIP first, they may be removed below */ WLAN_CIPHER_SUITE_WEP40, WLAN_CIPHER_SUITE_WEP104, WLAN_CIPHER_SUITE_TKIP, @@ -1045,34 +1054,17 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) WLAN_CIPHER_SUITE_BIP_GMAC_256, }; - if (ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL) || - local->hw.wiphy->cipher_suites) { - /* If the driver advertises, or doesn't support SW crypto, - * we only need to remove WEP if necessary. - */ - if (have_wep) - return 0; - - /* well if it has _no_ ciphers ... fine */ - if (!local->hw.wiphy->n_cipher_suites) - return 0; - - /* Driver provides cipher suites, but we need to exclude WEP */ - suites = kmemdup_array(local->hw.wiphy->cipher_suites, - local->hw.wiphy->n_cipher_suites, - sizeof(u32), GFP_KERNEL); - if (!suites) - return -ENOMEM; - - for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) { - u32 suite = local->hw.wiphy->cipher_suites[r]; - - if (suite == WLAN_CIPHER_SUITE_WEP40 || - suite == WLAN_CIPHER_SUITE_WEP104) - continue; - suites[w++] = suite; - } - } else { + if (ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL) && fips_enabled) { + dev_err(local->hw.wiphy->dev.parent, + "Drivers with SW_CRYPTO_CONTROL cannot work with FIPS\n"); + return -EINVAL; + } + + if (WARN_ON(ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL) && + !local->hw.wiphy->cipher_suites)) + return -EINVAL; + + if (fips_enabled || !local->hw.wiphy->cipher_suites) { /* assign the (software supported and perhaps offloaded) * cipher suites */ @@ -1082,19 +1074,13 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) if (!have_mfp) local->hw.wiphy->n_cipher_suites -= 4; - if (!have_wep) { - local->hw.wiphy->cipher_suites += 2; - local->hw.wiphy->n_cipher_suites -= 2; + /* FIPS does not permit the use of RC4 */ + if (fips_enabled) { + local->hw.wiphy->cipher_suites += 3; + local->hw.wiphy->n_cipher_suites -= 3; } - - /* not dynamically allocated, so just return */ - return 0; } - local->hw.wiphy->cipher_suites = suites; - local->hw.wiphy->n_cipher_suites = w; - local->wiphy_ciphers_allocated = true; - return 0; } @@ -1359,7 +1345,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) GFP_KERNEL); if (!local->int_scan_req) return -ENOMEM; - local->int_scan_req->n_channels = channels; eth_broadcast_addr(local->int_scan_req->bssid); @@ -1650,10 +1635,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) ieee80211_led_exit(local); destroy_workqueue(local->workqueue); fail_workqueue: - if 
(local->wiphy_ciphers_allocated) { - kfree(local->hw.wiphy->cipher_suites); - local->wiphy_ciphers_allocated = false; - } kfree(local->int_scan_req); return result; } @@ -1724,11 +1705,6 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) mutex_destroy(&local->iflist_mtx); - if (local->wiphy_ciphers_allocated) { - kfree(local->hw.wiphy->cipher_suites); - local->wiphy_ciphers_allocated = false; - } - idr_for_each(&local->ack_status_frames, ieee80211_free_ack_frame, NULL); idr_destroy(&local->ack_status_frames); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 7257f5610af5a..a4a715f6f1c32 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/unaligned.h> +#include <net/sock.h> #include "ieee80211_i.h" #include "mesh.h" #include "wme.h" @@ -39,7 +40,7 @@ void ieee80211s_stop(void) static void ieee80211_mesh_housekeeping_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - from_timer(sdata, t, u.mesh.housekeeping_timer); + timer_container_of(sdata, t, u.mesh.housekeeping_timer); struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; @@ -683,7 +684,7 @@ int mesh_add_eht_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *sk static void ieee80211_mesh_path_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - from_timer(sdata, t, u.mesh.mesh_path_timer); + timer_container_of(sdata, t, u.mesh.mesh_path_timer); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } @@ -691,7 +692,7 @@ static void ieee80211_mesh_path_timer(struct timer_list *t) static void ieee80211_mesh_path_root_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - from_timer(sdata, t, u.mesh.mesh_path_root_timer); + timer_container_of(sdata, t, u.mesh.mesh_path_root_timer); struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; set_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); @@ -776,7 +777,7 @@ bool ieee80211_mesh_xmit_fast(struct ieee80211_sub_if_data *sdata, if (ethertype < ETH_P_802_3_MIN) return false; - if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + if (sk_requests_wifi_status(skb->sk)) return false; if (skb->ip_summed == CHECKSUM_PARTIAL) { @@ -956,13 +957,10 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) u8 *pos; struct ieee80211_sub_if_data *sdata; int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon); - u32 rate_flags; sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); sband = ieee80211_get_sband(sdata); - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); ie_len_he_cap = ieee80211_ie_len_he_cap(sdata); ie_len_eht_cap = ieee80211_ie_len_eht_cap(sdata); @@ -1091,7 +1089,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) if (ieee80211_put_srates_elem(skb, sband, sdata->vif.bss_conf.basic_rates, - rate_flags, 0, WLAN_EID_SUPP_RATES) || + 0, WLAN_EID_SUPP_RATES) || mesh_add_ds_params_ie(sdata, skb)) goto out_free; @@ -1104,7 +1102,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) if (ieee80211_put_srates_elem(skb, sband, sdata->vif.bss_conf.basic_rates, - rate_flags, 0, WLAN_EID_EXT_SUPP_RATES) || + 0, WLAN_EID_EXT_SUPP_RATES) || mesh_add_rsn_ie(sdata, skb) || mesh_add_ht_cap_ie(sdata, skb) || mesh_add_ht_oper_ie(sdata, skb) || @@ -1204,7 +1202,7 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) return -ENOMEM; } - ieee80211_recalc_dtim(local, sdata); + ieee80211_recalc_dtim(sdata, drv_get_tsf(local, sdata)); 
ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); netif_carrier_on(sdata->dev); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index c94a9c7ca960e..9101858525dd8 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -636,7 +636,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_add_gate(mpath); } rcu_read_unlock(); - } else { + } else if (ifmsh->mshcfg.dot11MeshForwarding) { rcu_read_lock(); mpath = mesh_path_lookup(sdata, target_addr); if (mpath) { @@ -654,6 +654,8 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, } } rcu_read_unlock(); + } else { + forward = false; } if (reply) { @@ -671,7 +673,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, } } - if (forward && ifmsh->mshcfg.dot11MeshForwarding) { + if (forward) { u32 preq_id; u8 hopcount; @@ -1290,7 +1292,7 @@ int mesh_nexthop_lookup(struct ieee80211_sub_if_data *sdata, void mesh_path_timer(struct timer_list *t) { - struct mesh_path *mpath = from_timer(mpath, t, timer); + struct mesh_path *mpath = timer_container_of(mpath, t, timer); struct ieee80211_sub_if_data *sdata = mpath->sdata; int ret; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 9f9cb5af0a973..0319674be8322 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -22,7 +22,7 @@ static void mesh_path_free_rcu(struct mesh_table *tbl, struct mesh_path *mpath); static u32 mesh_table_hash(const void *addr, u32 len, u32 seed) { /* Use last four bytes of hw addr as hash index */ - return jhash_1word(__get_unaligned_cpu32((u8 *)addr + 2), seed); + return jhash_1word(get_unaligned((u32 *)((u8 *)addr + 2)), seed); } static const struct rhashtable_params mesh_rht_params = { diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 96e0a861886a1..cb45a5d2009d2 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -264,7 +264,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action != WLAN_SP_MESH_PEERING_CLOSE) { struct ieee80211_supported_band *sband; - u32 rate_flags, basic_rates; + u32 basic_rates; sband = ieee80211_get_sband(sdata); if (!sband) { @@ -280,16 +280,12 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, put_unaligned_le16(sta->sta.aid, pos); } - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); basic_rates = sdata->vif.bss_conf.basic_rates; if (ieee80211_put_srates_elem(skb, sband, basic_rates, - rate_flags, 0, - WLAN_EID_SUPP_RATES) || + 0, WLAN_EID_SUPP_RATES) || ieee80211_put_srates_elem(skb, sband, basic_rates, - rate_flags, 0, - WLAN_EID_EXT_SUPP_RATES) || + 0, WLAN_EID_EXT_SUPP_RATES) || mesh_add_rsn_ie(sdata, skb) || mesh_add_meshid_ie(sdata, skb) || mesh_add_meshconf_ie(sdata, skb)) @@ -657,7 +653,7 @@ out: void mesh_plink_timer(struct timer_list *t) { - struct mesh_sta *mesh = from_timer(mesh, t, plink_timer); + struct mesh_sta *mesh = timer_container_of(mesh, t, plink_timer); struct sta_info *sta; u16 reason = 0; struct ieee80211_sub_if_data *sdata; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 35eaf0812c5b2..1008eb8e9b13b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -776,10 +776,6 @@ static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING)) return false; - if (chandef->punctured && chandef->chan->band == NL80211_BAND_5GHZ && - 
ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING_5GHZ)) - return false; - return true; } @@ -1218,18 +1214,36 @@ EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_determine_chan_mode); static int ieee80211_config_bw(struct ieee80211_link_data *link, struct ieee802_11_elems *elems, - bool update, u64 *changed, - const char *frame) + bool update, u64 *changed, u16 stype) { struct ieee80211_channel *channel = link->conf->chanreq.oper.chan; struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chan_req chanreq = {}; struct cfg80211_chan_def ap_chandef; enum ieee80211_conn_mode ap_mode; + const char *frame; u32 vht_cap_info = 0; u16 ht_opmode; int ret; + switch (stype) { + case IEEE80211_STYPE_BEACON: + frame = "beacon"; + break; + case IEEE80211_STYPE_ASSOC_RESP: + frame = "assoc response"; + break; + case IEEE80211_STYPE_REASSOC_RESP: + frame = "reassoc response"; + break; + case IEEE80211_STYPE_ACTION: + /* the only action frame that gets here */ + frame = "ML reconf response"; + break; + default: + return -EINVAL; + } + /* don't track any bandwidth changes in legacy/S1G modes */ if (link->u.mgd.conn.mode == IEEE80211_CONN_MODE_LEGACY || link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G) @@ -1278,7 +1292,9 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, ieee80211_min_bw_limit_from_chandef(&chanreq.oper)) ieee80211_chandef_downgrade(&chanreq.oper, NULL); - if (ap_chandef.chan->band == NL80211_BAND_6GHZ && + /* TPE element is not present in (re)assoc/ML reconfig response */ + if (stype == IEEE80211_STYPE_BEACON && + ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { ieee80211_rearrange_tpe(&elems->tpe, &ap_chandef, &chanreq.oper); @@ -1525,9 +1541,9 @@ static void ieee80211_assoc_add_rates(struct ieee80211_local *local, rates = ~0; } - ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates, + ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_SUPP_RATES); - ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates, + ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_EXT_SUPP_RATES); } @@ -1645,6 +1661,30 @@ static size_t ieee80211_add_before_he_elems(struct sk_buff *skb, return noffset; } +static size_t ieee80211_add_before_reg_conn(struct sk_buff *skb, + const u8 *elems, size_t elems_len, + size_t offset) +{ + static const u8 before_reg_conn[] = { + /* + * no need to list the ones split off before HE + * or generated here + */ + WLAN_EID_EXTENSION, WLAN_EID_EXT_DH_PARAMETER, + WLAN_EID_EXTENSION, WLAN_EID_EXT_KNOWN_STA_IDENTIFCATION, + }; + size_t noffset; + + if (!elems_len) + return offset; + + noffset = ieee80211_ie_split(elems, elems_len, before_reg_conn, + ARRAY_SIZE(before_reg_conn), offset); + skb_put_data(skb, elems + offset, noffset - offset); + + return noffset; +} + #define PRESENT_ELEMS_MAX 8 #define PRESENT_ELEM_EXT_OFFS 0x100 @@ -1806,6 +1846,22 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, } /* + * if present, add any custom IEs that go before regulatory + * connectivity element + */ + offset = ieee80211_add_before_reg_conn(skb, extra_elems, + extra_elems_len, offset); + + if (sband->band == NL80211_BAND_6GHZ) { + /* + * as per Section E.2.7 of IEEE 802.11 REVme D7.0, non-AP STA + * capable of operating on the 6 GHz band shall transmit + * regulatory connectivity element. 
+ */ + ieee80211_put_reg_conn(skb, chan->flags); + } + + /* * careful - need to know about all the present elems before * calling ieee80211_assoc_add_ml_elem(), so add this one if * we're going to put it after the ML element @@ -1943,14 +1999,7 @@ ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, } skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); - /* Many APs have broken parsing of the extended MLD capa/ops field, - * dropping (re-)association request frames or replying with association - * response with a failure status if it's present. Without a clear - * indication as to whether the AP supports parsing this field or not do - * not include it in the common information unless strict mode is set. - */ - if (ieee80211_hw_check(&local->hw, STRICT) && - assoc_data->ext_mld_capa_ops) { + if (assoc_data->ext_mld_capa_ops) { ml_elem->control |= cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP); common->len += 2; @@ -2381,9 +2430,26 @@ static void ieee80211_csa_switch_work(struct wiphy *wiphy, * update cfg80211 directly. */ if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) { + struct link_sta_info *link_sta; + struct sta_info *ap_sta; + link->conf->chanreq = link->csa.chanreq; cfg80211_ch_switch_notify(sdata->dev, &link->csa.chanreq.oper, link->link_id); + link->conf->csa_active = false; + + ap_sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); + if (WARN_ON(!ap_sta)) + return; + + link_sta = wiphy_dereference(wiphy, + ap_sta->link[link->link_id]); + if (WARN_ON(!link_sta)) + return; + + link_sta->pub->bandwidth = + _ieee80211_sta_cur_vht_bw(link_sta, + &link->csa.chanreq.oper); return; } @@ -2439,6 +2505,21 @@ static void ieee80211_csa_switch_work(struct wiphy *wiphy, } } + /* + * It is not necessary to reset these timers if any link does not + * have an active CSA and that link still receives the beacons + * when other links have active CSA. + */ + for_each_link_data(sdata, link) { + if (!link->conf->csa_active) + return; + } + + /* + * Reset the beacon monitor and connection monitor timers when CSA + * is active for all links in MLO when channel switch occurs in all + * the links. 
+ */ ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } @@ -2515,7 +2596,8 @@ ieee80211_sta_abort_chanswitch(struct ieee80211_link_data *link) if (!local->ops->abort_channel_switch) return; - ieee80211_link_unreserve_chanctx(link); + if (rcu_access_pointer(link->conf->chanctx_conf)) + ieee80211_link_unreserve_chanctx(link); ieee80211_vif_unblock_queues_csa(sdata); @@ -3181,7 +3263,7 @@ static void ieee80211_enable_ps(struct ieee80211_local *local, return; conf->flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } @@ -3193,7 +3275,7 @@ static void ieee80211_change_ps(struct ieee80211_local *local) ieee80211_enable_ps(local, local->ps_sdata); } else if (conf->flags & IEEE80211_CONF_PS) { conf->flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); @@ -3302,7 +3384,7 @@ void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy, if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_wake_queues_by_reason(&local->hw, @@ -3377,13 +3459,14 @@ void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy, (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } void ieee80211_dynamic_ps_timer(struct timer_list *t) { - struct ieee80211_local *local = from_timer(local, t, dynamic_ps_timer); + struct ieee80211_local *local = timer_container_of(local, t, + dynamic_ps_timer); wiphy_work_queue(local->hw.wiphy, &local->dynamic_ps_enable_work); } @@ -3933,6 +4016,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, lockdep_assert_wiphy(local->hw.wiphy); + if (frame_buf) + memset(frame_buf, 0, IEEE80211_DEAUTH_FRAME_LEN); + if (WARN_ON(!ap_sta)) return; @@ -3985,7 +4071,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, */ if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } local->ps_sdata = NULL; @@ -4281,9 +4367,6 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) lockdep_assert_wiphy(sdata->local->hw.wiphy); - if (WARN_ON(ieee80211_vif_is_mld(&sdata->vif))) - return; - /* * Try sending broadcast probe requests for the last three * probe requests after the first ones failed since some @@ -4329,9 +4412,6 @@ static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, lockdep_assert_wiphy(sdata->local->hw.wiphy); - if (WARN_ON_ONCE(ieee80211_vif_is_mld(&sdata->vif))) - return; - if (!ieee80211_sdata_running(sdata)) return; @@ -4733,6 +4813,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; + bool sae_need_confirm = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -4778,6 +4859,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, jiffies + 
IEEE80211_AUTH_WAIT_SAE_RETRY; ifmgd->auth_data->timeout_started = true; run_again(sdata, ifmgd->auth_data->timeout); + if (auth_transaction == 1) + sae_need_confirm = true; goto notify_driver; } @@ -4821,6 +4904,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, if (!ieee80211_mark_sta_auth(sdata)) return; /* ignore frame -- wait for timeout */ } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && + auth_transaction == 1) { + sae_need_confirm = true; + } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 2) { sdata_info(sdata, "SAE peer confirmed\n"); ifmgd->auth_data->peer_confirmed = true; @@ -4828,7 +4914,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); notify_driver: - drv_mgd_complete_tx(sdata->local, sdata, &info); + if (!sae_need_confirm) + drv_mgd_complete_tx(sdata->local, sdata, &info); } #define case_WLAN(type) \ @@ -5290,7 +5377,9 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, /* check/update if AP changed anything in assoc response vs. scan */ if (ieee80211_config_bw(link, elems, link_id == assoc_data->assoc_link_id, - changed, "assoc response")) { + changed, + le16_to_cpu(mgmt->frame_control) & + IEEE80211_FCTL_STYPE)) { ret = false; goto out; } @@ -5398,6 +5487,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, bss_conf->epcs_support = false; } + if (elems->s1g_oper && + link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G && + elems->s1g_capab) + ieee80211_s1g_cap_to_sta_s1g_cap(sdata, elems->s1g_capab, + link_sta); + bss_conf->twt_broadcast = ieee80211_twt_bcast_support(sdata, bss_conf, sband, link_sta); @@ -5918,6 +6013,7 @@ ieee80211_ap_power_type(u8 control) return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: return IEEE80211_REG_SP_AP; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return IEEE80211_REG_VLP_AP; @@ -7194,6 +7290,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg; struct ieee80211_mgmt *mgmt = (void *) hdr; + struct ieee80211_ext *ext = NULL; size_t baselen; struct ieee802_11_elems *elems; struct ieee80211_local *local = sdata->local; @@ -7219,12 +7316,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, /* Process beacon from the current BSS */ bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { - struct ieee80211_ext *ext = (void *) mgmt; - - if (ieee80211_is_s1g_short_beacon(ext->frame_control)) - variable = ext->u.s1g_short_beacon.variable; - else - variable = ext->u.s1g_beacon.variable; + ext = (void *)mgmt; + variable = ext->u.s1g_beacon.variable + + ieee80211_s1g_optional_len(ext->frame_control); } baselen = (u8 *) variable - (u8 *) mgmt; @@ -7342,7 +7436,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_send_nullfunc(local, sdata, false); @@ -7409,7 +7503,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, } if ((ncrc == link->u.mgd.beacon_crc && link->u.mgd.beacon_crc_valid) || - 
ieee80211_is_s1g_short_beacon(mgmt->frame_control)) + (ext && ieee80211_is_s1g_short_beacon(ext->frame_control, + parse_params.start, + parse_params.len))) goto free; link->u.mgd.beacon_crc = ncrc; link->u.mgd.beacon_crc_valid = true; @@ -7478,7 +7574,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); - if (ieee80211_config_bw(link, elems, true, &changed, "beacon")) { + if (ieee80211_config_bw(link, elems, true, &changed, + IEEE80211_STYPE_BEACON)) { ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); @@ -8082,7 +8179,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, static void ieee80211_sta_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - from_timer(sdata, t, u.mgd.timer); + timer_container_of(sdata, t, u.mgd.timer); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } @@ -8385,16 +8482,32 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) } } +static bool +ieee80211_is_csa_in_progress(struct ieee80211_sub_if_data *sdata) +{ + /* + * In MLO, check the CSA flags 'active' and 'waiting_bcn' for all + * the links. + */ + struct ieee80211_link_data *link; + + guard(rcu)(); + + for_each_link_data_rcu(sdata, link) { + if (!(link->conf->csa_active && + !link->u.mgd.csa.waiting_bcn)) + return false; + } + + return true; +} + static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - from_timer(sdata, t, u.mgd.bcn_mon_timer); - - if (WARN_ON(ieee80211_vif_is_mld(&sdata->vif))) - return; + timer_container_of(sdata, t, u.mgd.bcn_mon_timer); - if (sdata->vif.bss_conf.csa_active && - !sdata->deflink.u.mgd.csa.waiting_bcn) + if (ieee80211_is_csa_in_progress(sdata)) return; if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) @@ -8405,36 +8518,69 @@ static void ieee80211_sta_bcn_mon_timer(struct timer_list *t) &sdata->u.mgd.beacon_connection_loss_work); } -static void ieee80211_sta_conn_mon_timer(struct timer_list *t) +static unsigned long +ieee80211_latest_active_link_conn_timeout(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_sub_if_data *sdata = - from_timer(sdata, t, u.mgd.conn_mon_timer); - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_local *local = sdata->local; + unsigned long latest_timeout = jiffies; + unsigned int link_id; struct sta_info *sta; - unsigned long timeout; - if (WARN_ON(ieee80211_vif_is_mld(&sdata->vif))) - return; - - if (sdata->vif.bss_conf.csa_active && - !sdata->deflink.u.mgd.csa.waiting_bcn) - return; + guard(rcu)(); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); if (!sta) + return 0; + + for (link_id = 0; link_id < ARRAY_SIZE(sta->link); + link_id++) { + struct link_sta_info *link_sta; + unsigned long timeout; + + link_sta = rcu_dereference(sta->link[link_id]); + if (!link_sta) + continue; + + timeout = link_sta->status_stats.last_ack; + if (time_before(timeout, link_sta->rx_stats.last_rx)) + timeout = link_sta->rx_stats.last_rx; + + timeout += IEEE80211_CONNECTION_IDLE_TIME; + + /* + * latest_timeout holds the timeout of the link + * that will expire last among all links in an + * non-AP MLD STA. This ensures that the connection + * monitor timer is only reset if at least one link + * is still active, and it is scheduled to fire at + * the latest possible timeout. 
+ */ + if (time_after(timeout, latest_timeout)) + latest_timeout = timeout; + } + + return latest_timeout; +} + +static void ieee80211_sta_conn_mon_timer(struct timer_list *t) +{ + struct ieee80211_sub_if_data *sdata = + timer_container_of(sdata, t, u.mgd.conn_mon_timer); + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_local *local = sdata->local; + unsigned long latest_timeout; + + if (ieee80211_is_csa_in_progress(sdata)) return; - timeout = sta->deflink.status_stats.last_ack; - if (time_before(sta->deflink.status_stats.last_ack, sta->deflink.rx_stats.last_rx)) - timeout = sta->deflink.rx_stats.last_rx; - timeout += IEEE80211_CONNECTION_IDLE_TIME; + latest_timeout = ieee80211_latest_active_link_conn_timeout(sdata); - /* If timeout is after now, then update timer to fire at + /* + * If latest timeout is after now, then update timer to fire at * the later date, but do not actually probe at this time. */ - if (time_is_after_jiffies(timeout)) { - mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(timeout)); + if (time_is_after_jiffies(latest_timeout)) { + mod_timer(&ifmgd->conn_mon_timer, + round_jiffies_up(latest_timeout)); return; } @@ -8694,21 +8840,33 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, bool have_sta = false; bool mlo; int err; + u16 new_links; if (link_id >= 0) { mlo = true; if (WARN_ON(!ap_mld_addr)) return -EINVAL; - err = ieee80211_vif_set_links(sdata, BIT(link_id), 0); + new_links = BIT(link_id); } else { if (WARN_ON(ap_mld_addr)) return -EINVAL; ap_mld_addr = cbss->bssid; - err = ieee80211_vif_set_links(sdata, 0, 0); + new_links = 0; link_id = 0; mlo = false; } + if (assoc) { + rcu_read_lock(); + have_sta = sta_info_get(sdata, ap_mld_addr); + rcu_read_unlock(); + } + + if (mlo && !have_sta && + WARN_ON(sdata->vif.valid_links || sdata->vif.active_links)) + return -EINVAL; + + err = ieee80211_vif_set_links(sdata, new_links, 0); if (err) return err; @@ -8729,12 +8887,6 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, goto out_err; } - if (assoc) { - rcu_read_lock(); - have_sta = sta_info_get(sdata, ap_mld_addr); - rcu_read_unlock(); - } - if (!have_sta) { if (mlo) new_sta = sta_info_alloc_with_link(sdata, ap_mld_addr, @@ -9334,6 +9486,39 @@ out_rcu: return err; } +static bool +ieee80211_mgd_assoc_bss_has_mld_ext_capa_ops(struct cfg80211_assoc_request *req) +{ + const struct cfg80211_bss_ies *ies; + struct cfg80211_bss *bss; + const struct element *ml; + + /* not an MLO connection if link_id < 0, so irrelevant */ + if (req->link_id < 0) + return false; + + bss = req->links[req->link_id].bss; + + guard(rcu)(); + ies = rcu_dereference(bss->ies); + for_each_element_extid(ml, WLAN_EID_EXT_EHT_MULTI_LINK, + ies->data, ies->len) { + const struct ieee80211_multi_link_elem *mle; + + if (!ieee80211_mle_type_ok(ml->data + 1, + IEEE80211_ML_CONTROL_TYPE_BASIC, + ml->datalen - 1)) + continue; + + mle = (void *)(ml->data + 1); + if (mle->control & cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)) + return true; + } + + return false; + +} + int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { @@ -9386,7 +9571,17 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, else memcpy(assoc_data->ap_addr, cbss->bssid, ETH_ALEN); - assoc_data->ext_mld_capa_ops = cpu_to_le16(req->ext_mld_capa_ops); + /* + * Many APs have broken parsing of the extended MLD capa/ops field, + * dropping (re-)association request frames or replying with association + * response 
with a failure status if it's present. + * Set our value from the userspace request only in strict mode or if + * the AP also had that field present. + */ + if (ieee80211_hw_check(&local->hw, STRICT) || + ieee80211_mgd_assoc_bss_has_mld_ext_capa_ops(req)) + assoc_data->ext_mld_capa_ops = + cpu_to_le16(req->ext_mld_capa_ops); if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; @@ -10029,7 +10224,6 @@ void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!add_links_data->link[link_id].bss || !(sdata->u.mgd.reconf.added_links & BIT(link_id))) - continue; valid_links |= BIT(link_id); @@ -10701,8 +10895,8 @@ static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, */ for_each_mle_subelement(sub, (const u8 *)elems->ml_epcs, elems->ml_epcs_len) { + struct ieee802_11_elems *link_elems __free(kfree) = NULL; struct ieee80211_link_data *link; - struct ieee802_11_elems *link_elems __free(kfree); u8 *pos = (void *)sub->data; u16 control; ssize_t len; diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index ece1e83c7b2fa..a5d4358f122ae 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -146,7 +146,7 @@ void ieee80211_ocb_work(struct ieee80211_sub_if_data *sdata) static void ieee80211_ocb_housekeeping_timer(struct timer_list *t) { struct ieee80211_sub_if_data *sdata = - from_timer(sdata, t, u.ocb.housekeeping_timer); + timer_container_of(sdata, t, u.ocb.housekeeping_timer); struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 2b9abc27462eb..13df6321634de 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -39,7 +39,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) if (local->hw.conf.flags & IEEE80211_CONF_PS) { offchannel_ps_enabled = true; local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } if (!offchannel_ps_enabled || @@ -567,6 +567,7 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, { struct ieee80211_roc_work *roc, *tmp; bool queued = false, combine_started = true; + struct cfg80211_scan_request *req; int ret; lockdep_assert_wiphy(local->hw.wiphy); @@ -612,9 +613,11 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, roc->mgmt_tx_cookie = *cookie; } + req = wiphy_dereference(local->hw.wiphy, local->scan_req); + /* if there's no need to queue, handle it immediately */ if (list_empty(&local->roc_list) && - !local->scanning && !ieee80211_is_radar_required(local)) { + !local->scanning && !ieee80211_is_radar_required(local, req)) { /* if not HW assist, just queue & schedule work */ if (!local->ops->remain_on_channel) { list_add_tail(&roc->list, &local->roc_list); diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index 6da39c864f45b..c5e0f7f460048 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -758,7 +758,6 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, { const struct element *elem, *sub; size_t profile_len = 0; - bool found = false; if (!bss || !bss->transmitted_bss) return profile_len; @@ -809,15 +808,14 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, index[2], new_bssid); if (ether_addr_equal(new_bssid, bss->bssid)) { - found = true; elems->bssid_index_len = index[1]; 
elems->bssid_index = (void *)&index[2]; - break; + return profile_len; } } } - return found ? profile_len : 0; + return 0; } static void @@ -1101,7 +1099,6 @@ int ieee80211_parse_bitrates(enum nl80211_chan_width width, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) { - u32 rate_flags = ieee80211_chanwidth_rate_flags(width); struct ieee80211_rate *br; int brate, rate, i, j, count = 0; @@ -1112,8 +1109,6 @@ int ieee80211_parse_bitrates(enum nl80211_chan_width width, for (j = 0; j < sband->n_bitrates; j++) { br = &sband->bitrates[j]; - if ((rate_flags & br->flags) != rate_flags) - continue; brate = DIV_ROUND_UP(br->bitrate, 5); if (brate == rate) { diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index a9cc832240a54..5a508d99e84f7 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -108,7 +108,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) sdata->u.mgd.powersave && !(local->hw.conf.flags & IEEE80211_CONF_PS)) { local->hw.conf.flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 0d056db9f81e6..3cb2ad6d0b280 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -368,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, u32 rate_mask) { + u32 rate_flags = 0; int i; - u32 rate_flags = - ieee80211_chandef_rate_flags(&hw->conf.chandef); if (sband->band == NL80211_BAND_S1GHZ) { info->control.rates[0].flags |= IEEE80211_TX_RC_S1G_MCS; @@ -778,14 +777,9 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { - u32 i, flags; + u32 i; *mask = sdata->rc_rateidx_mask[sband->band]; - flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); - for (i = 0; i < sband->n_bitrates; i++) { - if ((flags & sband->bitrates[i].flags) != flags) - *mask &= ~BIT(i); - } if (*mask == (1 << sband->n_bitrates) - 1 && !sdata->rc_has_mcs_mask[sband->band] && @@ -990,8 +984,6 @@ int rate_control_set_rates(struct ieee80211_hw *hw, if (sta->uploaded) drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); - ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta)); - return 0; } EXPORT_SYMBOL(rate_control_set_rates); diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 706cbc99f718b..f66910013218d 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1873,16 +1873,13 @@ minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) static void minstrel_ht_fill_rate_array(u8 *dest, struct ieee80211_supported_band *sband, - const s16 *bitrates, int n_rates, u32 rate_flags) + const s16 *bitrates, int n_rates) { int i, j; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *rate = &sband->bitrates[i]; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; - for (j = 0; j < n_rates; j++) { if (rate->bitrate != bitrates[j]) continue; @@ -1898,7 +1895,6 @@ minstrel_ht_init_cck_rates(struct minstrel_priv *mp) { static const s16 bitrates[4] = { 10, 20, 55, 110 }; struct ieee80211_supported_band *sband; - u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->cck_rates, 0xff, sizeof(mp->cck_rates)); sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; @@ -1908,8 +1904,7 @@ 
minstrel_ht_init_cck_rates(struct minstrel_priv *mp) BUILD_BUG_ON(ARRAY_SIZE(mp->cck_rates) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->cck_rates, sband, minstrel_cck_bitrates, - ARRAY_SIZE(minstrel_cck_bitrates), - rate_flags); + ARRAY_SIZE(minstrel_cck_bitrates)); } static void @@ -1917,7 +1912,6 @@ minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) { static const s16 bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; struct ieee80211_supported_band *sband; - u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->ofdm_rates[band], 0xff, sizeof(mp->ofdm_rates[band])); sband = mp->hw->wiphy->bands[band]; @@ -1927,8 +1921,7 @@ minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) BUILD_BUG_ON(ARRAY_SIZE(mp->ofdm_rates[band]) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->ofdm_rates[band], sband, minstrel_ofdm_bitrates, - ARRAY_SIZE(minstrel_ofdm_bitrates), - rate_flags); + ARRAY_SIZE(minstrel_ofdm_bitrates)); } static void * diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 09beb65d6108b..4d4ff4d4917a2 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -231,8 +231,19 @@ static void __ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, skb_queue_tail(&sdata->skb_queue, skb); wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); - if (sta) - sta->deflink.rx_stats.packets++; + if (sta) { + struct link_sta_info *link_sta_info; + + if (link_id >= 0) { + link_sta_info = rcu_dereference(sta->link[link_id]); + if (!link_sta_info) + return; + } else { + link_sta_info = &sta->deflink; + } + + link_sta_info->rx_stats.packets++; + } } static void ieee80211_queue_skb_to_iface(struct ieee80211_sub_if_data *sdata, @@ -1521,9 +1532,8 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) } if (rx->sdata->vif.type == NL80211_IFTYPE_AP && - cfg80211_rx_spurious_frame(rx->sdata->dev, - hdr->addr2, - GFP_ATOMIC)) + cfg80211_rx_spurious_frame(rx->sdata->dev, hdr->addr2, + rx->link_id, GFP_ATOMIC)) return RX_DROP_U_SPURIOUS; return RX_DROP; @@ -1861,7 +1871,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) if (!test_and_set_sta_flag(sta, WLAN_STA_4ADDR_EVENT)) cfg80211_rx_unexpected_4addr_frame( rx->sdata->dev, sta->sta.addr, - GFP_ATOMIC); + rx->link_id, GFP_ATOMIC); return RX_DROP_U_UNEXPECTED_4ADDR_FRAME; } /* @@ -3022,7 +3032,6 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; struct sk_buff_head frame_list; - ieee80211_rx_result res; struct ethhdr ethhdr; const u8 *check_da = ethhdr.h_dest, *check_sa = ethhdr.h_source; @@ -3084,24 +3093,18 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) while (!skb_queue_empty(&frame_list)) { rx->skb = __skb_dequeue(&frame_list); - res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb); - switch (res) { + switch (ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb)) { case RX_QUEUED: - continue; - case RX_CONTINUE: break; + case RX_CONTINUE: + if (ieee80211_frame_allowed(rx, fc)) { + ieee80211_deliver_skb(rx); + break; + } + fallthrough; default: - goto free; + dev_kfree_skb(rx->skb); } - - if (!ieee80211_frame_allowed(rx, fc)) - goto free; - - ieee80211_deliver_skb(rx); - continue; - -free: - dev_kfree_skb(rx->skb); } return RX_QUEUED; @@ -3187,7 +3190,8 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) if (rx->sta && !test_and_set_sta_flag(rx->sta, WLAN_STA_4ADDR_EVENT)) 
cfg80211_rx_unexpected_4addr_frame( - rx->sdata->dev, rx->sta->sta.addr, GFP_ATOMIC); + rx->sdata->dev, rx->sta->sta.addr, rx->link_id, + GFP_ATOMIC); return RX_DROP; } @@ -3576,41 +3580,18 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { - struct ieee80211_supported_band *sband; u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; - enum ieee80211_sta_rx_bandwidth max_bw, new_bw; - struct sta_opmode_info sta_opmode = {}; + + if (chanwidth != IEEE80211_HT_CHANWIDTH_20MHZ && + chanwidth != IEEE80211_HT_CHANWIDTH_ANY) + goto invalid; /* If it doesn't support 40 MHz it can't change ... */ if (!(rx->link_sta->pub->ht_cap.cap & - IEEE80211_HT_CAP_SUP_WIDTH_20_40)) - goto handled; - - if (chanwidth == IEEE80211_HT_CHANWIDTH_20MHZ) - max_bw = IEEE80211_STA_RX_BW_20; - else - max_bw = ieee80211_sta_cap_rx_bw(rx->link_sta); - - /* set cur_max_bandwidth and recalc sta bw */ - rx->link_sta->cur_max_bandwidth = max_bw; - new_bw = ieee80211_sta_cur_vht_bw(rx->link_sta); - - if (rx->link_sta->pub->bandwidth == new_bw) + IEEE80211_HT_CAP_SUP_WIDTH_20_40)) goto handled; - rx->link_sta->pub->bandwidth = new_bw; - sband = rx->local->hw.wiphy->bands[status->band]; - sta_opmode.bw = - ieee80211_sta_rx_bw_to_chan_width(rx->link_sta); - sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED; - - rate_control_rate_update(local, sband, rx->link_sta, - IEEE80211_RC_BW_CHANGED); - cfg80211_sta_opmode_change_notify(sdata->dev, - rx->sta->addr, - &sta_opmode, - GFP_ATOMIC); - goto handled; + goto queue; } default: goto invalid; @@ -4234,10 +4215,16 @@ static bool ieee80211_rx_data_set_sta(struct ieee80211_rx_data *rx, rx->link_sta = NULL; } - if (link_id < 0) - rx->link = &rx->sdata->deflink; - else if (!ieee80211_rx_data_set_link(rx, link_id)) + if (link_id < 0) { + if (ieee80211_vif_is_mld(&rx->sdata->vif) && + sta && !sta->sta.valid_links) + rx->link = + rcu_dereference(rx->sdata->link[sta->deflink.link_id]); + else + rx->link = &rx->sdata->deflink; + } else if (!ieee80211_rx_data_set_link(rx, link_id)) { return false; + } return true; } @@ -4432,6 +4419,10 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) if (!multicast && !ether_addr_equal(sdata->dev->dev_addr, hdr->addr1)) return false; + /* reject invalid/our STA address */ + if (!is_valid_ether_addr(hdr->addr2) || + ether_addr_equal(sdata->dev->dev_addr, hdr->addr2)) + return false; if (!rx->sta) { int rate_idx; if (status->encoding != RX_ENC_LEGACY) @@ -5115,8 +5106,24 @@ static bool ieee80211_rx_for_interface(struct ieee80211_rx_data *rx, struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); sta = sta_info_get_bss(rx->sdata, hdr->addr2); - if (status->link_valid) + if (status->link_valid) { link_id = status->link_id; + } else if (ieee80211_vif_is_mld(&rx->sdata->vif) && + status->freq) { + struct ieee80211_link_data *link; + struct ieee80211_chanctx_conf *conf; + + for_each_link_data_rcu(rx->sdata, link) { + conf = rcu_dereference(link->conf->chanctx_conf); + if (!conf || !conf->def.chan) + continue; + + if (status->freq == conf->def.chan->center_freq) { + link_id = link->link_id; + break; + } + } + } } if (!ieee80211_rx_data_set_sta(rx, sta, link_id)) diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index d4ed0c0a335ca..1f68df6e80670 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -194,3 +194,29 @@ void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, break; } } + +void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, 
+ const struct ieee80211_s1g_cap *s1g_cap_ie, + struct link_sta_info *link_sta) +{ + struct ieee80211_sta_s1g_cap *s1g_cap = &link_sta->pub->s1g_cap; + + memset(s1g_cap, 0, sizeof(*s1g_cap)); + + memcpy(s1g_cap->cap, s1g_cap_ie->capab_info, sizeof(s1g_cap->cap)); + memcpy(s1g_cap->nss_mcs, s1g_cap_ie->supp_mcs_nss, + sizeof(s1g_cap->nss_mcs)); + + s1g_cap->s1g = true; + + /* Maximum MPDU length is 1 bit for S1G */ + if (s1g_cap->cap[3] & S1G_CAP3_MAX_MPDU_LEN) { + link_sta->pub->agg.max_amsdu_len = + IEEE80211_MAX_MPDU_LEN_VHT_7991; + } else { + link_sta->pub->agg.max_amsdu_len = + IEEE80211_MAX_MPDU_LEN_VHT_3895; + } + + ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); +} diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index cb70790718858..dbf98aa4cd67f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/if_arp.h> @@ -240,6 +240,9 @@ static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 scan_flags, const u8 *da) { + struct ieee80211_link_data *link_sdata; + u8 link_id; + if (!sdata) return false; @@ -251,7 +254,20 @@ static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; - return ether_addr_equal(da, sdata->vif.addr); + + if (ether_addr_equal(da, sdata->vif.addr)) + return true; + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + link_sdata = rcu_dereference(sdata->link[link_id]); + if (!link_sdata) + continue; + + if (ether_addr_equal(da, link_sdata->conf->addr)) + return true; + } + + return false; } void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) @@ -260,6 +276,7 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; + struct ieee80211_ext *ext; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); @@ -269,12 +286,10 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_short_beacon.variable); - else - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_beacon); + ext = (struct ieee80211_ext *)mgmt; + min_hdr_len = + offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + + ieee80211_s1g_optional_len(ext->frame_control); } if (skb->len < min_hdr_len) @@ -571,7 +586,8 @@ static int ieee80211_start_sw_scan(struct ieee80211_local *local, return 0; } -static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) +static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata, + struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; @@ -579,7 +595,7 @@ static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) lockdep_assert_wiphy(local->hw.wiphy); - if (!ieee80211_is_radar_required(local)) + if (!ieee80211_is_radar_required(local, req)) return true; if (!regulatory_pre_cac_allowed(local->hw.wiphy)) @@ -595,9 +611,10 @@ static bool 
__ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) } static bool ieee80211_can_scan(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) + struct ieee80211_sub_if_data *sdata, + struct cfg80211_scan_request *req) { - if (!__ieee80211_can_leave_ch(sdata)) + if (!__ieee80211_can_leave_ch(sdata, req)) return false; if (!list_empty(&local->roc_list)) @@ -612,15 +629,19 @@ static bool ieee80211_can_scan(struct ieee80211_local *local, void ieee80211_run_deferred_scan(struct ieee80211_local *local) { + struct cfg80211_scan_request *req; + lockdep_assert_wiphy(local->hw.wiphy); if (!local->scan_req || local->scanning) return; + req = wiphy_dereference(local->hw.wiphy, local->scan_req); if (!ieee80211_can_scan(local, rcu_dereference_protected( local->scan_sdata, - lockdep_is_held(&local->hw.wiphy->mtx)))) + lockdep_is_held(&local->hw.wiphy->mtx)), + req)) return; wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, @@ -717,10 +738,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, !(sdata->vif.active_links & BIT(req->tsf_report_link_id))) return -EINVAL; - if (!__ieee80211_can_leave_ch(sdata)) + if (!__ieee80211_can_leave_ch(sdata, req)) return -EBUSY; - if (!ieee80211_can_scan(local, sdata)) { + if (!ieee80211_can_scan(local, sdata, req)) { /* wait for the work to finish/time out */ rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); @@ -779,6 +800,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, local->hw_scan_req->req.scan_6ghz_params = req->scan_6ghz_params; local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; + local->hw_scan_req->req.first_part = req->first_part; /* * After allocating local->hw_scan_req, we must diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index c6015cd00372c..7422888d36409 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -147,14 +147,14 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; u32 control_freq, center_freq1, center_freq2; enum nl80211_chan_width chan_width; - struct { - struct ieee80211_he_operation _oper; - struct ieee80211_he_6ghz_oper _6ghz_oper; - } __packed he; - struct { - struct ieee80211_eht_operation _oper; - struct ieee80211_eht_operation_info _oper_info; - } __packed eht; + DEFINE_RAW_FLEX(struct ieee80211_he_operation, he, optional, + sizeof(struct ieee80211_he_6ghz_oper)); + struct ieee80211_he_6ghz_oper *_6ghz_oper = + (struct ieee80211_he_6ghz_oper *)he->optional; + DEFINE_RAW_FLEX(struct ieee80211_eht_operation, eht, optional, + sizeof(struct ieee80211_eht_operation_info)); + struct ieee80211_eht_operation_info *_oper_info = + (struct ieee80211_eht_operation_info *)eht->optional; const struct ieee80211_eht_operation *eht_oper; if (conn->mode < IEEE80211_CONN_MODE_HE) { @@ -167,38 +167,38 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, center_freq2 = chandef->center_freq2; chan_width = chandef->width; - he._oper.he_oper_params = + he->he_oper_params = le32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); - he._6ghz_oper.primary = + _6ghz_oper->primary = ieee80211_frequency_to_channel(control_freq); - he._6ghz_oper.ccfs0 = ieee80211_frequency_to_channel(center_freq1); - he._6ghz_oper.ccfs1 = center_freq2 ? + _6ghz_oper->ccfs0 = ieee80211_frequency_to_channel(center_freq1); + _6ghz_oper->ccfs1 = center_freq2 ? 
ieee80211_frequency_to_channel(center_freq2) : 0; switch (chan_width) { case NL80211_CHAN_WIDTH_320: - he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0; - he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -16 : 16; - he._6ghz_oper.control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; + _6ghz_oper->ccfs1 = _6ghz_oper->ccfs0; + _6ghz_oper->ccfs0 += control_freq < center_freq1 ? -16 : 16; + _6ghz_oper->control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; break; case NL80211_CHAN_WIDTH_160: - he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0; - he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -8 : 8; + _6ghz_oper->ccfs1 = _6ghz_oper->ccfs0; + _6ghz_oper->ccfs0 += control_freq < center_freq1 ? -8 : 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: - he._6ghz_oper.control = + _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: - he._6ghz_oper.control = + _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: - he._6ghz_oper.control = + _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: - he._6ghz_oper.control = + _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } @@ -206,15 +206,14 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, if (conn->mode < IEEE80211_CONN_MODE_EHT) { eht_oper = NULL; } else { - eht._oper.params = IEEE80211_EHT_OPER_INFO_PRESENT; - eht._oper_info.control = he._6ghz_oper.control; - eht._oper_info.ccfs0 = he._6ghz_oper.ccfs0; - eht._oper_info.ccfs1 = he._6ghz_oper.ccfs1; - eht_oper = &eht._oper; + eht->params = IEEE80211_EHT_OPER_INFO_PRESENT; + _oper_info->control = _6ghz_oper->control; + _oper_info->ccfs0 = _6ghz_oper->ccfs0; + _oper_info->ccfs1 = _6ghz_oper->ccfs1; + eht_oper = eht; } - if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper, - eht_oper, chandef)) + if (!ieee80211_chandef_he_6ghz_oper(local, he, eht_oper, chandef)) chandef->chan = NULL; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 248e1f63bf739..8c550aab9bdce 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/module.h> @@ -18,7 +18,6 @@ #include <linux/timer.h> #include <linux/rtnetlink.h> -#include <net/codel.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" @@ -356,6 +355,50 @@ static void sta_info_free_link(struct link_sta_info *link_sta) free_percpu(link_sta->pcpu_rx_stats); } +static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id) +{ + struct link_sta_info *link_sta = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + struct ieee80211_link_data *link; + int ac, tid; + u32 thr; + + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { + sta->rem_link_stats.tx_packets += + link_sta->tx_stats.packets[ac]; + sta->rem_link_stats.tx_bytes += link_sta->tx_stats.bytes[ac]; + } + + sta->rem_link_stats.rx_packets += link_sta->rx_stats.packets; + sta->rem_link_stats.rx_bytes += link_sta->rx_stats.bytes; + sta->rem_link_stats.tx_retries += link_sta->status_stats.retry_count; + sta->rem_link_stats.tx_failed += link_sta->status_stats.retry_failed; + sta->rem_link_stats.rx_dropped_misc += link_sta->rx_stats.dropped; + + thr = sta_get_expected_throughput(sta); + if (thr != 0) + 
sta->rem_link_stats.expected_throughput += thr; + + for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { + sta->rem_link_stats.pertid_stats.rx_msdu += + link_sta->rx_stats.msdu[tid]; + sta->rem_link_stats.pertid_stats.tx_msdu += + link_sta->tx_stats.msdu[tid]; + sta->rem_link_stats.pertid_stats.tx_msdu_retries += + link_sta->status_stats.msdu_retries[tid]; + sta->rem_link_stats.pertid_stats.tx_msdu_failed += + link_sta->status_stats.msdu_failed[tid]; + } + + if (sta->sdata->vif.type == NL80211_IFTYPE_STATION) { + link = wiphy_dereference(sta->sdata->local->hw.wiphy, + sta->sdata->link[link_id]); + if (link) + sta->rem_link_stats.beacon_loss_count += + link->u.mgd.beacon_loss_count; + } +} + static void sta_remove_link(struct sta_info *sta, unsigned int link_id, bool unhash) { @@ -378,6 +421,10 @@ static void sta_remove_link(struct sta_info *sta, unsigned int link_id, alloc = container_of(link_sta, typeof(*alloc), info); sta->sta.valid_links &= ~BIT(link_id); + + /* store removed link info for accumulated stats consistency */ + sta_accumulate_removed_link_stats(sta, link_id); + RCU_INIT_POINTER(sta->link[link_id], NULL); RCU_INIT_POINTER(sta->sta.link[link_id], NULL); if (alloc) { @@ -682,6 +729,7 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, IEEE80211_RATE_MANDATORY_G; break; case NL80211_BAND_5GHZ: + case NL80211_BAND_6GHZ: mandatory = IEEE80211_RATE_MANDATORY_A; break; case NL80211_BAND_60GHZ: @@ -701,12 +749,6 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, } } - sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD; - sta->cparams.target = MS2TIME(20); - sta->cparams.interval = MS2TIME(100); - sta->cparams.ecn = true; - sta->cparams.ce_threshold_selector = 0; - sta->cparams.ce_threshold_mask = 0; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); @@ -1549,7 +1591,8 @@ int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, static void sta_info_cleanup(struct timer_list *t) { - struct ieee80211_local *local = from_timer(local, t, sta_cleanup); + struct ieee80211_local *local = timer_container_of(local, t, + sta_cleanup); struct sta_info *sta; bool timer_needed = false; @@ -1657,7 +1700,7 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { - unsigned long last_active = ieee80211_sta_last_active(sta); + unsigned long last_active = ieee80211_sta_last_active(sta, -1); if (sdata != sta->sdata) continue; @@ -2426,18 +2469,27 @@ void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, } static struct ieee80211_sta_rx_stats * -sta_get_last_rx_stats(struct sta_info *sta) +sta_get_last_rx_stats(struct sta_info *sta, int link_id) { - struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; + struct ieee80211_sta_rx_stats *stats; + struct link_sta_info *link_sta_info; int cpu; - if (!sta->deflink.pcpu_rx_stats) + if (link_id < 0) + link_sta_info = &sta->deflink; + else + link_sta_info = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + + stats = &link_sta_info->rx_stats; + + if (!link_sta_info->pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; - cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); + cpustats = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; @@ -2505,9 +2557,10 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, } } -static int sta_set_rate_info_rx(struct 
sta_info *sta, struct rate_info *rinfo) +static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo, + int link_id) { - u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); + u32 rate = READ_ONCE(sta_get_last_rx_stats(sta, link_id)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; @@ -2532,20 +2585,28 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, - int tid) + int tid, int link_id) { struct ieee80211_local *local = sta->local; + struct link_sta_info *link_sta_info; int cpu; + if (link_id < 0) + link_sta_info = &sta->deflink; + else + link_sta_info = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { - tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats, - tid); + tidstats->rx_msdu += + sta_get_tidstats_msdu(&link_sta_info->rx_stats, + tid); - if (sta->deflink.pcpu_rx_stats) { + if (link_sta_info->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; - cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, + cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); @@ -2557,22 +2618,24 @@ static void sta_set_tidstats(struct sta_info *sta, if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); - tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid]; + tidstats->tx_msdu = link_sta_info->tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); - tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid]; + tidstats->tx_msdu_retries = + link_sta_info->status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); - tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; + tidstats->tx_msdu_failed = + link_sta_info->status_stats.msdu_failed[tid]; } - if (tid < IEEE80211_NUM_TIDS) { + if (link_id < 0 && tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); @@ -2631,16 +2694,278 @@ static void sta_set_mesh_sinfo(struct sta_info *sta, } #endif +void sta_set_accumulated_removed_links_sinfo(struct sta_info *sta, + struct station_info *sinfo) +{ + /* Accumulating the removed link statistics. 
*/ + sinfo->tx_packets = sta->rem_link_stats.tx_packets; + sinfo->rx_packets = sta->rem_link_stats.rx_packets; + sinfo->tx_bytes = sta->rem_link_stats.tx_bytes; + sinfo->rx_bytes = sta->rem_link_stats.rx_bytes; + sinfo->tx_retries = sta->rem_link_stats.tx_retries; + sinfo->tx_failed = sta->rem_link_stats.tx_failed; + sinfo->rx_dropped_misc = sta->rem_link_stats.rx_dropped_misc; + sinfo->beacon_loss_count = sta->rem_link_stats.beacon_loss_count; + sinfo->expected_throughput = sta->rem_link_stats.expected_throughput; + + if (sinfo->pertid) { + sinfo->pertid->rx_msdu = + sta->rem_link_stats.pertid_stats.rx_msdu; + sinfo->pertid->tx_msdu = + sta->rem_link_stats.pertid_stats.tx_msdu; + sinfo->pertid->tx_msdu_retries = + sta->rem_link_stats.pertid_stats.tx_msdu_retries; + sinfo->pertid->tx_msdu_failed = + sta->rem_link_stats.pertid_stats.tx_msdu_failed; + } +} + +static void sta_set_link_sinfo(struct sta_info *sta, + struct link_station_info *link_sinfo, + struct ieee80211_link_data *link, + bool tidstats) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_sta_rx_stats *last_rxstats; + int i, ac, cpu, link_id = link->link_id; + struct link_sta_info *link_sta_info; + u32 thr = 0; + + last_rxstats = sta_get_last_rx_stats(sta, link_id); + + link_sta_info = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + + /* do before driver, so beacon filtering drivers have a + * chance to e.g. just add the number of filtered beacons + * (or just modify the value entirely, of course) + */ + if (sdata->vif.type == NL80211_IFTYPE_STATION) + link_sinfo->rx_beacon = link->u.mgd.count_beacon_signal; + + ether_addr_copy(link_sinfo->addr, link_sta_info->addr); + + drv_link_sta_statistics(sta->local, sdata, + link_sta_info->pub, + link_sinfo); + + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | + BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | + BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + link_sinfo->beacon_loss_count = + link->u.mgd.beacon_loss_count; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); + } + + link_sinfo->inactive_time = + jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, link_id)); + + if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { + link_sinfo->tx_bytes = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + link_sinfo->tx_bytes += + link_sta_info->tx_stats.bytes[ac]; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { + link_sinfo->tx_packets = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + link_sinfo->tx_packets += + link_sta_info->tx_stats.packets[ac]; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); + } + + if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { + link_sinfo->rx_bytes += + sta_get_stats_bytes(&link_sta_info->rx_stats); + + if (link_sta_info->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; + + cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, + cpu); + link_sinfo->rx_bytes += + sta_get_stats_bytes(cpurxs); + } + } + + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { + link_sinfo->rx_packets = link_sta_info->rx_stats.packets; + if (link_sta_info->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; 
+ + cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, + cpu); + link_sinfo->rx_packets += cpurxs->packets; + } + } + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { + link_sinfo->tx_retries = + link_sta_info->status_stats.retry_count; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { + link_sinfo->tx_failed = + link_sta_info->status_stats.retry_failed; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + link_sinfo->rx_duration += sta->airtime[ac].rx_airtime; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + link_sinfo->tx_duration += sta->airtime[ac].tx_airtime; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { + link_sinfo->airtime_weight = sta->airtime_weight; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); + } + + link_sinfo->rx_dropped_misc = link_sta_info->rx_stats.dropped; + if (link_sta_info->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; + + cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, + cpu); + link_sinfo->rx_dropped_misc += cpurxs->dropped; + } + } + + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | + BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); + link_sinfo->rx_beacon_signal_avg = + ieee80211_ave_rssi(&sdata->vif, -1); + } + + if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || + ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { + link_sinfo->signal = (s8)last_rxstats->last_signal; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); + } + + if (!link_sta_info->pcpu_rx_stats && + !(link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { + link_sinfo->signal_avg = + -ewma_signal_read(&link_sta_info->rx_stats_avg.signal); + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); + } + } + + /* for the average - if pcpu_rx_stats isn't set - rxstats must point to + * the sta->rx_stats struct, so the check here is fine with and without + * pcpu statistics + */ + if (last_rxstats->chains && + !(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | + BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); + if (!link_sta_info->pcpu_rx_stats) + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); + + link_sinfo->chains = last_rxstats->chains; + + for (i = 0; i < ARRAY_SIZE(link_sinfo->chain_signal); i++) { + link_sinfo->chain_signal[i] = + last_rxstats->chain_signal_last[i]; + link_sinfo->chain_signal_avg[i] = + -ewma_signal_read( + &link_sta_info->rx_stats_avg.chain_signal[i]); + } + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && + ieee80211_rate_valid(&link_sta_info->tx_stats.last_rate)) { + sta_set_rate_info_tx(sta, &link_sta_info->tx_stats.last_rate, + &link_sinfo->txrate); + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); + } + + if (!(link_sinfo->filled & 
BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) { + if (sta_set_rate_info_rx(sta, &link_sinfo->rxrate, + link_id) == 0) + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_RX_BITRATE); + } + + if (tidstats && !cfg80211_link_sinfo_alloc_tid_stats(link_sinfo, + GFP_KERNEL)) { + for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) + sta_set_tidstats(sta, &link_sinfo->pertid[i], i, + link_id); + } + + link_sinfo->bss_param.flags = 0; + if (sdata->vif.bss_conf.use_cts_prot) + link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; + if (sdata->vif.bss_conf.use_short_preamble) + link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; + if (sdata->vif.bss_conf.use_short_slot) + link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; + link_sinfo->bss_param.dtim_period = link->conf->dtim_period; + link_sinfo->bss_param.beacon_interval = link->conf->beacon_int; + + thr = sta_get_expected_throughput(sta); + + if (thr != 0) { + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); + link_sinfo->expected_throughput = thr; + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && + link_sta_info->status_stats.ack_signal_filled) { + link_sinfo->ack_signal = + link_sta_info->status_stats.last_ack_signal; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && + link_sta_info->status_stats.ack_signal_filled) { + link_sinfo->avg_ack_signal = + -(s8)ewma_avg_signal_read( + &link_sta_info->status_stats.avg_ack_signal); + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); + } +} + void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; - int i, ac, cpu; + int i, ac, cpu, link_id; struct ieee80211_sta_rx_stats *last_rxstats; - last_rxstats = sta_get_last_rx_stats(sta); + last_rxstats = sta_get_last_rx_stats(sta, -1); sinfo->generation = sdata->local->sta_generation; @@ -2668,7 +2993,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = - jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); + jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, -1)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { @@ -2757,7 +3082,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); - sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); + sinfo->rx_beacon_signal_avg = + ieee80211_ave_rssi(&sdata->vif, -1); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || @@ -2806,13 +3132,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && !sta->sta.valid_links) { - if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) + if (sta_set_rate_info_rx(sta, &sinfo->rxrate, -1) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) - sta_set_tidstats(sta, &sinfo->pertid[i], i); + sta_set_tidstats(sta, &sinfo->pertid[i], i, -1); } #ifdef CONFIG_MAC80211_MESH @@ -2874,6 +3200,26 @@ 
void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } + + if (sta->sta.valid_links) { + struct ieee80211_link_data *link; + struct link_sta_info *link_sta; + + ether_addr_copy(sinfo->mld_addr, sta->addr); + for_each_valid_link(sinfo, link_id) { + link_sta = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + link = wiphy_dereference(sdata->local->hw.wiphy, + sdata->link[link_id]); + + if (!link_sta || !sinfo->links[link_id] || !link) + continue; + + sinfo->valid_links = sta->sta.valid_links; + sta_set_link_sinfo(sta, sinfo->links[link_id], + link, tidstats); + } + } } u32 sta_get_expected_throughput(struct sta_info *sta) @@ -2895,35 +3241,24 @@ u32 sta_get_expected_throughput(struct sta_info *sta) return thr; } -unsigned long ieee80211_sta_last_active(struct sta_info *sta) +unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id) { - struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); + struct ieee80211_sta_rx_stats *stats; + struct link_sta_info *link_sta_info; - if (!sta->deflink.status_stats.last_ack || - time_after(stats->last_rx, sta->deflink.status_stats.last_ack)) - return stats->last_rx; - return sta->deflink.status_stats.last_ack; -} + stats = sta_get_last_rx_stats(sta, link_id); -static void sta_update_codel_params(struct sta_info *sta, u32 thr) -{ - if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { - sta->cparams.target = MS2TIME(50); - sta->cparams.interval = MS2TIME(300); - sta->cparams.ecn = false; - } else { - sta->cparams.target = MS2TIME(20); - sta->cparams.interval = MS2TIME(100); - sta->cparams.ecn = true; - } -} + if (link_id < 0) + link_sta_info = &sta->deflink; + else + link_sta_info = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr) -{ - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + if (!link_sta_info->status_stats.last_ack || + time_after(stats->last_rx, link_sta_info->status_stats.last_ack)) + return stats->last_rx; - sta_update_codel_params(sta, thr); + return link_sta_info->status_stats.last_ack; } int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 07b7ec39a52f9..5288d52866510 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -466,14 +466,6 @@ struct ieee80211_fragment_cache { unsigned int next; }; -/* - * The bandwidth threshold below which the per-station CoDel parameters will be - * scaled to be more lenient (to prevent starvation of slow stations). This - * value will be scaled by the number of active stations when it is being - * applied. - */ -#define STA_SLOW_THRESHOLD 6000 /* 6 Mbps */ - /** * struct link_sta_info - Link STA information * All link specific sta info are stored here for reference. 
This can be @@ -577,6 +569,58 @@ struct link_sta_info { }; /** + * struct ieee80211_sta_removed_link_stats - Removed link sta data + * + * keep required accumulated removed link data for stats + * + * @rx_packets: accumulated packets (MSDUs & MMPDUs) received from + * this station for removed links + * @tx_packets: accumulated packets (MSDUs & MMPDUs) transmitted to + * this station for removed links + * @rx_bytes: accumulated bytes (size of MPDUs) received from this + * station for removed links + * @tx_bytes: accumulated bytes (size of MPDUs) transmitted to this + * station for removed links + * @tx_retries: cumulative retry counts (MPDUs) for removed links + * @tx_failed: accumulated number of failed transmissions (MPDUs) + * (retries exceeded, no ACK) for removed links + * @rx_dropped_misc: accumulated dropped packets for un-specified reason + * from this station for removed links + * @beacon_loss_count: Number of times beacon loss event has triggered + * from this station for removed links. + * @expected_throughput: expected throughput in kbps (including 802.11 + * headers) towards this station for removed links + * @pertid_stats: accumulated per-TID statistics for removed link of + * station + * @pertid_stats.rx_msdu : accumulated number of received MSDUs towards + * this station for removed links. + * @pertid_stats.tx_msdu: accumulated number of (attempted) transmitted + * MSDUs towards this station for removed links + * @pertid_stats.tx_msdu_retries: accumulated number of retries (not + * counting the first) for transmitted MSDUs towards this station + * for removed links + * @pertid_stats.tx_msdu_failed: accumulated number of failed transmitted + * MSDUs towards this station for removed links + */ +struct ieee80211_sta_removed_link_stats { + u32 rx_packets; + u32 tx_packets; + u64 rx_bytes; + u64 tx_bytes; + u32 tx_retries; + u32 tx_failed; + u32 rx_dropped_misc; + u32 beacon_loss_count; + u32 expected_throughput; + struct { + u64 rx_msdu; + u64 tx_msdu; + u64 tx_msdu_retries; + u64 tx_msdu_failed; + } pertid_stats; +}; + +/** * struct sta_info - STA information * * This structure collects information about a station that @@ -626,7 +670,6 @@ struct link_sta_info { * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) * @rcu_head: RCU head used for freeing this station struct - * @cparams: CoDel parameters for this station. * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED) * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer: * @@ -653,6 +696,7 @@ struct link_sta_info { * @deflink address and remaining would be allocated and the address * would be assigned to link[link_id] where link_id is the id assigned * by the AP. + * @rem_link_stats: accumulated removed link stats */ struct sta_info { /* General information, mostly static */ @@ -717,8 +761,6 @@ struct sta_info { struct dentry *debugfs_dir; #endif - struct codel_params cparams; - u8 reserved_tid; s8 amsdu_mesh_control; @@ -729,6 +771,7 @@ struct sta_info { struct ieee80211_sta_aggregates cur; struct link_sta_info deflink; struct link_sta_info __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; + struct ieee80211_sta_removed_link_stats rem_link_stats; /* keep last! 
*/ struct ieee80211_sta sta; @@ -933,6 +976,9 @@ void sta_set_rate_info_tx(struct sta_info *sta, void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats); +void sta_set_accumulated_removed_links_sinfo(struct sta_info *sta, + struct station_info *sinfo); + u32 sta_get_expected_throughput(struct sta_info *sta); void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, @@ -947,7 +993,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta); void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta); void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta); -unsigned long ieee80211_sta_last_active(struct sta_info *sta); +unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id); void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 2f92e7c7f203b..ba5fbacbeeda6 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -382,8 +382,8 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link, if (WARN_ON_ONCE(!sband)) return; - ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_SUPP_RATES); - ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_EXT_SUPP_RATES); + ieee80211_put_srates_elem(skb, sband, 0, 0, WLAN_EID_SUPP_RATES); + ieee80211_put_srates_elem(skb, sband, 0, 0, WLAN_EID_EXT_SUPP_RATES); ieee80211_tdls_add_supp_channels(sdata, skb); /* add any custom IEs that go before Extended Capabilities */ @@ -1422,7 +1422,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, if (!(wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS)) return -EOPNOTSUPP; - if (sdata->vif.type != NL80211_IFTYPE_STATION) + if (sdata->vif.type != NL80211_IFTYPE_STATION || !sdata->vif.cfg.assoc) return -EINVAL; switch (oper) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 72fad8ea8bb9a..0bfbce1574862 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -384,12 +384,14 @@ DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface, TRACE_EVENT(drv_config, TP_PROTO(struct ieee80211_local *local, + int radio_idx, u32 changed), - TP_ARGS(local, changed), + TP_ARGS(local, radio_idx, changed), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(u32, changed) __field(u32, flags) __field(int, power_level) @@ -403,6 +405,7 @@ TRACE_EVENT(drv_config, TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->changed = changed; __entry->flags = local->hw.conf.flags; __entry->power_level = local->hw.conf.power_level; @@ -417,8 +420,8 @@ TRACE_EVENT(drv_config, ), TP_printk( - LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT, - LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG + LOCAL_PR_FMT " radio_idx:%d ch:%#x" CHANDEF_PR_FMT, + LOCAL_PR_ARG, __entry->radio_idx, __entry->changed, CHANDEF_PR_ARG ) ); @@ -818,34 +821,71 @@ TRACE_EVENT(drv_get_key_seq, ) ); -DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold, - TP_PROTO(struct ieee80211_local *local, u32 value), - TP_ARGS(local, value) +TRACE_EVENT(drv_set_frag_threshold, + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 value), + + TP_ARGS(local, radio_idx, value), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, radio_idx) + __field(u32, value) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->value = value; + ), + + TP_printk( + LOCAL_PR_FMT " radio_id:%d value:%u", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value + ) ); -DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold, - TP_PROTO(struct ieee80211_local 
*local, u32 value), - TP_ARGS(local, value) +TRACE_EVENT(drv_set_rts_threshold, + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 value), + + TP_ARGS(local, radio_idx, value), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, radio_idx) + __field(u32, value) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->value = value; + ), + + TP_printk( + LOCAL_PR_FMT " radio_id:%d value:%u", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value + ) ); TRACE_EVENT(drv_set_coverage_class, - TP_PROTO(struct ieee80211_local *local, s16 value), + TP_PROTO(struct ieee80211_local *local, int radio_idx, s16 value), - TP_ARGS(local, value), + TP_ARGS(local, radio_idx, value), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(s16, value) ), TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->value = value; ), TP_printk( - LOCAL_PR_FMT " value:%d", - LOCAL_PR_ARG, __entry->value + LOCAL_PR_FMT " radio_id:%d value:%d", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value ) ); @@ -1002,6 +1042,33 @@ DEFINE_EVENT(sta_event, drv_sta_statistics, TP_ARGS(local, sdata, sta) ); +TRACE_EVENT(drv_link_sta_statistics, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta), + + TP_ARGS(local, sdata, link_sta), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(u32, link_id) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_NAMED_ASSIGN(link_sta->sta); + __entry->link_id = link_sta->link_id; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " (link %d)", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id + ) +); + DEFINE_EVENT(sta_event, drv_sta_add, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, @@ -1291,12 +1358,14 @@ TRACE_EVENT(drv_set_antenna, ); TRACE_EVENT(drv_get_antenna, - TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret), + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 tx_ant, + u32 rx_ant, int ret), - TP_ARGS(local, tx_ant, rx_ant, ret), + TP_ARGS(local, radio_idx, tx_ant, rx_ant, ret), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(u32, tx_ant) __field(u32, rx_ant) __field(int, ret) @@ -1304,14 +1373,16 @@ TRACE_EVENT(drv_get_antenna, TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->tx_ant = tx_ant; __entry->rx_ant = rx_ant; __entry->ret = ret; ), TP_printk( - LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d", - LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret + LOCAL_PR_FMT " radio_idx:%d tx_ant:%d rx_ant:%d ret:%d", + LOCAL_PR_ARG, __entry->radio_idx, __entry->tx_ant, + __entry->rx_ant, __entry->ret ) ); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 20179db88c4a6..00671ae45b2f7 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * Transmit and frame generation functions. 
*/ @@ -26,6 +26,7 @@ #include <net/codel_impl.h> #include <linux/unaligned.h> #include <net/fq_impl.h> +#include <net/sock.h> #include <net/gso.h> #include "ieee80211_i.h" @@ -49,19 +50,11 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct ieee80211_supported_band *sband; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_chanctx_conf *chanctx_conf; - u32 rate_flags = 0; /* assume HW handles this */ if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) return 0; - rcu_read_lock(); - chanctx_conf = rcu_dereference(tx->sdata->vif.bss_conf.chanctx_conf); - if (chanctx_conf) - rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def); - rcu_read_unlock(); - /* uh huh? */ if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; @@ -138,9 +131,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (r->bitrate > txrate->bitrate) break; - if ((rate_flags & r->flags) != rate_flags) - continue; - if (tx->sdata->vif.bss_conf.basic_rates & BIT(i)) rate = r->bitrate; @@ -622,6 +612,12 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) else tx->key = NULL; + if (info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) { + if (tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) + info->control.hw_key = &tx->key->conf; + return TX_CONTINUE; + } + if (tx->key) { bool skip_hw = false; @@ -1183,7 +1179,8 @@ void ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, return; if (!sta || - (!sta->sta.valid_links && !sta->sta.deflink.ht_cap.ht_supported) || + (!sta->sta.valid_links && !sta->sta.deflink.ht_cap.ht_supported && + !sta->sta.deflink.s1g_cap.s1g) || !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || skb->protocol == sdata->control_port_protocol) return; @@ -1402,16 +1399,9 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, local = container_of(fq, struct ieee80211_local, fq); txqi = container_of(tin, struct txq_info, tin); + cparams = &local->cparams; cstats = &txqi->cstats; - if (txqi->txq.sta) { - struct sta_info *sta = container_of(txqi->txq.sta, - struct sta_info, sta); - cparams = &sta->cparams; - } else { - cparams = &local->cparams; - } - if (flow == &tin->default_flow) cvars = &txqi->def_cvars; else @@ -1445,7 +1435,7 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, { struct fq *fq = &local->fq; struct fq_tin *tin = &txqi->tin; - u32 flow_idx = fq_flow_idx(fq, skb); + u32 flow_idx; ieee80211_set_skb_enqueue_time(skb); @@ -1461,6 +1451,7 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, IEEE80211_TX_INTCFL_NEED_TXPROCESSING; __skb_queue_tail(&txqi->frags, skb); } else { + flow_idx = fq_flow_idx(fq, skb); fq_tin_enqueue(fq, tin, flow_idx, skb, fq_skb_free_func); } @@ -1558,7 +1549,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local, spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]); } -void ieee80211_txq_set_params(struct ieee80211_local *local) +void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx) { if (local->hw.wiphy->txq_limit) local->fq.limit = local->hw.wiphy->txq_limit; @@ -1622,7 +1613,7 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) for (i = 0; i < fq->flows_cnt; i++) codel_vars_init(&local->cvars[i]); - ieee80211_txq_set_params(local); + ieee80211_txq_set_params(local, -1); return 0; } @@ -2876,8 +2867,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (unlikely(!multicast && - ((skb->sk && - skb_shinfo(skb)->tx_flags & 
SKBTX_WIFI_STATUS) || + (sk_requests_wifi_status(skb->sk) || ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) info_id = ieee80211_store_ack_skb(local, skb, &info_flags, cookie); @@ -3774,7 +3764,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, return false; /* don't handle TX status request here either */ - if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + if (sk_requests_wifi_status(skb->sk)) return false; if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { @@ -3894,6 +3884,7 @@ begin: * The key can be removed while the packet was queued, so need to call * this here to get the current key. */ + info->control.hw_key = NULL; r = ieee80211_tx_h_select_key(&tx); if (r != TX_CONTINUE) { ieee80211_free_txskb(&local->hw, skb); @@ -4116,7 +4107,9 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, spin_lock_bh(&local->active_txq_lock[txq->ac]); - has_queue = force || txq_has_queue(txq); + has_queue = force || + (!test_bit(IEEE80211_TXQ_STOP, &txqi->flags) && + txq_has_queue(txq)); if (list_empty(&txqi->schedule_order) && (has_queue || ieee80211_txq_keep_active(txqi))) { /* If airtime accounting is active, always enqueue STAs at the @@ -4526,8 +4519,10 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); } else if (ieee80211_vif_is_mld(&sdata->vif) && - sdata->vif.type == NL80211_IFTYPE_AP && - !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) { + ((sdata->vif.type == NL80211_IFTYPE_AP && + !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) || + (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + !sdata->wdev.use_4addr))) { ieee80211_mlo_multicast_tx(dev, skb); } else { normal: @@ -4664,8 +4659,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info)); } - if (unlikely(skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { + if (unlikely(sk_requests_wifi_status(skb->sk))) { info->status_data = ieee80211_store_ack_skb(local, skb, &info->flags, NULL); if (info->status_data) @@ -5033,12 +5027,25 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, } } -static u8 __ieee80211_beacon_update_cntdwn(struct beacon_data *beacon) +static u8 __ieee80211_beacon_update_cntdwn(struct ieee80211_link_data *link, + struct beacon_data *beacon) { - beacon->cntdwn_current_counter--; + if (beacon->cntdwn_current_counter == 1) { + /* + * Channel switch handling is done by a worker thread while + * beacons get pulled from hardware timers. It's therefore + * possible that software threads are slow enough to not be + * able to complete CSA handling in a single beacon interval, + * in which case we get here. There isn't much to do about + * it, other than letting the user know that the AP isn't + * behaving correctly. 
+ */ + link_err_once(link, + "beacon TX faster than countdown (channel/color switch) completion\n"); + return 0; + } - /* the counter should never reach 0 */ - WARN_ON_ONCE(!beacon->cntdwn_current_counter); + beacon->cntdwn_current_counter--; return beacon->cntdwn_current_counter; } @@ -5069,7 +5076,7 @@ u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif, unsigned int link_i if (!beacon) goto unlock; - count = __ieee80211_beacon_update_cntdwn(beacon); + count = __ieee80211_beacon_update_cntdwn(link, beacon); unlock: rcu_read_unlock(); @@ -5293,14 +5300,14 @@ ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon, } static struct sk_buff * -ieee80211_beacon_get_ap(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_link_data *link, - struct ieee80211_mutable_offsets *offs, - bool is_template, - struct beacon_data *beacon, - struct ieee80211_chanctx_conf *chanctx_conf, - u8 ema_index) +__ieee80211_beacon_get_ap(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_data *link, + struct ieee80211_mutable_offsets *offs, + bool is_template, + struct beacon_data *beacon, + struct ieee80211_chanctx_conf *chanctx_conf, + u8 ema_index) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); @@ -5361,6 +5368,71 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw, return skb; } +static bool ieee80211_s1g_need_long_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_data *link) +{ + struct ps_data *ps = &sdata->u.ap.ps; + + if (ps->sb_count == 0) + ps->sb_count = link->conf->s1g_long_beacon_period - 1; + else + ps->sb_count--; + + return ps->sb_count == 0; +} + +static struct sk_buff * +ieee80211_s1g_short_beacon_get(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_data *link, + struct ieee80211_chanctx_conf *chanctx_conf, + struct s1g_short_beacon_data *sb, + bool is_template) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_if_ap *ap = &sdata->u.ap; + struct sk_buff *skb; + + skb = dev_alloc_skb(local->tx_headroom + sb->short_head_len + + sb->short_tail_len + 256 + + local->hw.extra_beacon_tailroom); + if (!skb) + return NULL; + + skb_reserve(skb, local->tx_headroom); + skb_put_data(skb, sb->short_head, sb->short_head_len); + + ieee80211_beacon_add_tim(sdata, link, &ap->ps, skb, is_template); + + if (sb->short_tail) + skb_put_data(skb, sb->short_tail, sb->short_tail_len); + + ieee80211_beacon_get_finish(hw, vif, link, NULL, NULL, skb, + chanctx_conf, 0); + return skb; +} + +static struct sk_buff * +ieee80211_beacon_get_ap(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_link_data *link, + struct ieee80211_mutable_offsets *offs, + bool is_template, struct beacon_data *beacon, + struct ieee80211_chanctx_conf *chanctx_conf, + u8 ema_index, struct s1g_short_beacon_data *s1g_sb) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (!sdata->vif.cfg.s1g || !s1g_sb || + ieee80211_s1g_need_long_beacon(sdata, link)) + return __ieee80211_beacon_get_ap(hw, vif, link, offs, + is_template, beacon, + chanctx_conf, ema_index); + + return ieee80211_s1g_short_beacon_get(hw, vif, link, chanctx_conf, + s1g_sb, is_template); +} + static struct ieee80211_ema_beacons * ieee80211_beacon_get_ap_ema_list(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -5384,7 +5456,7 @@ ieee80211_beacon_get_ap_ema_list(struct ieee80211_hw *hw, 
ieee80211_beacon_get_ap(hw, vif, link, &ema->bcn[ema->cnt].offs, is_template, beacon, - chanctx_conf, ema->cnt); + chanctx_conf, ema->cnt, NULL); if (!ema->bcn[ema->cnt].skb) break; } @@ -5413,6 +5485,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_sub_if_data *sdata = NULL; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_link_data *link; + struct s1g_short_beacon_data *s1g_short_bcn = NULL; rcu_read_lock(); @@ -5434,6 +5507,13 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (!beacon) goto out; + if (vif->cfg.s1g && link->u.ap.s1g_short_beacon) { + s1g_short_bcn = + rcu_dereference(link->u.ap.s1g_short_beacon); + if (!s1g_short_bcn) + goto out; + } + if (ema_beacons) { *ema_beacons = ieee80211_beacon_get_ap_ema_list(hw, vif, link, @@ -5454,8 +5534,8 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, skb = ieee80211_beacon_get_ap(hw, vif, link, offs, is_template, beacon, - chanctx_conf, - ema_index); + chanctx_conf, ema_index, + s1g_short_bcn); } } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; @@ -5467,7 +5547,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (beacon->cntdwn_counter_offsets[0]) { if (!is_template) - __ieee80211_beacon_update_cntdwn(beacon); + __ieee80211_beacon_update_cntdwn(link, beacon); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } @@ -5499,7 +5579,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, * for now we leave it consistent with overall * mac80211's behavior. */ - __ieee80211_beacon_update_cntdwn(beacon); + __ieee80211_beacon_update_cntdwn(link, beacon); ieee80211_set_beacon_cntdwn(sdata, beacon, link); } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index dec6e16b8c7d2..32f1bc5908c57 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1204,7 +1204,6 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, struct ieee80211_supported_band *sband; int i, err; size_t noffset; - u32 rate_flags; bool have_80mhz = false; *offset = 0; @@ -1213,13 +1212,11 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, if (WARN_ON_ONCE(!sband)) return 0; - rate_flags = ieee80211_chandef_rate_flags(chandef); - /* For direct scan add S1G IE and consider its override bits */ if (band == NL80211_BAND_S1GHZ) return ieee80211_put_s1g_cap(skb, &sband->s1g_cap); - err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, + err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_SUPP_RATES); if (err) return err; @@ -1241,7 +1238,7 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, *offset = noffset; } - err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, + err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_EXT_SUPP_RATES); if (err) return err; @@ -1522,16 +1519,13 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, { struct ieee80211_supported_band *sband; size_t num_rates; - u32 supp_rates, rate_flags; + u32 supp_rates; int i, j; sband = sdata->local->hw.wiphy->bands[band]; if (WARN_ON(!sband)) return 1; - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); - num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + @@ -1551,12 +1545,7 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, continue; for (j = 0; j < num_rates; j++) { - int brate; - if ((rate_flags & sband->bitrates[j].flags) - != rate_flags) - continue; - - brate = sband->bitrates[j].bitrate; + int brate = sband->bitrates[j].bitrate; if (brate == 
own_rate) { supp_rates |= BIT(j); @@ -1767,6 +1756,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; + u32 rts_threshold; lockdep_assert_wiphy(local->hw.wiphy); @@ -1837,13 +1827,20 @@ int ieee80211_reconfig(struct ieee80211_local *local) } /* setup fragmentation threshold */ - drv_set_frag_threshold(local, hw->wiphy->frag_threshold); + drv_set_frag_threshold(local, -1, hw->wiphy->frag_threshold); /* setup RTS threshold */ - drv_set_rts_threshold(local, hw->wiphy->rts_threshold); + if (hw->wiphy->n_radio > 0) { + for (i = 0; i < hw->wiphy->n_radio; i++) { + rts_threshold = hw->wiphy->radio_cfg[i].rts_threshold; + drv_set_rts_threshold(local, i, rts_threshold); + } + } else { + drv_set_rts_threshold(local, -1, hw->wiphy->rts_threshold); + } /* reset coverage class */ - drv_set_coverage_class(local, hw->wiphy->coverage_class); + drv_set_coverage_class(local, -1, hw->wiphy->coverage_class); ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, @@ -1901,11 +1898,11 @@ int ieee80211_reconfig(struct ieee80211_local *local) ieee80211_assign_chanctx(local, sdata, &sdata->deflink); /* reconfigure hardware */ - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | - IEEE80211_CONF_CHANGE_MONITOR | - IEEE80211_CONF_CHANGE_PS | - IEEE80211_CONF_CHANGE_RETRY_LIMITS | - IEEE80211_CONF_CHANGE_IDLE); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | + IEEE80211_CONF_CHANGE_MONITOR | + IEEE80211_CONF_CHANGE_PS | + IEEE80211_CONF_CHANGE_RETRY_LIMITS | + IEEE80211_CONF_CHANGE_IDLE); ieee80211_configure_filter(local); @@ -2155,11 +2152,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); wake_up: - - if (local->virt_monitors > 0 && - local->virt_monitors == local->open_count) - ieee80211_add_virtual_monitor(local); - /* * Clear the WLAN_STA_BLOCK_BA flag so new aggregation * sessions can be established after a resume. 
@@ -2213,6 +2205,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) } } + if (local->virt_monitors > 0 && + local->virt_monitors == local->open_count) + ieee80211_add_virtual_monitor(local); + if (!suspended) return 0; @@ -2559,6 +2555,23 @@ end: return 0; } +int ieee80211_put_reg_conn(struct sk_buff *skb, + enum ieee80211_channel_flags flags) +{ + u8 reg_conn = IEEE80211_REG_CONN_LPI_VALID | + IEEE80211_REG_CONN_LPI_VALUE | + IEEE80211_REG_CONN_SP_VALID; + + if (!(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT)) + reg_conn |= IEEE80211_REG_CONN_SP_VALUE; + + skb_put_u8(skb, WLAN_EID_EXTENSION); + skb_put_u8(skb, 1 + sizeof(reg_conn)); + skb_put_u8(skb, WLAN_EID_EXT_NON_AP_STA_REG_CON); + skb_put_u8(skb, reg_conn); + return 0; +} + int ieee80211_put_he_6ghz_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode) @@ -3223,15 +3236,13 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, - u32 basic_rates, u32 rate_flags, u32 masked_rates, + u32 basic_rates, u32 masked_rates, u8 element_id) { u8 i, rates, skip; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (masked_rates & BIT(i)) continue; rates++; @@ -3257,8 +3268,6 @@ int ieee80211_put_srates_elem(struct sk_buff *skb, int rate; u8 basic; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (masked_rates & BIT(i)) continue; @@ -3280,14 +3289,24 @@ int ieee80211_put_srates_elem(struct sk_buff *skb, return 0; } -int ieee80211_ave_rssi(struct ieee80211_vif *vif) +int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link_data; if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return 0; - return -ewma_beacon_signal_read(&sdata->deflink.u.mgd.ave_beacon_signal); + if (link_id < 0) + link_data = &sdata->deflink; + else + link_data = wiphy_dereference(sdata->local->hw.wiphy, + sdata->link[link_id]); + + if (WARN_ON_ONCE(!link_data)) + return -99; + + return -ewma_beacon_signal_read(&link_data->u.mgd.ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); @@ -3894,12 +3913,10 @@ int ieee80211_parse_p2p_noa(const struct ieee80211_p2p_noa_attr *attr, } EXPORT_SYMBOL(ieee80211_parse_p2p_noa); -void ieee80211_recalc_dtim(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata) +void ieee80211_recalc_dtim(struct ieee80211_sub_if_data *sdata, u64 tsf) { - u64 tsf = drv_get_tsf(local, sdata); u64 dtim_count = 0; - u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; + u32 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; u8 dtim_period = sdata->vif.bss_conf.dtim_period; struct ps_data *ps; u8 bcns_from_dtim; @@ -3935,6 +3952,33 @@ void ieee80211_recalc_dtim(struct ieee80211_local *local, ps->dtim_count = dtim_count; } +/* + * Given a long beacon period, calculate the current index into + * that period to determine the number of TSBTTs until the next TBTT. + * It is completely valid to have a short beacon period that differs + * from the DTIM period (i.e. a TBTT that's not a DTIM). 
+ */ +void ieee80211_recalc_sb_count(struct ieee80211_sub_if_data *sdata, u64 tsf) +{ + u32 sb_idx; + struct ps_data *ps = &sdata->bss->ps; + u8 lb_period = sdata->vif.bss_conf.s1g_long_beacon_period; + u32 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; + + /* No mesh / IBSS support for short beaconing */ + if (tsf == -1ULL || !lb_period || + (sdata->vif.type != NL80211_IFTYPE_AP && + sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) + return; + + /* find the current TSBTT index in our lb_period */ + do_div(tsf, beacon_int); + sb_idx = do_div(tsf, lb_period); + + /* num TSBTTs until the next TBTT */ + ps->sb_count = sb_idx ? lb_period - sb_idx : 0; +} + static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { @@ -3968,6 +4012,33 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, return radar_detect; } +bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, + struct cfg80211_scan_request *scan_req, + int radio_idx) +{ + struct ieee80211_channel *chan; + int i, chan_radio_idx; + + for (i = 0; i < scan_req->n_channels; i++) { + chan = scan_req->channels[i]; + chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); + /* + * The chan_radio_idx should be valid since it's taken from a + * valid scan request. + * However, if chan_radio_idx is unexpectedly invalid (negative), + * we take a conservative approach and assume the scan request + * might use the specified radio_idx. Hence, return true. + */ + if (WARN_ON(chan_radio_idx < 0)) + return true; + + if (chan_radio_idx == radio_idx) + return true; + } + + return false; +} + static u32 __ieee80211_get_radio_mask(struct ieee80211_sub_if_data *sdata) { diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index c5c5d16ed6c81..b099d79e8fbb2 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -672,8 +672,9 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; } } else { - pr_warn_ratelimited("Ignoring NSS change in VHT Operating Mode Notification from %pM with invalid nss %d", - link_sta->pub->addr, nss); + sdata_dbg(sdata, + "Ignore NSS change to invalid %d in VHT opmode notif from %pM", + nss, link_sta->pub->addr); } } diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 9b12ca97f4128..df4e8cf33899b 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -53,6 +53,7 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { struct sock *sk = sock->sk; struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct net *net = sock_net(&msk->sk); struct sockaddr_mctp *smctp; int rc; @@ -73,14 +74,48 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) lock_sock(sk); - /* TODO: allow rebind */ if (sk_hashed(sk)) { rc = -EADDRINUSE; goto out_release; } - msk->bind_net = smctp->smctp_network; - msk->bind_addr = smctp->smctp_addr.s_addr; - msk->bind_type = smctp->smctp_type & 0x7f; /* ignore the IC bit */ + + msk->bind_local_addr = smctp->smctp_addr.s_addr; + + /* MCTP_NET_ANY with a specific EID is resolved to the default net + * at bind() time. + * For bind_addr=MCTP_ADDR_ANY it is handled specially at route + * lookup time. 
+ */ + if (smctp->smctp_network == MCTP_NET_ANY && + msk->bind_local_addr != MCTP_ADDR_ANY) { + msk->bind_net = mctp_default_net(net); + } else { + msk->bind_net = smctp->smctp_network; + } + + /* ignore the IC bit */ + smctp->smctp_type &= 0x7f; + + if (msk->bind_peer_set) { + if (msk->bind_type != smctp->smctp_type) { + /* Prior connect() had a different type */ + rc = -EINVAL; + goto out_release; + } + + if (msk->bind_net == MCTP_NET_ANY) { + /* Restrict to the network passed to connect() */ + msk->bind_net = msk->bind_peer_net; + } + + if (msk->bind_net != msk->bind_peer_net) { + /* connect() had a different net to bind() */ + rc = -EINVAL; + goto out_release; + } + } else { + msk->bind_type = smctp->smctp_type; + } rc = sk->sk_prot->hash(sk); @@ -90,6 +125,67 @@ out_release: return rc; } +/* Used to set a specific peer prior to bind. Not used for outbound + * connections (Tag Owner set) since MCTP is a datagram protocol. + */ +static int mctp_connect(struct socket *sock, struct sockaddr *addr, + int addrlen, int flags) +{ + struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct net *net = sock_net(&msk->sk); + struct sockaddr_mctp *smctp; + int rc; + + if (addrlen != sizeof(*smctp)) + return -EINVAL; + + if (addr->sa_family != AF_MCTP) + return -EAFNOSUPPORT; + + /* It's a valid sockaddr for MCTP, cast and do protocol checks */ + smctp = (struct sockaddr_mctp *)addr; + + if (!mctp_sockaddr_is_ok(smctp)) + return -EINVAL; + + /* Can't bind by tag */ + if (smctp->smctp_tag) + return -EINVAL; + + /* IC bit must be unset */ + if (smctp->smctp_type & 0x80) + return -EINVAL; + + lock_sock(sk); + + if (sk_hashed(sk)) { + /* bind() already */ + rc = -EADDRINUSE; + goto out_release; + } + + if (msk->bind_peer_set) { + /* connect() already */ + rc = -EADDRINUSE; + goto out_release; + } + + msk->bind_peer_set = true; + msk->bind_peer_addr = smctp->smctp_addr.s_addr; + msk->bind_type = smctp->smctp_type; + if (smctp->smctp_network == MCTP_NET_ANY) + msk->bind_peer_net = mctp_default_net(net); + else + msk->bind_peer_net = smctp->smctp_network; + + rc = 0; + +out_release: + release_sock(sk); + return rc; +} + static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); @@ -97,8 +193,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct sock *sk = sock->sk; struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct mctp_skb_cb *cb; - struct mctp_route *rt; struct sk_buff *skb = NULL; + struct mctp_dst dst; int hlen; if (addr) { @@ -133,34 +229,30 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msk->addr_ext && addrlen >= sizeof(struct sockaddr_mctp_ext)) { DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, extaddr, msg->msg_name); - struct net_device *dev; - - rc = -EINVAL; - rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), extaddr->smctp_ifindex); - /* check for correct halen */ - if (dev && extaddr->smctp_halen == dev->addr_len) { - hlen = LL_RESERVED_SPACE(dev) + sizeof(struct mctp_hdr); - rc = 0; - } - rcu_read_unlock(); + + if (!mctp_sockaddr_ext_is_ok(extaddr)) + return -EINVAL; + + rc = mctp_dst_from_extaddr(&dst, sock_net(sk), + extaddr->smctp_ifindex, + extaddr->smctp_halen, + extaddr->smctp_haddr); if (rc) - goto err_free; - rt = NULL; + return rc; + } else { - rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, - addr->smctp_addr.s_addr); - if (!rt) { - rc = 
-EHOSTUNREACH; - goto err_free; - } - hlen = LL_RESERVED_SPACE(rt->dev->dev) + sizeof(struct mctp_hdr); + rc = mctp_route_lookup(sock_net(sk), addr->smctp_network, + addr->smctp_addr.s_addr, &dst); + if (rc) + return rc; } + hlen = LL_RESERVED_SPACE(dst.dev->dev) + sizeof(struct mctp_hdr); + skb = sock_alloc_send_skb(sk, hlen + 1 + len, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) - return rc; + goto err_release_dst; skb_reserve(skb, hlen); @@ -175,30 +267,16 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) cb = __mctp_cb(skb); cb->net = addr->smctp_network; - if (!rt) { - /* fill extended address in cb */ - DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, - extaddr, msg->msg_name); - - if (!mctp_sockaddr_ext_is_ok(extaddr) || - extaddr->smctp_halen > sizeof(cb->haddr)) { - rc = -EINVAL; - goto err_free; - } - - cb->ifindex = extaddr->smctp_ifindex; - /* smctp_halen is checked above */ - cb->halen = extaddr->smctp_halen; - memcpy(cb->haddr, extaddr->smctp_haddr, cb->halen); - } - - rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr, + rc = mctp_local_output(sk, &dst, skb, addr->smctp_addr.s_addr, addr->smctp_tag); + mctp_dst_release(&dst); return rc ? : len; err_free: kfree_skb(skb); +err_release_dst: + mctp_dst_release(&dst); return rc; } @@ -551,7 +629,7 @@ static const struct proto_ops mctp_dgram_ops = { .family = PF_MCTP, .release = mctp_release, .bind = mctp_bind, - .connect = sock_no_connect, + .connect = mctp_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, @@ -618,6 +696,7 @@ static int mctp_sk_init(struct sock *sk) INIT_HLIST_HEAD(&msk->keys); timer_setup(&msk->key_expiry, mctp_sk_expire_keys, 0); + msk->bind_peer_set = false; return 0; } @@ -629,15 +708,48 @@ static void mctp_sk_close(struct sock *sk, long timeout) static int mctp_sk_hash(struct sock *sk) { struct net *net = sock_net(sk); + struct sock *existing; + struct mctp_sock *msk; + mctp_eid_t remote; + u32 hash; + int rc; + + msk = container_of(sk, struct mctp_sock, sk); + + if (msk->bind_peer_set) + remote = msk->bind_peer_addr; + else + remote = MCTP_ADDR_ANY; + hash = mctp_bind_hash(msk->bind_type, msk->bind_local_addr, remote); + + mutex_lock(&net->mctp.bind_lock); + + /* Prevent duplicate binds. */ + sk_for_each(existing, &net->mctp.binds[hash]) { + struct mctp_sock *mex = + container_of(existing, struct mctp_sock, sk); + + bool same_peer = (mex->bind_peer_set && msk->bind_peer_set && + mex->bind_peer_addr == msk->bind_peer_addr) || + (!mex->bind_peer_set && !msk->bind_peer_set); + + if (mex->bind_type == msk->bind_type && + mex->bind_local_addr == msk->bind_local_addr && same_peer && + mex->bind_net == msk->bind_net) { + rc = -EADDRINUSE; + goto out; + } + } /* Bind lookup runs under RCU, remain live during that. 
*/ sock_set_flag(sk, SOCK_RCU_FREE); - mutex_lock(&net->mctp.bind_lock); - sk_add_node_rcu(sk, &net->mctp.binds); - mutex_unlock(&net->mctp.bind_lock); + sk_add_node_rcu(sk, &net->mctp.binds[hash]); + rc = 0; - return 0; +out: + mutex_unlock(&net->mctp.bind_lock); + return rc; } static void mctp_sk_unhash(struct sock *sk) @@ -793,3 +905,7 @@ MODULE_DESCRIPTION("MCTP core"); MODULE_AUTHOR("Jeremy Kerr <jk@codeconstruct.com.au>"); MODULE_ALIAS_NETPROTO(PF_MCTP); + +#if IS_ENABLED(CONFIG_MCTP_TEST) +#include "test/sock-test.c" +#endif diff --git a/net/mctp/device.c b/net/mctp/device.c index 7c0dcf3df3196..4d404edd7446e 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -120,8 +120,8 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) int ifindex = 0, rc; /* Filter by ifindex if a header is provided */ - if (cb->nlh->nlmsg_len >= nlmsg_msg_size(sizeof(*hdr))) { - hdr = nlmsg_data(cb->nlh); + hdr = nlmsg_payload(cb->nlh, sizeof(*hdr)); + if (hdr) { ifindex = hdr->ifa_index; } else { if (cb->strict_check) { diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index 590f642413e4e..05b899f22d902 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -250,7 +250,10 @@ static int mctp_rtm_getneigh(struct sk_buff *skb, struct netlink_callback *cb) int idx; } *cbctx = (void *)cb->ctx; - ndmsg = nlmsg_data(cb->nlh); + ndmsg = nlmsg_payload(cb->nlh, sizeof(*ndmsg)); + if (!ndmsg) + return -EINVAL; + req_ifindex = ndmsg->ndm_ifindex; idx = 0; diff --git a/net/mctp/route.c b/net/mctp/route.c index d9c8e5a5f9ce9..2b2b958ef6a37 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -17,6 +17,8 @@ #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <kunit/static_stub.h> + #include <uapi/linux/if_arp.h> #include <net/mctp.h> @@ -32,39 +34,42 @@ static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ; static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev); /* route output callbacks */ -static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) +static int mctp_dst_discard(struct mctp_dst *dst, struct sk_buff *skb) { kfree_skb(skb); return 0; } -static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) +static struct mctp_sock *mctp_lookup_bind_details(struct net *net, + struct sk_buff *skb, + u8 type, u8 dest, + u8 src, bool allow_net_any) { struct mctp_skb_cb *cb = mctp_cb(skb); - struct mctp_hdr *mh; struct sock *sk; - u8 type; + u8 hash; - WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); - /* TODO: look up in skb->cb? 
*/ - mh = mctp_hdr(skb); - - if (!skb_headlen(skb)) - return NULL; + hash = mctp_bind_hash(type, dest, src); - type = (*(u8 *)skb->data) & 0x7f; - - sk_for_each_rcu(sk, &net->mctp.binds) { + sk_for_each_rcu(sk, &net->mctp.binds[hash]) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + if (!allow_net_any && msk->bind_net == MCTP_NET_ANY) + continue; + if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net) continue; if (msk->bind_type != type) continue; - if (!mctp_address_matches(msk->bind_addr, mh->dest)) + if (msk->bind_peer_set && + !mctp_address_matches(msk->bind_peer_addr, src)) + continue; + + if (!mctp_address_matches(msk->bind_local_addr, dest)) continue; return msk; @@ -73,6 +78,54 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) return NULL; } +static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) +{ + struct mctp_sock *msk; + struct mctp_hdr *mh; + u8 type; + + /* TODO: look up in skb->cb? */ + mh = mctp_hdr(skb); + + if (!skb_headlen(skb)) + return NULL; + + type = (*(u8 *)skb->data) & 0x7f; + + /* Look for binds in order of widening scope. A given destination or + * source address also implies matching on a particular network. + * + * - Matching destination and source + * - Matching destination + * - Matching source + * - Matching network, any address + * - Any network or address + */ + + msk = mctp_lookup_bind_details(net, skb, type, mh->dest, mh->src, + false); + if (msk) + return msk; + msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY, mh->src, + false); + if (msk) + return msk; + msk = mctp_lookup_bind_details(net, skb, type, mh->dest, MCTP_ADDR_ANY, + false); + if (msk) + return msk; + msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY, + MCTP_ADDR_ANY, false); + if (msk) + return msk; + msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY, + MCTP_ADDR_ANY, true); + if (msk) + return msk; + + return NULL; +} + /* A note on the key allocations. 
* * struct net->mctp.keys contains our set of currently-allocated keys for @@ -368,7 +421,7 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) return 0; } -static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) +static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) { struct mctp_sk_key *key, *any_key = NULL; struct net *net = dev_net(skb->dev); @@ -392,6 +445,9 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) */ skb_orphan(skb); + if (skb->pkt_type == PACKET_OUTGOING) + skb->pkt_type = PACKET_LOOPBACK; + /* ensure we have enough data for a header and a type */ if (skb->len < sizeof(struct mctp_hdr) + 1) goto out; @@ -556,39 +612,31 @@ out: return rc; } -static unsigned int mctp_route_mtu(struct mctp_route *rt) -{ - return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu); -} - -static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) +static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { - struct mctp_skb_cb *cb = mctp_cb(skb); - struct mctp_hdr *hdr = mctp_hdr(skb); char daddr_buf[MAX_ADDR_LEN]; char *daddr = NULL; - unsigned int mtu; int rc; skb->protocol = htons(ETH_P_MCTP); + skb->pkt_type = PACKET_OUTGOING; - mtu = READ_ONCE(skb->dev->mtu); - if (skb->len > mtu) { + if (skb->len > dst->mtu) { kfree_skb(skb); return -EMSGSIZE; } - if (cb->ifindex) { - /* direct route; use the hwaddr we stashed in sendmsg */ - if (cb->halen != skb->dev->addr_len) { + /* direct route; use the hwaddr we stashed in sendmsg */ + if (dst->halen) { + if (dst->halen != skb->dev->addr_len) { /* sanity check, sendmsg should have already caught this */ kfree_skb(skb); return -EMSGSIZE; } - daddr = cb->haddr; + daddr = dst->haddr; } else { /* If lookup fails let the device handle daddr==NULL */ - if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) + if (mctp_neigh_lookup(dst->dev, dst->nexthop, daddr_buf) == 0) daddr = daddr_buf; } @@ -599,7 +647,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) return -EHOSTUNREACH; } - mctp_flow_prepare_output(skb, route->dev); + mctp_flow_prepare_output(skb, dst->dev); rc = dev_queue_xmit(skb); if (rc) @@ -612,7 +660,8 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) static void mctp_route_release(struct mctp_route *rt) { if (refcount_dec_and_test(&rt->refs)) { - mctp_dev_put(rt->dev); + if (rt->dst_type == MCTP_ROUTE_DIRECT) + mctp_dev_put(rt->dev); kfree_rcu(rt, rcu); } } @@ -628,7 +677,7 @@ static struct mctp_route *mctp_route_alloc(void) INIT_LIST_HEAD(&rt->list); refcount_set(&rt->refs, 1); - rt->output = mctp_route_discard; + rt->output = mctp_dst_discard; return rt; } @@ -801,10 +850,16 @@ static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk, } /* routing lookups */ +static unsigned int mctp_route_netid(struct mctp_route *rt) +{ + return rt->dst_type == MCTP_ROUTE_DIRECT ? 
+ READ_ONCE(rt->dev->net) : rt->gateway.net; +} + static bool mctp_rt_match_eid(struct mctp_route *rt, unsigned int net, mctp_eid_t eid) { - return READ_ONCE(rt->dev->net) == net && + return mctp_route_netid(rt) == net && rt->min <= eid && rt->max >= eid; } @@ -813,54 +868,150 @@ static bool mctp_rt_compare_exact(struct mctp_route *rt1, struct mctp_route *rt2) { ASSERT_RTNL(); - return rt1->dev->net == rt2->dev->net && + return mctp_route_netid(rt1) == mctp_route_netid(rt2) && rt1->min == rt2->min && rt1->max == rt2->max; } -struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, - mctp_eid_t daddr) +/* must only be called on a direct route, as the final output hop */ +static void mctp_dst_from_route(struct mctp_dst *dst, mctp_eid_t eid, + unsigned int mtu, struct mctp_route *route) +{ + mctp_dev_hold(route->dev); + dst->nexthop = eid; + dst->dev = route->dev; + dst->mtu = READ_ONCE(dst->dev->dev->mtu); + if (mtu) + dst->mtu = min(dst->mtu, mtu); + dst->halen = 0; + dst->output = route->output; +} + +int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, + unsigned char halen, const unsigned char *haddr) { - struct mctp_route *tmp, *rt = NULL; + struct net_device *netdev; + struct mctp_dev *dev; + int rc = -ENOENT; + + if (halen > sizeof(dst->haddr)) + return -EINVAL; rcu_read_lock(); - list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { - /* TODO: add metrics */ - if (mctp_rt_match_eid(tmp, dnet, daddr)) { - if (refcount_inc_not_zero(&tmp->refs)) { - rt = tmp; - break; - } - } + netdev = dev_get_by_index_rcu(net, ifindex); + if (!netdev) + goto out_unlock; + + if (netdev->addr_len != halen) { + rc = -EINVAL; + goto out_unlock; } + dev = __mctp_dev_get(netdev); + if (!dev) + goto out_unlock; + + dst->dev = dev; + dst->mtu = READ_ONCE(netdev->mtu); + dst->halen = halen; + dst->output = mctp_dst_output; + dst->nexthop = 0; + memcpy(dst->haddr, haddr, halen); + + rc = 0; + +out_unlock: rcu_read_unlock(); + return rc; +} - return rt; +void mctp_dst_release(struct mctp_dst *dst) +{ + mctp_dev_put(dst->dev); +} + +static struct mctp_route *mctp_route_lookup_single(struct net *net, + unsigned int dnet, + mctp_eid_t daddr) +{ + struct mctp_route *rt; + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (mctp_rt_match_eid(rt, dnet, daddr)) + return rt; + } + + return NULL; } -static struct mctp_route *mctp_route_lookup_null(struct net *net, - struct net_device *dev) +/* populates *dst on successful lookup, if set */ +int mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr, struct mctp_dst *dst) { - struct mctp_route *tmp, *rt = NULL; + const unsigned int max_depth = 32; + unsigned int depth, mtu = 0; + int rc = -EHOSTUNREACH; rcu_read_lock(); - list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { - if (tmp->dev->dev == dev && tmp->type == RTN_LOCAL && - refcount_inc_not_zero(&tmp->refs)) { - rt = tmp; + for (depth = 0; depth < max_depth; depth++) { + struct mctp_route *rt; + + rt = mctp_route_lookup_single(net, dnet, daddr); + if (!rt) + break; + + /* clamp mtu to the smallest in the path, allowing 0 + * to specify no restrictions + */ + if (mtu && rt->mtu) + mtu = min(mtu, rt->mtu); + else + mtu = mtu ?: rt->mtu; + + if (rt->dst_type == MCTP_ROUTE_DIRECT) { + if (dst) + mctp_dst_from_route(dst, daddr, mtu, rt); + rc = 0; break; + + } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) { + daddr = rt->gateway.eid; } } rcu_read_unlock(); - return rt; + return rc; +} + +static int mctp_route_lookup_null(struct net *net, struct 
net_device *dev, + struct mctp_dst *dst) +{ + int rc = -EHOSTUNREACH; + struct mctp_route *rt; + + rcu_read_lock(); + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (rt->dst_type != MCTP_ROUTE_DIRECT || rt->type != RTN_LOCAL) + continue; + + if (rt->dev->dev != dev) + continue; + + mctp_dst_from_route(dst, 0, 0, rt); + rc = 0; + break; + } + + rcu_read_unlock(); + + return rc; } -static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, +static int mctp_do_fragment_route(struct mctp_dst *dst, struct sk_buff *skb, unsigned int mtu, u8 tag) { const unsigned int hlen = sizeof(struct mctp_hdr); @@ -933,7 +1084,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, skb_ext_copy(skb2, skb); /* do route */ - rc = rt->output(rt, skb2); + rc = dst->output(dst, skb2); if (rc) break; @@ -945,68 +1096,34 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, return rc; } -int mctp_local_output(struct sock *sk, struct mctp_route *rt, +int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); - struct mctp_skb_cb *cb = mctp_cb(skb); - struct mctp_route tmp_rt = {0}; struct mctp_sk_key *key; struct mctp_hdr *hdr; unsigned long flags; unsigned int netid; unsigned int mtu; mctp_eid_t saddr; - bool ext_rt; int rc; u8 tag; - rc = -ENODEV; - - if (rt) { - ext_rt = false; - if (WARN_ON(!rt->dev)) - goto out_release; - - } else if (cb->ifindex) { - struct net_device *dev; - - ext_rt = true; - rt = &tmp_rt; - - rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex); - if (!dev) { - rcu_read_unlock(); - goto out_free; - } - rt->dev = __mctp_dev_get(dev); - rcu_read_unlock(); - - if (!rt->dev) - goto out_release; + KUNIT_STATIC_STUB_REDIRECT(mctp_local_output, sk, dst, skb, daddr, + req_tag); - /* establish temporary route - we set up enough to keep - * mctp_route_output happy - */ - rt->output = mctp_route_output; - rt->mtu = 0; - - } else { - rc = -EINVAL; - goto out_free; - } + rc = -ENODEV; - spin_lock_irqsave(&rt->dev->addrs_lock, flags); - if (rt->dev->num_addrs == 0) { + spin_lock_irqsave(&dst->dev->addrs_lock, flags); + if (dst->dev->num_addrs == 0) { rc = -EHOSTUNREACH; } else { /* use the outbound interface's first address as our source */ - saddr = rt->dev->addrs[0]; + saddr = dst->dev->addrs[0]; rc = 0; } - spin_unlock_irqrestore(&rt->dev->addrs_lock, flags); - netid = READ_ONCE(rt->dev->net); + spin_unlock_irqrestore(&dst->dev->addrs_lock, flags); + netid = READ_ONCE(dst->dev->net); if (rc) goto out_release; @@ -1032,15 +1149,13 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, tag = req_tag & MCTP_TAG_MASK; } + skb->pkt_type = PACKET_OUTGOING; skb->protocol = htons(ETH_P_MCTP); skb->priority = 0; skb_reset_transport_header(skb); skb_push(skb, sizeof(struct mctp_hdr)); skb_reset_network_header(skb); - skb->dev = rt->dev->dev; - - /* cb->net will have been set on initial ingress */ - cb->src = saddr; + skb->dev = dst->dev->dev; /* set up common header fields */ hdr = mctp_hdr(skb); @@ -1048,73 +1163,64 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, hdr->dest = daddr; hdr->src = saddr; - mtu = mctp_route_mtu(rt); + mtu = dst->mtu; if (skb->len + sizeof(struct mctp_hdr) <= mtu) { hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | tag; - rc = rt->output(rt, skb); + rc = dst->output(dst, skb); } else { - rc = mctp_do_fragment_route(rt, skb, 
mtu, tag); + rc = mctp_do_fragment_route(dst, skb, mtu, tag); } /* route output functions consume the skb, even on error */ skb = NULL; out_release: - if (!ext_rt) - mctp_route_release(rt); - - mctp_dev_put(tmp_rt.dev); - -out_free: kfree_skb(skb); return rc; } /* route management */ -static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, - unsigned int daddr_extent, unsigned int mtu, - unsigned char type) + +/* mctp_route_add(): Add the provided route, previously allocated via + * mctp_route_alloc(). On success, takes ownership of @rt, which includes a + * hold on rt->dev for usage in the route table. On failure a caller will want + * to mctp_route_release(). + * + * We expect that the caller has set rt->type, rt->dst_type, rt->min, rt->max, + * rt->mtu and either rt->dev (with a reference held appropriately) or + * rt->gateway. Other fields will be populated. + */ +static int mctp_route_add(struct net *net, struct mctp_route *rt) { - int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb); - struct net *net = dev_net(mdev->dev); - struct mctp_route *rt, *ert; + struct mctp_route *ert; - if (!mctp_address_unicast(daddr_start)) + if (!mctp_address_unicast(rt->min) || !mctp_address_unicast(rt->max)) return -EINVAL; - if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) + if (rt->dst_type == MCTP_ROUTE_DIRECT && !rt->dev) + return -EINVAL; + + if (rt->dst_type == MCTP_ROUTE_GATEWAY && !rt->gateway.eid) return -EINVAL; - switch (type) { + switch (rt->type) { case RTN_LOCAL: - rtfn = mctp_route_input; + rt->output = mctp_dst_input; break; case RTN_UNICAST: - rtfn = mctp_route_output; + rt->output = mctp_dst_output; break; default: return -EINVAL; } - rt = mctp_route_alloc(); - if (!rt) - return -ENOMEM; - - rt->min = daddr_start; - rt->max = daddr_start + daddr_extent; - rt->mtu = mtu; - rt->dev = mdev; - mctp_dev_hold(rt->dev); - rt->type = type; - rt->output = rtfn; - ASSERT_RTNL(); + /* Prevent duplicate identical routes. 
*/ list_for_each_entry(ert, &net->mctp.routes, list) { if (mctp_rt_compare_exact(rt, ert)) { - mctp_route_release(rt); return -EEXIST; } } @@ -1124,10 +1230,10 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, return 0; } -static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start, - unsigned int daddr_extent, unsigned char type) +static int mctp_route_remove(struct net *net, unsigned int netid, + mctp_eid_t daddr_start, unsigned int daddr_extent, + unsigned char type) { - struct net *net = dev_net(mdev->dev); struct mctp_route *rt, *tmp; mctp_eid_t daddr_end; bool dropped; @@ -1141,7 +1247,7 @@ static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start, ASSERT_RTNL(); list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { - if (rt->dev == mdev && + if (mctp_route_netid(rt) == netid && rt->min == daddr_start && rt->max == daddr_end && rt->type == type) { list_del_rcu(&rt->list); @@ -1156,12 +1262,32 @@ static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start, int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr) { - return mctp_route_add(mdev, addr, 0, 0, RTN_LOCAL); + struct mctp_route *rt; + int rc; + + rt = mctp_route_alloc(); + if (!rt) + return -ENOMEM; + + rt->min = addr; + rt->max = addr; + rt->dst_type = MCTP_ROUTE_DIRECT; + rt->dev = mdev; + rt->type = RTN_LOCAL; + + mctp_dev_hold(rt->dev); + + rc = mctp_route_add(dev_net(mdev->dev), rt); + if (rc) + mctp_route_release(rt); + + return rc; } int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr) { - return mctp_route_remove(mdev, addr, 0, RTN_LOCAL); + return mctp_route_remove(dev_net(mdev->dev), mdev->net, + addr, 0, RTN_LOCAL); } /* removes all entries for a given device */ @@ -1172,7 +1298,7 @@ void mctp_route_remove_dev(struct mctp_dev *mdev) ASSERT_RTNL(); list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { - if (rt->dev == mdev) { + if (rt->dst_type == MCTP_ROUTE_DIRECT && rt->dev == mdev) { list_del_rcu(&rt->list); /* TODO: immediate RTM_DELROUTE */ mctp_route_release(rt); @@ -1189,8 +1315,9 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, struct net *net = dev_net(dev); struct mctp_dev *mdev; struct mctp_skb_cb *cb; - struct mctp_route *rt; + struct mctp_dst dst; struct mctp_hdr *mh; + int rc; rcu_read_lock(); mdev = __mctp_dev_get(dev); @@ -1232,17 +1359,17 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, cb->net = READ_ONCE(mdev->net); cb->ifindex = dev->ifindex; - rt = mctp_route_lookup(net, cb->net, mh->dest); + rc = mctp_route_lookup(net, cb->net, mh->dest, &dst); /* NULL EID, but addressed to our physical address */ - if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST) - rt = mctp_route_lookup_null(net, dev); + if (rc && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST) + rc = mctp_route_lookup_null(net, dev, &dst); - if (!rt) + if (rc) goto err_drop; - rt->output(rt, skb); - mctp_route_release(rt); + dst.output(&dst, skb); + mctp_dst_release(&dst); mctp_dev_put(mdev); return NET_RX_SUCCESS; @@ -1264,19 +1391,28 @@ static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = { [RTA_DST] = { .type = NLA_U8 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_OIF] = { .type = NLA_U32 }, + [RTA_GATEWAY] = NLA_POLICY_EXACT_LEN(sizeof(struct mctp_fq_addr)), +}; + +static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = { + [RTAX_MTU] = { .type = NLA_U32 }, }; -/* Common part for RTM_NEWROUTE and RTM_DELROUTE 
parsing. - * tb must hold RTA_MAX+1 elements. +/* base parsing; common to both _lookup and _populate variants. + * + * For gateway routes (which have a RTA_GATEWAY, and no RTA_OIF), we populate + * *gatweayp. for direct routes (RTA_OIF, no RTA_GATEWAY), we populate *mdev. */ -static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack, - struct nlattr **tb, struct rtmsg **rtm, - struct mctp_dev **mdev, mctp_eid_t *daddr_start) +static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr **tb, struct rtmsg **rtm, + struct mctp_dev **mdev, + struct mctp_fq_addr *gatewayp, + mctp_eid_t *daddr_start) { - struct net *net = sock_net(skb->sk); + struct mctp_fq_addr *gateway = NULL; + unsigned int ifindex = 0; struct net_device *dev; - unsigned int ifindex; int rc; rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX, @@ -1292,11 +1428,44 @@ static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh, } *daddr_start = nla_get_u8(tb[RTA_DST]); - if (!tb[RTA_OIF]) { - NL_SET_ERR_MSG(extack, "ifindex missing"); + if (tb[RTA_OIF]) + ifindex = nla_get_u32(tb[RTA_OIF]); + + if (tb[RTA_GATEWAY]) + gateway = nla_data(tb[RTA_GATEWAY]); + + if (ifindex && gateway) { + NL_SET_ERR_MSG(extack, + "cannot specify both ifindex and gateway"); + return -EINVAL; + + } else if (ifindex) { + dev = __dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "bad ifindex"); + return -ENODEV; + } + *mdev = mctp_dev_get_rtnl(dev); + if (!*mdev) + return -ENODEV; + gatewayp->eid = 0; + + } else if (gateway) { + if (!mctp_address_unicast(gateway->eid)) { + NL_SET_ERR_MSG(extack, "bad gateway"); + return -EINVAL; + } + + gatewayp->eid = gateway->eid; + gatewayp->net = gateway->net != MCTP_NET_ANY ? + gateway->net : + READ_ONCE(net->mctp.default_net); + *mdev = NULL; + + } else { + NL_SET_ERR_MSG(extack, "no route output provided"); return -EINVAL; } - ifindex = nla_get_u32(tb[RTA_OIF]); *rtm = nlmsg_data(nlh); if ((*rtm)->rtm_family != AF_MCTP) { @@ -1304,82 +1473,157 @@ static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } - dev = __dev_get_by_index(net, ifindex); - if (!dev) { - NL_SET_ERR_MSG(extack, "bad ifindex"); - return -ENODEV; - } - *mdev = mctp_dev_get_rtnl(dev); - if (!*mdev) - return -ENODEV; - - if (dev->flags & IFF_LOOPBACK) { - NL_SET_ERR_MSG(extack, "no routes to loopback"); + if ((*rtm)->rtm_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST"); return -EINVAL; } return 0; } -static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = { - [RTAX_MTU] = { .type = NLA_U32 }, -}; - -static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +/* Route parsing for lookup operations; we only need the "route target" + * components (ie., network and dest-EID range). 
+ */ +static int mctp_route_nlparse_lookup(struct net *net, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + unsigned char *type, unsigned int *netid, + mctp_eid_t *daddr_start, + unsigned int *daddr_extent) { struct nlattr *tb[RTA_MAX + 1]; + struct mctp_fq_addr gw; + struct mctp_dev *mdev; + struct rtmsg *rtm; + int rc; + + rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm, + &mdev, &gw, daddr_start); + if (rc) + return rc; + + if (mdev) { + *netid = mdev->net; + } else if (gw.eid) { + *netid = gw.net; + } else { + /* bug: _nlparse_common should not allow this */ + return -1; + } + + *type = rtm->rtm_type; + *daddr_extent = rtm->rtm_dst_len; + + return 0; +} + +/* Full route parse for RTM_NEWROUTE: populate @rt. On success, + * MCTP_ROUTE_DIRECT routes (ie, those with a direct dev) will hold a reference + * to that dev. + */ +static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct mctp_route *rt) +{ struct nlattr *tbx[RTAX_MAX + 1]; + struct nlattr *tb[RTA_MAX + 1]; + unsigned int daddr_extent; + struct mctp_fq_addr gw; mctp_eid_t daddr_start; - struct mctp_dev *mdev; + struct mctp_dev *dev; struct rtmsg *rtm; - unsigned int mtu; + u32 mtu = 0; int rc; - rc = mctp_route_nlparse(skb, nlh, extack, tb, - &rtm, &mdev, &daddr_start); - if (rc < 0) + rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm, + &dev, &gw, &daddr_start); + if (rc) return rc; - if (rtm->rtm_type != RTN_UNICAST) { - NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST"); + daddr_extent = rtm->rtm_dst_len; + + if (daddr_extent > 0xff || daddr_extent + daddr_start >= 255) { + NL_SET_ERR_MSG(extack, "invalid eid range"); return -EINVAL; } - mtu = 0; if (tb[RTA_METRICS]) { rc = nla_parse_nested(tbx, RTAX_MAX, tb[RTA_METRICS], rta_metrics_policy, NULL); - if (rc < 0) + if (rc < 0) { + NL_SET_ERR_MSG(extack, "incorrect RTA_METRICS format"); return rc; + } if (tbx[RTAX_MTU]) mtu = nla_get_u32(tbx[RTAX_MTU]); } - rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu, - rtm->rtm_type); + rt->type = rtm->rtm_type; + rt->min = daddr_start; + rt->max = daddr_start + daddr_extent; + rt->mtu = mtu; + if (gw.eid) { + rt->dst_type = MCTP_ROUTE_GATEWAY; + rt->gateway.eid = gw.eid; + rt->gateway.net = gw.net; + } else { + rt->dst_type = MCTP_ROUTE_DIRECT; + rt->dev = dev; + mctp_dev_hold(rt->dev); + } + + return 0; +} + +static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct mctp_route *rt; + int rc; + + rt = mctp_route_alloc(); + if (!rt) + return -ENOMEM; + + rc = mctp_route_nlparse_populate(net, nlh, extack, rt); + if (rc < 0) + goto err_free; + + if (rt->dst_type == MCTP_ROUTE_DIRECT && + rt->dev->dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, "no routes to loopback"); + rc = -EINVAL; + goto err_free; + } + + rc = mctp_route_add(net, rt); + if (!rc) + return 0; + +err_free: + mctp_route_release(rt); return rc; } static int mctp_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct nlattr *tb[RTA_MAX + 1]; + struct net *net = sock_net(skb->sk); + unsigned int netid, daddr_extent; + unsigned char type = RTN_UNSPEC; mctp_eid_t daddr_start; - struct mctp_dev *mdev; - struct rtmsg *rtm; int rc; - rc = mctp_route_nlparse(skb, nlh, extack, tb, - &rtm, &mdev, &daddr_start); + rc = mctp_route_nlparse_lookup(net, nlh, extack, &type, &netid, + &daddr_start, &daddr_extent); if (rc < 0) return rc; /* we only 
have unicast routes */ - if (rtm->rtm_type != RTN_UNICAST) + if (type != RTN_UNICAST) return -EINVAL; - rc = mctp_route_remove(mdev, daddr_start, rtm->rtm_dst_len, RTN_UNICAST); + rc = mctp_route_remove(net, netid, daddr_start, daddr_extent, type); return rc; } @@ -1405,7 +1649,6 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, hdr->rtm_tos = 0; hdr->rtm_table = RT_TABLE_DEFAULT; hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */ - hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */ hdr->rtm_type = rt->type; if (nla_put_u8(skb, RTA_DST, rt->min)) @@ -1422,13 +1665,17 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, nla_nest_end(skb, metrics); - if (rt->dev) { + if (rt->dst_type == MCTP_ROUTE_DIRECT) { + hdr->rtm_scope = RT_SCOPE_LINK; if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex)) goto cancel; + } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) { + hdr->rtm_scope = RT_SCOPE_UNIVERSE; + if (nla_put(skb, RTA_GATEWAY, + sizeof(rt->gateway), &rt->gateway)) + goto cancel; } - /* TODO: conditional neighbour physaddr? */ - nlmsg_end(skb, nlh); return 0; @@ -1475,7 +1722,7 @@ static int __net_init mctp_routes_net_init(struct net *net) struct netns_mctp *ns = &net->mctp; INIT_LIST_HEAD(&ns->routes); - INIT_HLIST_HEAD(&ns->binds); + hash_init(ns->binds); mutex_init(&ns->bind_lock); INIT_HLIST_HEAD(&ns->keys); spin_lock_init(&ns->keys_lock); diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 06c1897b685a8..fb6b46a952cb4 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -2,132 +2,11 @@ #include <kunit/test.h> -#include "utils.h" - -struct mctp_test_route { - struct mctp_route rt; - struct sk_buff_head pkts; -}; - -static int mctp_test_route_output(struct mctp_route *rt, struct sk_buff *skb) -{ - struct mctp_test_route *test_rt = container_of(rt, struct mctp_test_route, rt); - - skb_queue_tail(&test_rt->pkts, skb); - - return 0; -} - -/* local version of mctp_route_alloc() */ -static struct mctp_test_route *mctp_route_test_alloc(void) -{ - struct mctp_test_route *rt; +/* keep clangd happy when compiled outside of the route.c include */ +#include <net/mctp.h> +#include <net/mctpdevice.h> - rt = kzalloc(sizeof(*rt), GFP_KERNEL); - if (!rt) - return NULL; - - INIT_LIST_HEAD(&rt->rt.list); - refcount_set(&rt->rt.refs, 1); - rt->rt.output = mctp_test_route_output; - - skb_queue_head_init(&rt->pkts); - - return rt; -} - -static struct mctp_test_route *mctp_test_create_route(struct net *net, - struct mctp_dev *dev, - mctp_eid_t eid, - unsigned int mtu) -{ - struct mctp_test_route *rt; - - rt = mctp_route_test_alloc(); - if (!rt) - return NULL; - - rt->rt.min = eid; - rt->rt.max = eid; - rt->rt.mtu = mtu; - rt->rt.type = RTN_UNSPEC; - if (dev) - mctp_dev_hold(dev); - rt->rt.dev = dev; - - list_add_rcu(&rt->rt.list, &net->mctp.routes); - - return rt; -} - -static void mctp_test_route_destroy(struct kunit *test, - struct mctp_test_route *rt) -{ - unsigned int refs; - - rtnl_lock(); - list_del_rcu(&rt->rt.list); - rtnl_unlock(); - - skb_queue_purge(&rt->pkts); - if (rt->rt.dev) - mctp_dev_put(rt->rt.dev); - - refs = refcount_read(&rt->rt.refs); - KUNIT_ASSERT_EQ_MSG(test, refs, 1, "route ref imbalance"); - - kfree_rcu(&rt->rt, rcu); -} - -static void mctp_test_skb_set_dev(struct sk_buff *skb, - struct mctp_test_dev *dev) -{ - struct mctp_skb_cb *cb; - - cb = mctp_cb(skb); - cb->net = READ_ONCE(dev->mdev->net); - skb->dev = dev->ndev; -} - -static struct sk_buff 
*mctp_test_create_skb(const struct mctp_hdr *hdr, - unsigned int data_len) -{ - size_t hdr_len = sizeof(*hdr); - struct sk_buff *skb; - unsigned int i; - u8 *buf; - - skb = alloc_skb(hdr_len + data_len, GFP_KERNEL); - if (!skb) - return NULL; - - __mctp_cb(skb); - memcpy(skb_put(skb, hdr_len), hdr, hdr_len); - - buf = skb_put(skb, data_len); - for (i = 0; i < data_len; i++) - buf[i] = i & 0xff; - - return skb; -} - -static struct sk_buff *__mctp_test_create_skb_data(const struct mctp_hdr *hdr, - const void *data, - size_t data_len) -{ - size_t hdr_len = sizeof(*hdr); - struct sk_buff *skb; - - skb = alloc_skb(hdr_len + data_len, GFP_KERNEL); - if (!skb) - return NULL; - - __mctp_cb(skb); - memcpy(skb_put(skb, hdr_len), hdr, hdr_len); - memcpy(skb_put(skb, data_len), data, data_len); - - return skb; -} +#include "utils.h" #define mctp_test_create_skb_data(h, d) \ __mctp_test_create_skb_data(h, d, sizeof(*d)) @@ -141,8 +20,10 @@ struct mctp_frag_test { static void mctp_test_fragment(struct kunit *test) { const struct mctp_frag_test *params; + struct mctp_test_pktqueue tpq; int rc, i, n, mtu, msgsize; - struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct mctp_dst dst; struct sk_buff *skb; struct mctp_hdr hdr; u8 seq; @@ -159,13 +40,15 @@ static void mctp_test_fragment(struct kunit *test) skb = mctp_test_create_skb(&hdr, msgsize); KUNIT_ASSERT_TRUE(test, skb); - rt = mctp_test_create_route(&init_net, NULL, 10, mtu); - KUNIT_ASSERT_TRUE(test, rt); + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + mctp_test_dst_setup(test, &dst, dev, &tpq, mtu); - rc = mctp_do_fragment_route(&rt->rt, skb, mtu, MCTP_TAG_OWNER); + rc = mctp_do_fragment_route(&dst, skb, mtu, MCTP_TAG_OWNER); KUNIT_EXPECT_FALSE(test, rc); - n = rt->pkts.qlen; + n = tpq.pkts.qlen; KUNIT_EXPECT_EQ(test, n, params->n_frags); @@ -178,7 +61,7 @@ static void mctp_test_fragment(struct kunit *test) first = i == 0; last = i == (n - 1); - skb2 = skb_dequeue(&rt->pkts); + skb2 = skb_dequeue(&tpq.pkts); if (!skb2) break; @@ -216,7 +99,8 @@ static void mctp_test_fragment(struct kunit *test) kfree_skb(skb2); } - mctp_test_route_destroy(test, rt); + mctp_test_dst_release(&dst, &tpq); + mctp_test_destroy_dev(dev); } static const struct mctp_frag_test mctp_frag_tests[] = { @@ -246,25 +130,30 @@ struct mctp_rx_input_test { static void mctp_test_rx_input(struct kunit *test) { const struct mctp_rx_input_test *params; + struct mctp_test_pktqueue tpq; struct mctp_test_route *rt; struct mctp_test_dev *dev; struct sk_buff *skb; params = test->param_value; + test->priv = &tpq; dev = mctp_test_create_dev(); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); - rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); + rt = mctp_test_create_route_direct(&init_net, dev->mdev, 8, 68); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); skb = mctp_test_create_skb(¶ms->hdr, 1); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + mctp_test_pktqueue_init(&tpq); + mctp_pkttype_receive(skb, dev->ndev, &mctp_packet_type, NULL); - KUNIT_EXPECT_EQ(test, !!rt->pkts.qlen, params->input); + KUNIT_EXPECT_EQ(test, !!tpq.pkts.qlen, params->input); + skb_queue_purge(&tpq.pkts); mctp_test_route_destroy(test, rt); mctp_test_destroy_dev(dev); } @@ -292,12 +181,12 @@ KUNIT_ARRAY_PARAM(mctp_rx_input, mctp_rx_input_tests, /* set up a local dev, route on EID 8, and a socket listening on type 0 */ static void __mctp_route_test_init(struct kunit *test, struct mctp_test_dev **devp, - struct mctp_test_route **rtp, + struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq, 
struct socket **sockp, unsigned int netid) { struct sockaddr_mctp addr = {0}; - struct mctp_test_route *rt; struct mctp_test_dev *dev; struct socket *sock; int rc; @@ -307,8 +196,7 @@ static void __mctp_route_test_init(struct kunit *test, if (netid != MCTP_NET_ANY) WRITE_ONCE(dev->mdev->net, netid); - rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + mctp_test_dst_setup(test, dst, dev, tpq, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -320,18 +208,18 @@ static void __mctp_route_test_init(struct kunit *test, rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr)); KUNIT_ASSERT_EQ(test, rc, 0); - *rtp = rt; *devp = dev; *sockp = sock; } static void __mctp_route_test_fini(struct kunit *test, struct mctp_test_dev *dev, - struct mctp_test_route *rt, + struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq, struct socket *sock) { sock_release(sock); - mctp_test_route_destroy(test, rt); + mctp_test_dst_release(dst, tpq); mctp_test_destroy_dev(dev); } @@ -344,22 +232,24 @@ struct mctp_route_input_sk_test { static void mctp_test_route_input_sk(struct kunit *test) { const struct mctp_route_input_sk_test *params; + struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; - struct mctp_test_route *rt; struct mctp_test_dev *dev; + struct mctp_dst dst; struct socket *sock; int rc; params = test->param_value; - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); skb = mctp_test_create_skb_data(¶ms->hdr, ¶ms->type); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); mctp_test_skb_set_dev(skb, dev); + mctp_test_pktqueue_init(&tpq); - rc = mctp_route_input(&rt->rt, skb); + rc = mctp_dst_input(&dst, skb); if (params->deliver) { KUNIT_EXPECT_EQ(test, rc, 0); @@ -376,7 +266,7 @@ static void mctp_test_route_input_sk(struct kunit *test) KUNIT_EXPECT_NULL(test, skb2); } - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } #define FL_S (MCTP_HDR_FLAG_SOM) @@ -413,16 +303,17 @@ struct mctp_route_input_sk_reasm_test { static void mctp_test_route_input_sk_reasm(struct kunit *test) { const struct mctp_route_input_sk_reasm_test *params; + struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; - struct mctp_test_route *rt; struct mctp_test_dev *dev; + struct mctp_dst dst; struct socket *sock; int i, rc; u8 c; params = test->param_value; - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); for (i = 0; i < params->n_hdrs; i++) { c = i; @@ -431,7 +322,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) mctp_test_skb_set_dev(skb, dev); - rc = mctp_route_input(&rt->rt, skb); + rc = mctp_dst_input(&dst, skb); } skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc); @@ -445,7 +336,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) KUNIT_EXPECT_NULL(test, skb2); } - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } #define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_TO | (f) | ((s) << MCTP_HDR_SEQ_SHIFT)) @@ -547,7 +438,7 @@ struct mctp_route_input_sk_keys_test { static void mctp_test_route_input_sk_keys(struct kunit *test) { const struct mctp_route_input_sk_keys_test *params; - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_sk_key *key; @@ -555,6 +446,7 
@@ static void mctp_test_route_input_sk_keys(struct kunit *test) struct mctp_sock *msk; struct socket *sock; unsigned long flags; + struct mctp_dst dst; unsigned int net; int rc; u8 c; @@ -565,8 +457,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); net = READ_ONCE(dev->mdev->net); - rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + mctp_test_dst_setup(test, &dst, dev, &tpq, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -592,7 +483,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) mctp_test_skb_set_dev(skb, dev); - rc = mctp_route_input(&rt->rt, skb); + rc = mctp_dst_input(&dst, skb); /* (potentially) receive message */ skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc); @@ -606,7 +497,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) skb_free_datagram(sock->sk, skb2); mctp_key_unref(key); - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } static const struct mctp_route_input_sk_keys_test mctp_route_input_sk_keys_tests[] = { @@ -681,7 +572,8 @@ KUNIT_ARRAY_PARAM(mctp_route_input_sk_keys, mctp_route_input_sk_keys_tests, struct test_net { unsigned int netid; struct mctp_test_dev *dev; - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; + struct mctp_dst dst; struct socket *sock; struct sk_buff *skb; struct mctp_sk_key *key; @@ -699,18 +591,20 @@ mctp_test_route_input_multiple_nets_bind_init(struct kunit *test, t->msg.data = t->netid; - __mctp_route_test_init(test, &t->dev, &t->rt, &t->sock, t->netid); + __mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock, + t->netid); t->skb = mctp_test_create_skb_data(&hdr, &t->msg); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->skb); mctp_test_skb_set_dev(t->skb, t->dev); + mctp_test_pktqueue_init(&t->tpq); } static void mctp_test_route_input_multiple_nets_bind_fini(struct kunit *test, struct test_net *t) { - __mctp_route_test_fini(test, t->dev, t->rt, t->sock); + __mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock); } /* Test that skbs from different nets (otherwise identical) get routed to their @@ -731,9 +625,9 @@ static void mctp_test_route_input_multiple_nets_bind(struct kunit *test) mctp_test_route_input_multiple_nets_bind_init(test, &t1); mctp_test_route_input_multiple_nets_bind_init(test, &t2); - rc = mctp_route_input(&t1.rt->rt, t1.skb); + rc = mctp_dst_input(&t1.dst, t1.skb); KUNIT_ASSERT_EQ(test, rc, 0); - rc = mctp_route_input(&t2.rt->rt, t2.skb); + rc = mctp_dst_input(&t2.dst, t2.skb); KUNIT_ASSERT_EQ(test, rc, 0); rx_skb1 = skb_recv_datagram(t1.sock->sk, MSG_DONTWAIT, &rc); @@ -767,7 +661,8 @@ mctp_test_route_input_multiple_nets_key_init(struct kunit *test, t->msg.data = t->netid; - __mctp_route_test_init(test, &t->dev, &t->rt, &t->sock, t->netid); + __mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock, + t->netid); msk = container_of(t->sock->sk, struct mctp_sock, sk); @@ -790,7 +685,7 @@ mctp_test_route_input_multiple_nets_key_fini(struct kunit *test, struct test_net *t) { mctp_key_unref(t->key); - __mctp_route_test_fini(test, t->dev, t->rt, t->sock); + __mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock); } /* test that skbs from different nets (otherwise identical) get routed to their @@ -812,9 +707,9 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) mctp_test_route_input_multiple_nets_key_init(test, &t1); 
mctp_test_route_input_multiple_nets_key_init(test, &t2); - rc = mctp_route_input(&t1.rt->rt, t1.skb); + rc = mctp_dst_input(&t1.dst, t1.skb); KUNIT_ASSERT_EQ(test, rc, 0); - rc = mctp_route_input(&t2.rt->rt, t2.skb); + rc = mctp_dst_input(&t2.dst, t2.skb); KUNIT_ASSERT_EQ(test, rc, 0); rx_skb1 = skb_recv_datagram(t1.sock->sk, MSG_DONTWAIT, &rc); @@ -843,13 +738,14 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) static void mctp_test_route_input_sk_fail_single(struct kunit *test) { const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO); - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; + struct mctp_dst dst; struct socket *sock; struct sk_buff *skb; int rc; - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); /* No rcvbuf space, so delivery should fail. __sock_set_rcvbuf will * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this. @@ -865,14 +761,14 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test) mctp_test_skb_set_dev(skb, dev); /* do route input, which should fail */ - rc = mctp_route_input(&rt->rt, skb); + rc = mctp_dst_input(&dst, skb); KUNIT_EXPECT_NE(test, rc, 0); /* we should hold the only reference to skb */ KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1); kfree_skb(skb); - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } /* Input route to socket, using a fragmented message, where sock delivery fails. @@ -880,14 +776,15 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test) static void mctp_test_route_input_sk_fail_frag(struct kunit *test) { const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) }; - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct sk_buff *skbs[2]; + struct mctp_dst dst; struct socket *sock; unsigned int i; int rc; - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); lock_sock(sock->sk); WRITE_ONCE(sock->sk->sk_rcvbuf, 0); @@ -904,11 +801,11 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test) /* first route input should succeed, we're only queueing to the * frag list */ - rc = mctp_route_input(&rt->rt, skbs[0]); + rc = mctp_dst_input(&dst, skbs[0]); KUNIT_EXPECT_EQ(test, rc, 0); /* final route input should fail to deliver to the socket */ - rc = mctp_route_input(&rt->rt, skbs[1]); + rc = mctp_dst_input(&dst, skbs[1]); KUNIT_EXPECT_NE(test, rc, 0); /* we should hold the only reference to both skbs */ @@ -918,7 +815,7 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test) KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1); kfree_skb(skbs[1]); - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } /* Input route to socket, using a fragmented message created from clones. 
@@ -933,23 +830,22 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1), }; - struct mctp_test_route *rt; + const size_t data_len = 3; /* arbitrary */ + u8 compare[3 * ARRAY_SIZE(hdrs)]; + u8 flat[3 * ARRAY_SIZE(hdrs)]; + struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct sk_buff *skb[5]; struct sk_buff *rx_skb; + struct mctp_dst dst; struct socket *sock; - size_t data_len; - u8 compare[100]; - u8 flat[100]; size_t total; void *p; int rc; - /* Arbitrary length */ - data_len = 3; total = data_len + sizeof(struct mctp_hdr); - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); /* Create a single skb initially with concatenated packets */ skb[0] = mctp_test_create_skb(&hdrs[0], 5 * total); @@ -988,7 +884,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) /* Feed the fragments into MCTP core */ for (int i = 0; i < 5; i++) { - rc = mctp_route_input(&rt->rt, skb[i]); + rc = mctp_dst_input(&dst, skb[i]); KUNIT_EXPECT_EQ(test, rc, 0); } @@ -1026,29 +922,29 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) kfree_skb(skb[i]); } - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } #if IS_ENABLED(CONFIG_MCTP_FLOWS) static void mctp_test_flow_init(struct kunit *test, struct mctp_test_dev **devp, - struct mctp_test_route **rtp, + struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq, struct socket **sock, struct sk_buff **skbp, unsigned int len) { - struct mctp_test_route *rt; struct mctp_test_dev *dev; struct sk_buff *skb; /* we have a slightly odd routing setup here; the test route * is for EID 8, which is our local EID. We don't do a routing * lookup, so that's fine - all we require is a path through - * mctp_local_output, which will call rt->output on whatever + * mctp_local_output, which will call dst->output on whatever * route we provide */ - __mctp_route_test_init(test, &dev, &rt, sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, dst, tpq, sock, MCTP_NET_ANY); /* Assign a single EID. 
->addrs is freed on mctp netdev release */ dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL); @@ -1061,42 +957,41 @@ static void mctp_test_flow_init(struct kunit *test, skb_reserve(skb, sizeof(struct mctp_hdr) + 1); memset(skb_put(skb, len), 0, len); - /* take a ref for the route, we'll decrement in local output */ - refcount_inc(&rt->rt.refs); *devp = dev; - *rtp = rt; *skbp = skb; } static void mctp_test_flow_fini(struct kunit *test, struct mctp_test_dev *dev, - struct mctp_test_route *rt, + struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq, struct socket *sock) { - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, dst, tpq, sock); } /* test that an outgoing skb has the correct MCTP extension data set */ static void mctp_test_packet_flow(struct kunit *test) { + struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; - struct mctp_test_route *rt; struct mctp_test_dev *dev; + struct mctp_dst dst; struct mctp_flow *flow; struct socket *sock; - u8 dst = 8; + u8 dst_eid = 8; int n, rc; - mctp_test_flow_init(test, &dev, &rt, &sock, &skb, 30); + mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 30); - rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); - n = rt->pkts.qlen; + n = tpq.pkts.qlen; KUNIT_ASSERT_EQ(test, n, 1); - skb2 = skb_dequeue(&rt->pkts); + skb2 = skb_dequeue(&tpq.pkts); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb2); flow = skb_ext_find(skb2, SKB_EXT_MCTP); @@ -1105,7 +1000,7 @@ static void mctp_test_packet_flow(struct kunit *test) KUNIT_ASSERT_PTR_EQ(test, flow->key->sk, sock->sk); kfree_skb(skb2); - mctp_test_flow_fini(test, dev, rt, sock); + mctp_test_flow_fini(test, dev, &dst, &tpq, sock); } /* test that outgoing skbs, after fragmentation, all have the correct MCTP @@ -1113,26 +1008,27 @@ static void mctp_test_packet_flow(struct kunit *test) */ static void mctp_test_fragment_flow(struct kunit *test) { + struct mctp_test_pktqueue tpq; struct mctp_flow *flows[2]; struct sk_buff *tx_skbs[2]; - struct mctp_test_route *rt; struct mctp_test_dev *dev; + struct mctp_dst dst; struct sk_buff *skb; struct socket *sock; - u8 dst = 8; + u8 dst_eid = 8; int n, rc; - mctp_test_flow_init(test, &dev, &rt, &sock, &skb, 100); + mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 100); - rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); - n = rt->pkts.qlen; + n = tpq.pkts.qlen; KUNIT_ASSERT_EQ(test, n, 2); /* both resulting packets should have the same flow data */ - tx_skbs[0] = skb_dequeue(&rt->pkts); - tx_skbs[1] = skb_dequeue(&rt->pkts); + tx_skbs[0] = skb_dequeue(&tpq.pkts); + tx_skbs[1] = skb_dequeue(&tpq.pkts); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[0]); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[1]); @@ -1148,7 +1044,7 @@ static void mctp_test_fragment_flow(struct kunit *test) kfree_skb(tx_skbs[0]); kfree_skb(tx_skbs[1]); - mctp_test_flow_fini(test, dev, rt, sock); + mctp_test_flow_fini(test, dev, &dst, &tpq, sock); } #else @@ -1166,15 +1062,16 @@ static void mctp_test_fragment_flow(struct kunit *test) /* Test that outgoing skbs cause a suitable tag to be created */ static void mctp_test_route_output_key_create(struct kunit *test) { + const u8 dst_eid = 26, src_eid = 15; + struct mctp_test_pktqueue tpq; const unsigned int netid = 50; - const u8 dst = 26, src = 15; - struct mctp_test_route *rt; struct 
mctp_test_dev *dev; struct mctp_sk_key *key; struct netns_mctp *mns; unsigned long flags; struct socket *sock; struct sk_buff *skb; + struct mctp_dst dst; bool empty, single; const int len = 2; int rc; @@ -1183,15 +1080,14 @@ static void mctp_test_route_output_key_create(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); WRITE_ONCE(dev->mdev->net, netid); - rt = mctp_test_create_route(&init_net, dev->mdev, dst, 68); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + mctp_test_dst_setup(test, &dst, dev, &tpq, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL); dev->mdev->num_addrs = 1; - dev->mdev->addrs[0] = src; + dev->mdev->addrs[0] = src_eid; skb = alloc_skb(sizeof(struct mctp_hdr) + 1 + len, GFP_KERNEL); KUNIT_ASSERT_TRUE(test, skb); @@ -1199,8 +1095,6 @@ static void mctp_test_route_output_key_create(struct kunit *test) skb_reserve(skb, sizeof(struct mctp_hdr) + 1 + len); memset(skb_put(skb, len), 0, len); - refcount_inc(&rt->rt.refs); - mns = &sock_net(sock->sk)->mctp; /* We assume we're starting from an empty keys list, which requires @@ -1211,7 +1105,7 @@ static void mctp_test_route_output_key_create(struct kunit *test) spin_unlock_irqrestore(&mns->keys_lock, flags); KUNIT_ASSERT_TRUE(test, empty); - rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); key = NULL; @@ -1227,16 +1121,480 @@ static void mctp_test_route_output_key_create(struct kunit *test) KUNIT_ASSERT_TRUE(test, single); KUNIT_EXPECT_EQ(test, key->net, netid); - KUNIT_EXPECT_EQ(test, key->local_addr, src); - KUNIT_EXPECT_EQ(test, key->peer_addr, dst); + KUNIT_EXPECT_EQ(test, key->local_addr, src_eid); + KUNIT_EXPECT_EQ(test, key->peer_addr, dst_eid); /* key has incoming tag, so inverse of what we sent */ KUNIT_EXPECT_FALSE(test, key->tag & MCTP_TAG_OWNER); sock_release(sock); - mctp_test_route_destroy(test, rt); + mctp_test_dst_release(&dst, &tpq); mctp_test_destroy_dev(dev); } +static void mctp_test_route_extaddr_input(struct kunit *test) +{ + static const unsigned char haddr[] = { 0xaa, 0x55 }; + struct mctp_test_pktqueue tpq; + struct mctp_skb_cb *cb, *cb2; + const unsigned int len = 40; + struct mctp_test_dev *dev; + struct sk_buff *skb, *skb2; + struct mctp_dst dst; + struct mctp_hdr hdr; + struct socket *sock; + int rc; + + hdr.ver = 1; + hdr.src = 10; + hdr.dest = 8; + hdr.flags_seq_tag = FL_S | FL_E | FL_TO; + + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + + skb = mctp_test_create_skb(&hdr, len); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + /* set our hardware addressing data */ + cb = mctp_cb(skb); + memcpy(cb->haddr, haddr, sizeof(haddr)); + cb->halen = sizeof(haddr); + + mctp_test_skb_set_dev(skb, dev); + + rc = mctp_dst_input(&dst, skb); + KUNIT_ASSERT_EQ(test, rc, 0); + + skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb2); + KUNIT_ASSERT_EQ(test, skb2->len, len); + + cb2 = mctp_cb(skb2); + + /* Received SKB should have the hardware addressing as set above. 
+ * We're likely to have the same actual cb here (ie., cb == cb2), + * but it's the comparison that we care about + */ + KUNIT_EXPECT_EQ(test, cb2->halen, sizeof(haddr)); + KUNIT_EXPECT_MEMEQ(test, cb2->haddr, haddr, sizeof(haddr)); + + kfree_skb(skb2); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); +} + +static void mctp_test_route_gw_lookup(struct kunit *test) +{ + struct mctp_test_route *rt1, *rt2; + struct mctp_dst dst = { 0 }; + struct mctp_test_dev *dev; + int rc; + + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + /* 8 (local) -> 10 (gateway) via 9 (direct) */ + rt1 = mctp_test_create_route_direct(&init_net, dev->mdev, 9, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt1); + rt2 = mctp_test_create_route_gw(&init_net, dev->mdev->net, 10, 9, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt2); + + rc = mctp_route_lookup(&init_net, dev->mdev->net, 10, &dst); + KUNIT_EXPECT_EQ(test, rc, 0); + KUNIT_EXPECT_PTR_EQ(test, dst.dev, dev->mdev); + KUNIT_EXPECT_EQ(test, dst.mtu, dev->ndev->mtu); + KUNIT_EXPECT_EQ(test, dst.nexthop, 9); + KUNIT_EXPECT_EQ(test, dst.halen, 0); + + mctp_dst_release(&dst); + + mctp_test_route_destroy(test, rt2); + mctp_test_route_destroy(test, rt1); + mctp_test_destroy_dev(dev); +} + +static void mctp_test_route_gw_loop(struct kunit *test) +{ + struct mctp_test_route *rt1, *rt2; + struct mctp_dst dst = { 0 }; + struct mctp_test_dev *dev; + int rc; + + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + /* two routes using each other as the gw */ + rt1 = mctp_test_create_route_gw(&init_net, dev->mdev->net, 9, 10, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt1); + rt2 = mctp_test_create_route_gw(&init_net, dev->mdev->net, 10, 9, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt2); + + /* this should fail, rather than infinite-loop */ + rc = mctp_route_lookup(&init_net, dev->mdev->net, 10, &dst); + KUNIT_EXPECT_NE(test, rc, 0); + + mctp_test_route_destroy(test, rt2); + mctp_test_route_destroy(test, rt1); + mctp_test_destroy_dev(dev); +} + +struct mctp_route_gw_mtu_test { + /* working away from the local stack */ + unsigned int dev, neigh, gw, dst; + unsigned int exp; +}; + +static void mctp_route_gw_mtu_to_desc(const struct mctp_route_gw_mtu_test *t, + char *desc) +{ + sprintf(desc, "dev %d, neigh %d, gw %d, dst %d -> %d", + t->dev, t->neigh, t->gw, t->dst, t->exp); +} + +static const struct mctp_route_gw_mtu_test mctp_route_gw_mtu_tests[] = { + /* no route-specific MTUs */ + { 68, 0, 0, 0, 68 }, + { 100, 0, 0, 0, 100 }, + /* one route MTU (smaller than dev mtu), others unrestricted */ + { 100, 68, 0, 0, 68 }, + { 100, 0, 68, 0, 68 }, + { 100, 0, 0, 68, 68 }, + /* smallest applied, regardless of order */ + { 100, 99, 98, 68, 68 }, + { 99, 100, 98, 68, 68 }, + { 98, 99, 100, 68, 68 }, + { 68, 98, 99, 100, 68 }, +}; + +KUNIT_ARRAY_PARAM(mctp_route_gw_mtu, mctp_route_gw_mtu_tests, + mctp_route_gw_mtu_to_desc); + +static void mctp_test_route_gw_mtu(struct kunit *test) +{ + const struct mctp_route_gw_mtu_test *mtus = test->param_value; + struct mctp_test_route *rt1, *rt2, *rt3; + struct mctp_dst dst = { 0 }; + struct mctp_test_dev *dev; + struct mctp_dev *mdev; + unsigned int netid; + int rc; + + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + dev->ndev->mtu = mtus->dev; + mdev = dev->mdev; + netid = mdev->net; + + /* 8 (local) -> 11 (dst) via 10 (gw) via 9 (neigh) */ + rt1 = mctp_test_create_route_direct(&init_net, mdev, 9, mtus->neigh); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt1); + + rt2 = 
mctp_test_create_route_gw(&init_net, netid, 10, 9, mtus->gw); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt2); + + rt3 = mctp_test_create_route_gw(&init_net, netid, 11, 10, mtus->dst); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt3); + + rc = mctp_route_lookup(&init_net, dev->mdev->net, 11, &dst); + KUNIT_EXPECT_EQ(test, rc, 0); + KUNIT_EXPECT_EQ(test, dst.mtu, mtus->exp); + + mctp_dst_release(&dst); + + mctp_test_route_destroy(test, rt3); + mctp_test_route_destroy(test, rt2); + mctp_test_route_destroy(test, rt1); + mctp_test_destroy_dev(dev); +} + +#define MCTP_TEST_LLADDR_LEN 2 +struct mctp_test_llhdr { + unsigned int magic; + unsigned char src[MCTP_TEST_LLADDR_LEN]; + unsigned char dst[MCTP_TEST_LLADDR_LEN]; +}; + +static const unsigned int mctp_test_llhdr_magic = 0x5c78339c; + +static int test_dev_header_create(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) +{ + struct kunit *test = current->kunit_test; + struct mctp_test_llhdr *hdr; + + hdr = skb_push(skb, sizeof(*hdr)); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, hdr); + skb_reset_mac_header(skb); + + hdr->magic = mctp_test_llhdr_magic; + memcpy(&hdr->src, saddr, sizeof(hdr->src)); + memcpy(&hdr->dst, daddr, sizeof(hdr->dst)); + + return 0; +} + +/* Test the dst_output path for a gateway-routed skb: we should have it + * lookup the nexthop EID in the neighbour table, and call into + * header_ops->create to resolve that to a lladdr. Our mock header_ops->create + * will just set a synthetic link-layer header, which we check after transmit. + */ +static void mctp_test_route_gw_output(struct kunit *test) +{ + const unsigned char haddr_self[MCTP_TEST_LLADDR_LEN] = { 0xaa, 0x03 }; + const unsigned char haddr_peer[MCTP_TEST_LLADDR_LEN] = { 0xaa, 0x02 }; + const struct header_ops ops = { + .create = test_dev_header_create, + }; + struct mctp_neigh neigh = { 0 }; + struct mctp_test_llhdr *ll_hdr; + struct mctp_dst dst = { 0 }; + struct mctp_hdr hdr = { 0 }; + struct mctp_test_dev *dev; + struct sk_buff *skb; + unsigned char *buf; + int i, rc; + + dev = mctp_test_create_dev_lladdr(sizeof(haddr_self), haddr_self); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + dev->ndev->header_ops = &ops; + + dst.dev = dev->mdev; + __mctp_dev_get(dst.dev->dev); + dst.mtu = 68; + dst.nexthop = 9; + + /* simple mctp_neigh_add for the gateway (not dest!) 
endpoint */ + INIT_LIST_HEAD(&neigh.list); + neigh.dev = dev->mdev; + mctp_dev_hold(dev->mdev); + neigh.eid = 9; + neigh.source = MCTP_NEIGH_STATIC; + memcpy(neigh.ha, haddr_peer, sizeof(haddr_peer)); + list_add_rcu(&neigh.list, &init_net.mctp.neighbours); + + hdr.ver = 1; + hdr.src = 8; + hdr.dest = 10; + hdr.flags_seq_tag = FL_S | FL_E | FL_TO; + + /* construct enough for a future link-layer header, the provided + * mctp header, and 4 bytes of data + */ + skb = alloc_skb(sizeof(*ll_hdr) + sizeof(hdr) + 4, GFP_KERNEL); + skb->dev = dev->ndev; + __mctp_cb(skb); + + skb_reserve(skb, sizeof(*ll_hdr)); + + memcpy(skb_put(skb, sizeof(hdr)), &hdr, sizeof(hdr)); + buf = skb_put(skb, 4); + for (i = 0; i < 4; i++) + buf[i] = i; + + /* extra ref over the dev_xmit */ + skb_get(skb); + + rc = mctp_dst_output(&dst, skb); + KUNIT_EXPECT_EQ(test, rc, 0); + + mctp_dst_release(&dst); + list_del_rcu(&neigh.list); + mctp_dev_put(dev->mdev); + + /* check that we have our header created with the correct neighbour */ + ll_hdr = (void *)skb_mac_header(skb); + KUNIT_EXPECT_EQ(test, ll_hdr->magic, mctp_test_llhdr_magic); + KUNIT_EXPECT_MEMEQ(test, ll_hdr->src, haddr_self, sizeof(haddr_self)); + KUNIT_EXPECT_MEMEQ(test, ll_hdr->dst, haddr_peer, sizeof(haddr_peer)); + kfree_skb(skb); +} + +struct mctp_bind_lookup_test { + /* header of incoming message */ + struct mctp_hdr hdr; + u8 ty; + /* mctp network of incoming interface (smctp_network) */ + unsigned int net; + + /* expected socket, matches .name in lookup_binds, NULL for dropped */ + const char *expect; +}; + +/* Single-packet TO-set message */ +#define LK(src, dst) RX_HDR(1, (src), (dst), FL_S | FL_E | FL_TO) + +/* Input message test cases for bind lookup tests. + * + * 10 and 11 are local EIDs. + * 20 and 21 are remote EIDs. + */ +static const struct mctp_bind_lookup_test mctp_bind_lookup_tests[] = { + /* both local-eid and remote-eid binds, remote eid is preferenced */ + { .hdr = LK(20, 10), .ty = 1, .net = 1, .expect = "remote20" }, + + { .hdr = LK(20, 255), .ty = 1, .net = 1, .expect = "remote20" }, + { .hdr = LK(20, 0), .ty = 1, .net = 1, .expect = "remote20" }, + { .hdr = LK(0, 255), .ty = 1, .net = 1, .expect = "any" }, + { .hdr = LK(0, 11), .ty = 1, .net = 1, .expect = "any" }, + { .hdr = LK(0, 0), .ty = 1, .net = 1, .expect = "any" }, + { .hdr = LK(0, 10), .ty = 1, .net = 1, .expect = "local10" }, + { .hdr = LK(21, 10), .ty = 1, .net = 1, .expect = "local10" }, + { .hdr = LK(21, 11), .ty = 1, .net = 1, .expect = "remote21local11" }, + + /* both src and dest set to eid=99. unusual, but accepted + * by MCTP stack currently. 
+ */ + { .hdr = LK(99, 99), .ty = 1, .net = 1, .expect = "any" }, + + /* unbound smctp_type */ + { .hdr = LK(20, 10), .ty = 3, .net = 1, .expect = NULL }, + + /* smctp_network tests */ + + { .hdr = LK(0, 0), .ty = 1, .net = 7, .expect = "any" }, + { .hdr = LK(21, 10), .ty = 1, .net = 2, .expect = "any" }, + + /* remote EID 20 matches, but MCTP_NET_ANY in "remote20" resolved + * to net=1, so lookup doesn't match "remote20" + */ + { .hdr = LK(20, 10), .ty = 1, .net = 3, .expect = "any" }, + + { .hdr = LK(21, 10), .ty = 1, .net = 3, .expect = "remote21net3" }, + { .hdr = LK(21, 10), .ty = 1, .net = 4, .expect = "remote21net4" }, + { .hdr = LK(21, 10), .ty = 1, .net = 5, .expect = "remote21net5" }, + + { .hdr = LK(21, 10), .ty = 1, .net = 5, .expect = "remote21net5" }, + + { .hdr = LK(99, 10), .ty = 1, .net = 8, .expect = "local10net8" }, + + { .hdr = LK(99, 10), .ty = 1, .net = 9, .expect = "anynet9" }, + { .hdr = LK(0, 0), .ty = 1, .net = 9, .expect = "anynet9" }, + { .hdr = LK(99, 99), .ty = 1, .net = 9, .expect = "anynet9" }, + { .hdr = LK(20, 10), .ty = 1, .net = 9, .expect = "anynet9" }, +}; + +/* Binds to create during the lookup tests */ +static const struct mctp_test_bind_setup lookup_binds[] = { + /* any address and net, type 1 */ + { .name = "any", .bind_addr = MCTP_ADDR_ANY, + .bind_net = MCTP_NET_ANY, .bind_type = 1, }, + /* local eid 10, net 1 (resolved from MCTP_NET_ANY) */ + { .name = "local10", .bind_addr = 10, + .bind_net = MCTP_NET_ANY, .bind_type = 1, }, + /* local eid 10, net 8 */ + { .name = "local10net8", .bind_addr = 10, + .bind_net = 8, .bind_type = 1, }, + /* any EID, net 9 */ + { .name = "anynet9", .bind_addr = MCTP_ADDR_ANY, + .bind_net = 9, .bind_type = 1, }, + + /* remote eid 20, net 1, any local eid */ + { .name = "remote20", .bind_addr = MCTP_ADDR_ANY, + .bind_net = MCTP_NET_ANY, .bind_type = 1, + .have_peer = true, .peer_addr = 20, .peer_net = MCTP_NET_ANY, }, + + /* remote eid 20, net 1, local eid 11 */ + { .name = "remote21local11", .bind_addr = 11, + .bind_net = MCTP_NET_ANY, .bind_type = 1, + .have_peer = true, .peer_addr = 21, .peer_net = MCTP_NET_ANY, }, + + /* remote eid 21, specific net=3 for connect() */ + { .name = "remote21net3", .bind_addr = MCTP_ADDR_ANY, + .bind_net = MCTP_NET_ANY, .bind_type = 1, + .have_peer = true, .peer_addr = 21, .peer_net = 3, }, + + /* remote eid 21, net 4 for bind, specific net=4 for connect() */ + { .name = "remote21net4", .bind_addr = MCTP_ADDR_ANY, + .bind_net = 4, .bind_type = 1, + .have_peer = true, .peer_addr = 21, .peer_net = 4, }, + + /* remote eid 21, net 5 for bind, specific net=5 for connect() */ + { .name = "remote21net5", .bind_addr = MCTP_ADDR_ANY, + .bind_net = 5, .bind_type = 1, + .have_peer = true, .peer_addr = 21, .peer_net = 5, }, +}; + +static void mctp_bind_lookup_desc(const struct mctp_bind_lookup_test *t, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, + "{src %d dst %d ty %d net %d expect %s}", + t->hdr.src, t->hdr.dest, t->ty, t->net, t->expect); +} + +KUNIT_ARRAY_PARAM(mctp_bind_lookup, mctp_bind_lookup_tests, + mctp_bind_lookup_desc); + +static void mctp_test_bind_lookup(struct kunit *test) +{ + const struct mctp_bind_lookup_test *rx; + struct socket *socks[ARRAY_SIZE(lookup_binds)]; + struct sk_buff *skb_pkt = NULL, *skb_sock = NULL; + struct socket *sock_ty0, *sock_expect = NULL; + struct mctp_test_pktqueue tpq; + struct mctp_test_dev *dev; + struct mctp_dst dst; + int rc; + + rx = test->param_value; + + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock_ty0, rx->net); + /* Create 
all binds */ + for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++) { + mctp_test_bind_run(test, &lookup_binds[i], + &rc, &socks[i]); + KUNIT_ASSERT_EQ(test, rc, 0); + + /* Record the expected receive socket */ + if (rx->expect && + strcmp(rx->expect, lookup_binds[i].name) == 0) { + KUNIT_ASSERT_NULL(test, sock_expect); + sock_expect = socks[i]; + } + } + KUNIT_ASSERT_EQ(test, !!sock_expect, !!rx->expect); + + /* Create test message */ + skb_pkt = mctp_test_create_skb_data(&rx->hdr, &rx->ty); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb_pkt); + mctp_test_skb_set_dev(skb_pkt, dev); + mctp_test_pktqueue_init(&tpq); + + rc = mctp_dst_input(&dst, skb_pkt); + if (rx->expect) { + /* Test the message is received on the expected socket */ + KUNIT_EXPECT_EQ(test, rc, 0); + skb_sock = skb_recv_datagram(sock_expect->sk, + MSG_DONTWAIT, &rc); + if (!skb_sock) { + /* Find which socket received it instead */ + for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++) { + skb_sock = skb_recv_datagram(socks[i]->sk, + MSG_DONTWAIT, &rc); + if (skb_sock) { + KUNIT_FAIL(test, + "received on incorrect socket '%s', expect '%s'", + lookup_binds[i].name, + rx->expect); + goto cleanup; + } + } + KUNIT_FAIL(test, "no message received"); + } + } else { + KUNIT_EXPECT_NE(test, rc, 0); + } + +cleanup: + kfree_skb(skb_sock); + kfree_skb(skb_pkt); + + /* Drop all binds */ + for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++) + sock_release(socks[i]); + + __mctp_route_test_fini(test, dev, &dst, &tpq, sock_ty0); +} + static struct kunit_case mctp_test_cases[] = { KUNIT_CASE_PARAM(mctp_test_fragment, mctp_frag_gen_params), KUNIT_CASE_PARAM(mctp_test_rx_input, mctp_rx_input_gen_params), @@ -1253,11 +1611,17 @@ static struct kunit_case mctp_test_cases[] = { KUNIT_CASE(mctp_test_fragment_flow), KUNIT_CASE(mctp_test_route_output_key_create), KUNIT_CASE(mctp_test_route_input_cloned_frag), + KUNIT_CASE(mctp_test_route_extaddr_input), + KUNIT_CASE(mctp_test_route_gw_lookup), + KUNIT_CASE(mctp_test_route_gw_loop), + KUNIT_CASE_PARAM(mctp_test_route_gw_mtu, mctp_route_gw_mtu_gen_params), + KUNIT_CASE(mctp_test_route_gw_output), + KUNIT_CASE_PARAM(mctp_test_bind_lookup, mctp_bind_lookup_gen_params), {} }; static struct kunit_suite mctp_test_suite = { - .name = "mctp", + .name = "mctp-route", .test_cases = mctp_test_cases, }; diff --git a/net/mctp/test/sock-test.c b/net/mctp/test/sock-test.c new file mode 100644 index 0000000000000..b0942deb50198 --- /dev/null +++ b/net/mctp/test/sock-test.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <kunit/static_stub.h> +#include <kunit/test.h> + +#include <linux/socket.h> +#include <linux/spinlock.h> + +#include "utils.h" + +static const u8 dev_default_lladdr[] = { 0x01, 0x02 }; + +/* helper for simple sock setup: single device, with dev_default_lladdr as its + * hardware address, assigned with a local EID 8, and a route to EID 9 + */ +static void __mctp_sock_test_init(struct kunit *test, + struct mctp_test_dev **devp, + struct mctp_test_route **rtp, + struct socket **sockp) +{ + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + unsigned long flags; + u8 *addrs; + int rc; + + dev = mctp_test_create_dev_lladdr(sizeof(dev_default_lladdr), + dev_default_lladdr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + addrs = kmalloc(1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, addrs); + addrs[0] = 8; + + spin_lock_irqsave(&dev->mdev->addrs_lock, flags); + dev->mdev->num_addrs = 1; + swap(addrs, dev->mdev->addrs); + 
spin_unlock_irqrestore(&dev->mdev->addrs_lock, flags); + + kfree(addrs); + + rt = mctp_test_create_route_direct(dev_net(dev->ndev), dev->mdev, 9, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + + rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); + KUNIT_ASSERT_EQ(test, rc, 0); + + *devp = dev; + *rtp = rt; + *sockp = sock; +} + +static void __mctp_sock_test_fini(struct kunit *test, + struct mctp_test_dev *dev, + struct mctp_test_route *rt, + struct socket *sock) +{ + sock_release(sock); + mctp_test_route_destroy(test, rt); + mctp_test_destroy_dev(dev); +} + +struct mctp_test_sock_local_output_config { + struct mctp_test_dev *dev; + size_t halen; + u8 haddr[MAX_ADDR_LEN]; + bool invoked; + int rc; +}; + +static int mctp_test_sock_local_output(struct sock *sk, + struct mctp_dst *dst, + struct sk_buff *skb, + mctp_eid_t daddr, u8 req_tag) +{ + struct kunit *test = kunit_get_current_test(); + struct mctp_test_sock_local_output_config *cfg = test->priv; + + KUNIT_EXPECT_PTR_EQ(test, dst->dev, cfg->dev->mdev); + KUNIT_EXPECT_EQ(test, dst->halen, cfg->halen); + KUNIT_EXPECT_MEMEQ(test, dst->haddr, cfg->haddr, dst->halen); + + cfg->invoked = true; + + kfree_skb(skb); + + return cfg->rc; +} + +static void mctp_test_sock_sendmsg_extaddr(struct kunit *test) +{ + struct sockaddr_mctp_ext addr = { + .smctp_base = { + .smctp_family = AF_MCTP, + .smctp_tag = MCTP_TAG_OWNER, + .smctp_network = MCTP_NET_ANY, + }, + }; + struct mctp_test_sock_local_output_config cfg = { 0 }; + u8 haddr[] = { 0xaa, 0x01 }; + u8 buf[4] = { 0, 1, 2, 3 }; + struct mctp_test_route *rt; + struct msghdr msg = { 0 }; + struct mctp_test_dev *dev; + struct mctp_sock *msk; + struct socket *sock; + ssize_t send_len; + struct kvec vec = { + .iov_base = buf, + .iov_len = sizeof(buf), + }; + + __mctp_sock_test_init(test, &dev, &rt, &sock); + + /* Expect to see the dst configured up with the addressing data we + * provide in the struct sockaddr_mctp_ext + */ + cfg.dev = dev; + cfg.halen = sizeof(haddr); + memcpy(cfg.haddr, haddr, sizeof(haddr)); + + test->priv = &cfg; + + kunit_activate_static_stub(test, mctp_local_output, + mctp_test_sock_local_output); + + /* enable and configure direct addressing */ + msk = container_of(sock->sk, struct mctp_sock, sk); + msk->addr_ext = true; + + addr.smctp_ifindex = dev->ndev->ifindex; + addr.smctp_halen = sizeof(haddr); + memcpy(addr.smctp_haddr, haddr, sizeof(haddr)); + + msg.msg_name = &addr; + msg.msg_namelen = sizeof(addr); + + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &vec, 1, sizeof(buf)); + send_len = mctp_sendmsg(sock, &msg, sizeof(buf)); + KUNIT_EXPECT_EQ(test, send_len, sizeof(buf)); + KUNIT_EXPECT_TRUE(test, cfg.invoked); + + __mctp_sock_test_fini(test, dev, rt, sock); +} + +static void mctp_test_sock_recvmsg_extaddr(struct kunit *test) +{ + struct sockaddr_mctp_ext recv_addr = { 0 }; + u8 rcv_buf[1], rcv_data[] = { 0, 1 }; + u8 haddr[] = { 0xaa, 0x02 }; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct mctp_skb_cb *cb; + struct mctp_sock *msk; + struct sk_buff *skb; + struct mctp_hdr hdr; + struct socket *sock; + struct msghdr msg; + ssize_t recv_len; + int rc; + struct kvec vec = { + .iov_base = rcv_buf, + .iov_len = sizeof(rcv_buf), + }; + + __mctp_sock_test_init(test, &dev, &rt, &sock); + + /* enable extended addressing on recv */ + msk = container_of(sock->sk, struct mctp_sock, sk); + msk->addr_ext = true; + + /* base incoming header, using a nul-EID dest */ + hdr.ver = 1; + hdr.dest = 0; + hdr.src = 9; + hdr.flags_seq_tag = MCTP_HDR_FLAG_SOM | 
MCTP_HDR_FLAG_EOM | + MCTP_HDR_FLAG_TO; + + skb = mctp_test_create_skb_data(&hdr, &rcv_data); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + mctp_test_skb_set_dev(skb, dev); + + /* set incoming extended address data */ + cb = mctp_cb(skb); + cb->halen = sizeof(haddr); + cb->ifindex = dev->ndev->ifindex; + memcpy(cb->haddr, haddr, sizeof(haddr)); + + /* Deliver to socket. The route input path pulls the network header, + * leaving skb data at type byte onwards. recvmsg will consume the + * type for addr.smctp_type + */ + skb_pull(skb, sizeof(hdr)); + rc = sock_queue_rcv_skb(sock->sk, skb); + KUNIT_ASSERT_EQ(test, rc, 0); + + msg.msg_name = &recv_addr; + msg.msg_namelen = sizeof(recv_addr); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, sizeof(rcv_buf)); + + recv_len = mctp_recvmsg(sock, &msg, sizeof(rcv_buf), + MSG_DONTWAIT | MSG_TRUNC); + + KUNIT_EXPECT_EQ(test, recv_len, sizeof(rcv_buf)); + + /* expect our extended address to be populated from hdr and cb */ + KUNIT_EXPECT_EQ(test, msg.msg_namelen, sizeof(recv_addr)); + KUNIT_EXPECT_EQ(test, recv_addr.smctp_base.smctp_family, AF_MCTP); + KUNIT_EXPECT_EQ(test, recv_addr.smctp_ifindex, dev->ndev->ifindex); + KUNIT_EXPECT_EQ(test, recv_addr.smctp_halen, sizeof(haddr)); + KUNIT_EXPECT_MEMEQ(test, recv_addr.smctp_haddr, haddr, sizeof(haddr)); + + __mctp_sock_test_fini(test, dev, rt, sock); +} + +static const struct mctp_test_bind_setup bind_addrany_netdefault_type1 = { + .bind_addr = MCTP_ADDR_ANY, .bind_net = MCTP_NET_ANY, .bind_type = 1, +}; + +static const struct mctp_test_bind_setup bind_addrany_net2_type1 = { + .bind_addr = MCTP_ADDR_ANY, .bind_net = 2, .bind_type = 1, +}; + +/* 1 is default net */ +static const struct mctp_test_bind_setup bind_addr8_net1_type1 = { + .bind_addr = 8, .bind_net = 1, .bind_type = 1, +}; + +static const struct mctp_test_bind_setup bind_addrany_net1_type1 = { + .bind_addr = MCTP_ADDR_ANY, .bind_net = 1, .bind_type = 1, +}; + +/* 2 is an arbitrary net */ +static const struct mctp_test_bind_setup bind_addr8_net2_type1 = { + .bind_addr = 8, .bind_net = 2, .bind_type = 1, +}; + +static const struct mctp_test_bind_setup bind_addr8_netdefault_type1 = { + .bind_addr = 8, .bind_net = MCTP_NET_ANY, .bind_type = 1, +}; + +static const struct mctp_test_bind_setup bind_addrany_net2_type2 = { + .bind_addr = MCTP_ADDR_ANY, .bind_net = 2, .bind_type = 2, +}; + +static const struct mctp_test_bind_setup bind_addrany_net2_type1_peer9 = { + .bind_addr = MCTP_ADDR_ANY, .bind_net = 2, .bind_type = 1, + .have_peer = true, .peer_addr = 9, .peer_net = 2, +}; + +struct mctp_bind_pair_test { + const struct mctp_test_bind_setup *bind1; + const struct mctp_test_bind_setup *bind2; + int error; +}; + +/* Pairs of binds and whether they will conflict */ +static const struct mctp_bind_pair_test mctp_bind_pair_tests[] = { + /* Both ADDR_ANY, conflict */ + { &bind_addrany_netdefault_type1, &bind_addrany_netdefault_type1, + EADDRINUSE }, + /* Same specific EID, conflict */ + { &bind_addr8_netdefault_type1, &bind_addr8_netdefault_type1, + EADDRINUSE }, + /* ADDR_ANY vs specific EID, OK */ + { &bind_addrany_netdefault_type1, &bind_addr8_netdefault_type1, 0 }, + /* ADDR_ANY different types, OK */ + { &bind_addrany_net2_type2, &bind_addrany_net2_type1, 0 }, + /* ADDR_ANY different nets, OK */ + { &bind_addrany_net2_type1, &bind_addrany_netdefault_type1, 0 }, + + /* specific EID, NET_ANY (resolves to default) + * vs specific EID, explicit default net 1, conflict + */ + { &bind_addr8_netdefault_type1, &bind_addr8_net1_type1, EADDRINUSE }, + + /* 
specific EID, net 1 vs specific EID, net 2, ok */ + { &bind_addr8_net1_type1, &bind_addr8_net2_type1, 0 }, + + /* ANY_ADDR, NET_ANY (doesn't resolve to default) + * vs ADDR_ANY, explicit default net 1, OK + */ + { &bind_addrany_netdefault_type1, &bind_addrany_net1_type1, 0 }, + + /* specific remote peer doesn't conflict with any-peer bind */ + { &bind_addrany_net2_type1_peer9, &bind_addrany_net2_type1, 0 }, + + /* bind() NET_ANY is allowed with a connect() net */ + { &bind_addrany_net2_type1_peer9, &bind_addrany_netdefault_type1, 0 }, +}; + +static void mctp_bind_pair_desc(const struct mctp_bind_pair_test *t, char *desc) +{ + char peer1[25] = {0}, peer2[25] = {0}; + + if (t->bind1->have_peer) + snprintf(peer1, sizeof(peer1), ", peer %d net %d", + t->bind1->peer_addr, t->bind1->peer_net); + if (t->bind2->have_peer) + snprintf(peer2, sizeof(peer2), ", peer %d net %d", + t->bind2->peer_addr, t->bind2->peer_net); + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, + "{bind(addr %d, type %d, net %d%s)} {bind(addr %d, type %d, net %d%s)} -> error %d", + t->bind1->bind_addr, t->bind1->bind_type, + t->bind1->bind_net, peer1, + t->bind2->bind_addr, t->bind2->bind_type, + t->bind2->bind_net, peer2, t->error); +} + +KUNIT_ARRAY_PARAM(mctp_bind_pair, mctp_bind_pair_tests, mctp_bind_pair_desc); + +static void mctp_test_bind_invalid(struct kunit *test) +{ + struct socket *sock; + int rc; + + /* bind() fails if the bind() vs connect() networks mismatch. */ + const struct mctp_test_bind_setup bind_connect_net_mismatch = { + .bind_addr = MCTP_ADDR_ANY, .bind_net = 1, .bind_type = 1, + .have_peer = true, .peer_addr = 9, .peer_net = 2, + }; + mctp_test_bind_run(test, &bind_connect_net_mismatch, &rc, &sock); + KUNIT_EXPECT_EQ(test, -rc, EINVAL); + sock_release(sock); +} + +static int +mctp_test_bind_conflicts_inner(struct kunit *test, + const struct mctp_test_bind_setup *bind1, + const struct mctp_test_bind_setup *bind2) +{ + struct socket *sock1 = NULL, *sock2 = NULL, *sock3 = NULL; + int bind_errno; + + /* Bind to first address, always succeeds */ + mctp_test_bind_run(test, bind1, &bind_errno, &sock1); + KUNIT_EXPECT_EQ(test, bind_errno, 0); + + /* A second identical bind always fails */ + mctp_test_bind_run(test, bind1, &bind_errno, &sock2); + KUNIT_EXPECT_EQ(test, -bind_errno, EADDRINUSE); + + /* A different bind, result is returned */ + mctp_test_bind_run(test, bind2, &bind_errno, &sock3); + + if (sock1) + sock_release(sock1); + if (sock2) + sock_release(sock2); + if (sock3) + sock_release(sock3); + + return bind_errno; +} + +static void mctp_test_bind_conflicts(struct kunit *test) +{ + const struct mctp_bind_pair_test *pair; + int bind_errno; + + pair = test->param_value; + + bind_errno = + mctp_test_bind_conflicts_inner(test, pair->bind1, pair->bind2); + KUNIT_EXPECT_EQ(test, -bind_errno, pair->error); + + /* swapping the calls, the second bind should still fail */ + bind_errno = + mctp_test_bind_conflicts_inner(test, pair->bind2, pair->bind1); + KUNIT_EXPECT_EQ(test, -bind_errno, pair->error); +} + +static void mctp_test_assumptions(struct kunit *test) +{ + /* check assumption of default net from bind_addr8_net1_type1 */ + KUNIT_ASSERT_EQ(test, mctp_default_net(&init_net), 1); +} + +static struct kunit_case mctp_test_cases[] = { + KUNIT_CASE(mctp_test_assumptions), + KUNIT_CASE(mctp_test_sock_sendmsg_extaddr), + KUNIT_CASE(mctp_test_sock_recvmsg_extaddr), + KUNIT_CASE_PARAM(mctp_test_bind_conflicts, mctp_bind_pair_gen_params), + KUNIT_CASE(mctp_test_bind_invalid), + {} +}; + +static struct kunit_suite 
mctp_test_suite = { + .name = "mctp-sock", + .test_cases = mctp_test_cases, +}; + +kunit_test_suite(mctp_test_suite); diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index 565763eb02114..953d419027718 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -26,19 +26,22 @@ static void mctp_test_dev_setup(struct net_device *ndev) ndev->type = ARPHRD_MCTP; ndev->mtu = MCTP_DEV_TEST_MTU; ndev->hard_header_len = 0; - ndev->addr_len = 0; ndev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; ndev->flags = IFF_NOARP; ndev->netdev_ops = &mctp_test_netdev_ops; ndev->needs_free_netdev = true; } -struct mctp_test_dev *mctp_test_create_dev(void) +static struct mctp_test_dev *__mctp_test_create_dev(unsigned short lladdr_len, + const unsigned char *lladdr) { struct mctp_test_dev *dev; struct net_device *ndev; int rc; + if (WARN_ON(lladdr_len > MAX_ADDR_LEN)) + return NULL; + ndev = alloc_netdev(sizeof(*dev), "mctptest%d", NET_NAME_ENUM, mctp_test_dev_setup); if (!ndev) @@ -46,6 +49,8 @@ struct mctp_test_dev *mctp_test_create_dev(void) dev = netdev_priv(ndev); dev->ndev = ndev; + ndev->addr_len = lladdr_len; + dev_addr_set(ndev, lladdr); rc = register_netdev(ndev); if (rc) { @@ -61,8 +66,231 @@ struct mctp_test_dev *mctp_test_create_dev(void) return dev; } +struct mctp_test_dev *mctp_test_create_dev(void) +{ + return __mctp_test_create_dev(0, NULL); +} + +struct mctp_test_dev *mctp_test_create_dev_lladdr(unsigned short lladdr_len, + const unsigned char *lladdr) +{ + return __mctp_test_create_dev(lladdr_len, lladdr); +} + void mctp_test_destroy_dev(struct mctp_test_dev *dev) { mctp_dev_put(dev->mdev); unregister_netdev(dev->ndev); } + +static const unsigned int test_pktqueue_magic = 0x5f713aef; + +void mctp_test_pktqueue_init(struct mctp_test_pktqueue *tpq) +{ + tpq->magic = test_pktqueue_magic; + skb_queue_head_init(&tpq->pkts); +} + +static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb) +{ + struct kunit *test = current->kunit_test; + struct mctp_test_pktqueue *tpq = test->priv; + + KUNIT_ASSERT_EQ(test, tpq->magic, test_pktqueue_magic); + + skb_queue_tail(&tpq->pkts, skb); + + return 0; +} + +/* local version of mctp_route_alloc() */ +static struct mctp_test_route *mctp_route_test_alloc(void) +{ + struct mctp_test_route *rt; + + rt = kzalloc(sizeof(*rt), GFP_KERNEL); + if (!rt) + return NULL; + + INIT_LIST_HEAD(&rt->rt.list); + refcount_set(&rt->rt.refs, 1); + rt->rt.output = mctp_test_dst_output; + + return rt; +} + +struct mctp_test_route *mctp_test_create_route_direct(struct net *net, + struct mctp_dev *dev, + mctp_eid_t eid, + unsigned int mtu) +{ + struct mctp_test_route *rt; + + rt = mctp_route_test_alloc(); + if (!rt) + return NULL; + + rt->rt.min = eid; + rt->rt.max = eid; + rt->rt.mtu = mtu; + rt->rt.type = RTN_UNSPEC; + rt->rt.dst_type = MCTP_ROUTE_DIRECT; + if (dev) + mctp_dev_hold(dev); + rt->rt.dev = dev; + + list_add_rcu(&rt->rt.list, &net->mctp.routes); + + return rt; +} + +struct mctp_test_route *mctp_test_create_route_gw(struct net *net, + unsigned int netid, + mctp_eid_t eid, + mctp_eid_t gw, + unsigned int mtu) +{ + struct mctp_test_route *rt; + + rt = mctp_route_test_alloc(); + if (!rt) + return NULL; + + rt->rt.min = eid; + rt->rt.max = eid; + rt->rt.mtu = mtu; + rt->rt.type = RTN_UNSPEC; + rt->rt.dst_type = MCTP_ROUTE_GATEWAY; + rt->rt.gateway.eid = gw; + rt->rt.gateway.net = netid; + + list_add_rcu(&rt->rt.list, &net->mctp.routes); + + return rt; +} + +/* Convenience function for our test dst; release with mctp_test_dst_release() + */ +void 
mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, + struct mctp_test_dev *dev, + struct mctp_test_pktqueue *tpq, unsigned int mtu) +{ + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); + + memset(dst, 0, sizeof(*dst)); + + dst->dev = dev->mdev; + __mctp_dev_get(dst->dev->dev); + dst->mtu = mtu; + dst->output = mctp_test_dst_output; + mctp_test_pktqueue_init(tpq); + test->priv = tpq; +} + +void mctp_test_dst_release(struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq) +{ + mctp_dst_release(dst); + skb_queue_purge(&tpq->pkts); +} + +void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt) +{ + unsigned int refs; + + rtnl_lock(); + list_del_rcu(&rt->rt.list); + rtnl_unlock(); + + if (rt->rt.dst_type == MCTP_ROUTE_DIRECT && rt->rt.dev) + mctp_dev_put(rt->rt.dev); + + refs = refcount_read(&rt->rt.refs); + KUNIT_ASSERT_EQ_MSG(test, refs, 1, "route ref imbalance"); + + kfree_rcu(&rt->rt, rcu); +} + +void mctp_test_skb_set_dev(struct sk_buff *skb, struct mctp_test_dev *dev) +{ + struct mctp_skb_cb *cb; + + cb = mctp_cb(skb); + cb->net = READ_ONCE(dev->mdev->net); + skb->dev = dev->ndev; +} + +struct sk_buff *mctp_test_create_skb(const struct mctp_hdr *hdr, + unsigned int data_len) +{ + size_t hdr_len = sizeof(*hdr); + struct sk_buff *skb; + unsigned int i; + u8 *buf; + + skb = alloc_skb(hdr_len + data_len, GFP_KERNEL); + if (!skb) + return NULL; + + __mctp_cb(skb); + memcpy(skb_put(skb, hdr_len), hdr, hdr_len); + + buf = skb_put(skb, data_len); + for (i = 0; i < data_len; i++) + buf[i] = i & 0xff; + + return skb; +} + +struct sk_buff *__mctp_test_create_skb_data(const struct mctp_hdr *hdr, + const void *data, size_t data_len) +{ + size_t hdr_len = sizeof(*hdr); + struct sk_buff *skb; + + skb = alloc_skb(hdr_len + data_len, GFP_KERNEL); + if (!skb) + return NULL; + + __mctp_cb(skb); + memcpy(skb_put(skb, hdr_len), hdr, hdr_len); + memcpy(skb_put(skb, data_len), data, data_len); + + return skb; +} + +void mctp_test_bind_run(struct kunit *test, + const struct mctp_test_bind_setup *setup, + int *ret_bind_errno, struct socket **sock) +{ + struct sockaddr_mctp addr; + int rc; + + *ret_bind_errno = -EIO; + + rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, sock); + KUNIT_ASSERT_EQ(test, rc, 0); + + /* connect() if requested */ + if (setup->have_peer) { + memset(&addr, 0x0, sizeof(addr)); + addr.smctp_family = AF_MCTP; + addr.smctp_network = setup->peer_net; + addr.smctp_addr.s_addr = setup->peer_addr; + /* connect() type must match bind() type */ + addr.smctp_type = setup->bind_type; + rc = kernel_connect(*sock, (struct sockaddr *)&addr, + sizeof(addr), 0); + KUNIT_EXPECT_EQ(test, rc, 0); + } + + /* bind() */ + memset(&addr, 0x0, sizeof(addr)); + addr.smctp_family = AF_MCTP; + addr.smctp_network = setup->bind_net; + addr.smctp_addr.s_addr = setup->bind_addr; + addr.smctp_type = setup->bind_type; + + *ret_bind_errno = + kernel_bind(*sock, (struct sockaddr *)&addr, sizeof(addr)); +} diff --git a/net/mctp/test/utils.h b/net/mctp/test/utils.h index df6aa1c034409..06bdb6cb5eff6 100644 --- a/net/mctp/test/utils.h +++ b/net/mctp/test/utils.h @@ -3,6 +3,11 @@ #ifndef __NET_MCTP_TEST_UTILS_H #define __NET_MCTP_TEST_UTILS_H +#include <uapi/linux/netdevice.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> + #include <kunit/test.h> #define MCTP_DEV_TEST_MTU 68 @@ -10,11 +15,67 @@ struct mctp_test_dev { struct net_device *ndev; struct mctp_dev *mdev; + + unsigned short lladdr_len; + unsigned char lladdr[MAX_ADDR_LEN]; }; struct mctp_test_dev; +struct mctp_test_route { + 
struct mctp_route rt; +}; + +struct mctp_test_pktqueue { + unsigned int magic; + struct sk_buff_head pkts; +}; + +struct mctp_test_bind_setup { + mctp_eid_t bind_addr; + int bind_net; + u8 bind_type; + + bool have_peer; + mctp_eid_t peer_addr; + int peer_net; + + /* optional name. Used for comparison in "lookup" tests */ + const char *name; +}; + struct mctp_test_dev *mctp_test_create_dev(void); +struct mctp_test_dev *mctp_test_create_dev_lladdr(unsigned short lladdr_len, + const unsigned char *lladdr); void mctp_test_destroy_dev(struct mctp_test_dev *dev); +struct mctp_test_route *mctp_test_create_route_direct(struct net *net, + struct mctp_dev *dev, + mctp_eid_t eid, + unsigned int mtu); +struct mctp_test_route *mctp_test_create_route_gw(struct net *net, + unsigned int netid, + mctp_eid_t eid, + mctp_eid_t gw, + unsigned int mtu); +void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, + struct mctp_test_dev *dev, + struct mctp_test_pktqueue *tpq, unsigned int mtu); +void mctp_test_dst_release(struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq); +void mctp_test_pktqueue_init(struct mctp_test_pktqueue *tpq); +void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt); +void mctp_test_skb_set_dev(struct sk_buff *skb, struct mctp_test_dev *dev); +struct sk_buff *mctp_test_create_skb(const struct mctp_hdr *hdr, + unsigned int data_len); +struct sk_buff *__mctp_test_create_skb_data(const struct mctp_hdr *hdr, + const void *data, size_t data_len); + +#define mctp_test_create_skb_data(h, d) \ + __mctp_test_create_skb_data(h, d, sizeof(*d)) + +void mctp_test_bind_run(struct kunit *test, + const struct mctp_test_bind_setup *setup, + int *ret_bind_errno, struct socket **sock); + #endif /* __NET_MCTP_TEST_UTILS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 1f63b32d76d67..25c88cba5c48b 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -81,8 +81,8 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) if (index < net->mpls.platform_labels) { struct mpls_route __rcu **platform_label = - rcu_dereference(net->mpls.platform_label); - rt = rcu_dereference(platform_label[index]); + rcu_dereference_rtnl(net->mpls.platform_label); + rt = rcu_dereference_rtnl(platform_label[index]); } return rt; } @@ -706,7 +706,7 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, } else { unsigned int flags; - flags = dev_get_flags(dev); + flags = netif_get_flags(dev); if (!(flags & (IFF_RUNNING | IFF_LOWER_UP))) nh->nh_flags |= RTNH_F_LINKDOWN; } @@ -1616,14 +1616,14 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, return notifier_from_errno(err); break; case NETDEV_UP: - flags = dev_get_flags(dev); + flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); else mpls_ifup(dev, RTNH_F_DEAD); break; case NETDEV_CHANGE: - flags = dev_get_flags(dev); + flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) { mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); } else { @@ -2095,12 +2095,12 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct rtmsg *rtm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); return -EINVAL; } - rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_scope || 
rtm->rtm_type || rtm->rtm_flags) { @@ -2288,7 +2288,8 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -2298,7 +2299,6 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index d9290c5bb6c79..fed40dae5583a 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -533,9 +533,9 @@ void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; if (timeouts == to_max || (timeouts < to_max && expired)) { - MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); subflow->mpc_drop = 1; - mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); + mptcp_early_fallback(mptcp_sk(subflow->conn), subflow, + MPTCP_MIB_MPCAPABLEACTIVEDROP); } } diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 19eb9292bd609..cf879c188ca26 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -28,6 +28,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), + SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED), SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR), SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), @@ -79,6 +80,11 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB), SNMP_MIB_ITEM("Blackhole", MPTCP_MIB_BLACKHOLE), + SNMP_MIB_ITEM("MPCapableDataFallback", MPTCP_MIB_MPCAPABLEDATAFALLBACK), + SNMP_MIB_ITEM("MD5SigFallback", MPTCP_MIB_MD5SIGFALLBACK), + SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), + SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), + SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), SNMP_MIB_SENTINEL }; diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 128282982843a..309bac6fea325 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -23,6 +23,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ + MPTCP_MIB_JOINREJECTED, /* The PM rejected the JOIN request */ MPTCP_MIB_JOINSYNTX, /* Sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXCREATSKERR, /* Not able to create a socket when sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */ @@ -80,6 +81,13 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */ MPTCP_MIB_CURRESTAB, /* Current established MPTCP connections */ MPTCP_MIB_BLACKHOLE, /* A blackhole has been detected */ + MPTCP_MIB_MPCAPABLEDATAFALLBACK, /* Missing DSS/MPC+data on first + * established packet + */ + MPTCP_MIB_MD5SIGFALLBACK, /* Conflicting TCP option enabled */ + 
MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */ + MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */ + MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 421ced0312890..70c0ab0ecf905 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -978,8 +978,10 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (subflow->mp_join) goto reset; subflow->mp_capable = 0; - pr_fallback(msk); - mptcp_do_fallback(ssk); + if (!mptcp_try_fallback(ssk, MPTCP_MIB_MPCAPABLEDATAFALLBACK)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_FALLBACKFAILED); + goto reset; + } return false; } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 31747f974941f..420d416e2603d 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -151,10 +151,13 @@ bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; + bool ret; entry = mptcp_pm_del_add_timer(msk, addr, false); + ret = entry; kfree(entry); - return entry; + + return ret; } bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) @@ -267,7 +270,8 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, static void mptcp_pm_add_timer(struct timer_list *timer) { - struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); + struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer, + add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; @@ -761,8 +765,14 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) pr_debug("fail_seq=%llu\n", fail_seq); - if (!READ_ONCE(msk->allow_infinite_fallback)) + /* After accepting the fail, we can't create any other subflows */ + spin_lock_bh(&msk->fallback_lock); + if (!msk->allow_infinite_fallback) { + spin_unlock_bh(&msk->fallback_lock); return; + } + msk->allow_subflows = false; + spin_unlock_bh(&msk->fallback_lock); if (!subflow->fail_tout) { pr_debug("send MP_FAIL response and infinite map\n"); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 44f7ab463d755..9a287b75c1b31 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -11,6 +11,7 @@ #include <linux/netdevice.h> #include <linux/sched/signal.h> #include <linux/atomic.h> +#include <net/aligned_data.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> @@ -46,7 +47,9 @@ static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_sm static void __mptcp_destroy_sock(struct sock *sk); static void mptcp_check_send_data_fin(struct sock *sk); -DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); +DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static struct net_device *mptcp_napi_dev; /* Returns end sequence number of the receiver's advertised window */ @@ -65,6 +68,26 @@ static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) return &inet_stream_ops; } +bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib) +{ + struct net *net = sock_net((struct sock *)msk); + + if (__mptcp_check_fallback(msk)) + return true; + + spin_lock_bh(&msk->fallback_lock); + if (!msk->allow_infinite_fallback) { + spin_unlock_bh(&msk->fallback_lock); + return false; + } + + msk->allow_subflows = false; + set_bit(MPTCP_FALLBACK_DONE, &msk->flags); + __MPTCP_INC_STATS(net, fb_mib); + spin_unlock_bh(&msk->fallback_lock); + return 
true; +} + static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -558,11 +581,7 @@ static bool mptcp_check_data_fin(struct sock *sk) static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) { - if (READ_ONCE(msk->allow_infinite_fallback)) { - MPTCP_INC_STATS(sock_net(ssk), - MPTCP_MIB_DSSCORRUPTIONFALLBACK); - mptcp_do_fallback(ssk); - } else { + if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSCORRUPTIONFALLBACK)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); mptcp_subflow_reset(ssk); } @@ -790,7 +809,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) { mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); - WRITE_ONCE(msk->allow_infinite_fallback, false); + msk->allow_infinite_fallback = false; mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); } @@ -801,6 +820,14 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) if (sk->sk_state != TCP_ESTABLISHED) return false; + spin_lock_bh(&msk->fallback_lock); + if (!msk->allow_subflows) { + spin_unlock_bh(&msk->fallback_lock); + return false; + } + mptcp_subflow_joined(msk, ssk); + spin_unlock_bh(&msk->fallback_lock); + /* attach to msk socket only after we are sure we will deal with it * at close time */ @@ -809,7 +836,6 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); - mptcp_subflow_joined(msk, ssk); mptcp_stop_tout_timer(sk); __mptcp_propagate_sndbuf(sk, ssk); return true; @@ -1134,10 +1160,13 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk, mpext->infinite_map = 1; mpext->data_len = 0; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); + if (!mptcp_try_fallback(ssk, MPTCP_MIB_INFINITEMAPTX)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_FALLBACKFAILED); + mptcp_subflow_reset(ssk); + return; + } + mptcp_subflow_ctx(ssk)->send_infinite_map = 0; - pr_fallback(msk); - mptcp_do_fallback(ssk); } #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) @@ -1374,7 +1403,7 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) * - estimate the faster flow linger time * - use the above to estimate the amount of byte transferred * by the faster flow - * - check that the amount of queued data is greter than the above, + * - check that the amount of queued data is greater than the above, * otherwise do not use the picked, slower, subflow * We select the subflow with the shorter estimated time to flush * the queued mem, which basically ensure the above. 
We just need @@ -2201,8 +2230,8 @@ out_err: static void mptcp_retransmit_timer(struct timer_list *t) { - struct inet_connection_sock *icsk = from_timer(icsk, t, - icsk_retransmit_timer); + struct inet_connection_sock *icsk = timer_container_of(icsk, t, + icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; struct mptcp_sock *msk = mptcp_sk(sk); @@ -2221,7 +2250,7 @@ static void mptcp_retransmit_timer(struct timer_list *t) static void mptcp_tout_timer(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); mptcp_schedule_work(sk); sock_put(sk); @@ -2541,9 +2570,9 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) static void __mptcp_retrans(struct sock *sk) { + struct mptcp_sendmsg_info info = { .data_lock_held = true, }; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; - struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; struct sock *ssk; int ret, err; @@ -2588,6 +2617,18 @@ static void __mptcp_retrans(struct sock *sk) info.sent = 0; info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; + + /* + * make the whole retrans decision, xmit, disallow + * fallback atomic + */ + spin_lock_bh(&msk->fallback_lock); + if (__mptcp_check_fallback(msk)) { + spin_unlock_bh(&msk->fallback_lock); + release_sock(ssk); + return; + } + while (info.sent < info.limit) { ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) @@ -2601,8 +2642,9 @@ static void __mptcp_retrans(struct sock *sk) len = max(copied, len); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); - WRITE_ONCE(msk->allow_infinite_fallback, false); + msk->allow_infinite_fallback = false; } + spin_unlock_bh(&msk->fallback_lock); release_sock(ssk); } @@ -2728,7 +2770,8 @@ static void __mptcp_init_sock(struct sock *sk) WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); - WRITE_ONCE(msk->allow_infinite_fallback, true); + msk->allow_infinite_fallback = true; + msk->allow_subflows = true; msk->recovery = false; msk->subflow_id = 1; msk->last_data_sent = tcp_jiffies32; @@ -2736,6 +2779,7 @@ static void __mptcp_init_sock(struct sock *sk) msk->last_ack_recv = tcp_jiffies32; mptcp_pm_data_init(msk); + spin_lock_init(&msk->fallback_lock); /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); @@ -3115,7 +3159,16 @@ static int mptcp_disconnect(struct sock *sk, int flags) * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); + + /* The first subflow is already in TCP_CLOSE status, the following + * can't overlap with a fallback anymore + */ + spin_lock_bh(&msk->fallback_lock); + msk->allow_subflows = true; + msk->allow_infinite_fallback = true; WRITE_ONCE(msk->flags, 0); + spin_unlock_bh(&msk->fallback_lock); + msk->cb_flags = 0; msk->recovery = false; WRITE_ONCE(msk->can_ack, false); @@ -3142,9 +3195,9 @@ static int mptcp_disconnect(struct sock *sk, int flags) #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { - unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); + struct mptcp6_sock *msk6 = container_of(mptcp_sk(sk), struct mptcp6_sock, msk); - return (struct ipv6_pinfo *)(((u8 *)sk) + offset); + return &msk6->np; } static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) @@ -3501,7 +3554,7 @@ 
void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); - sk->sk_uid = SOCK_INODE(parent)->i_uid; + WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); write_unlock_bh(&sk->sk_callback_lock); } @@ -3522,13 +3575,21 @@ bool mptcp_finish_join(struct sock *ssk) /* active subflow, already present inside the conn_list */ if (!list_empty(&subflow->node)) { + spin_lock_bh(&msk->fallback_lock); + if (!msk->allow_subflows) { + spin_unlock_bh(&msk->fallback_lock); + return false; + } mptcp_subflow_joined(msk, ssk); + spin_unlock_bh(&msk->fallback_lock); mptcp_propagate_sndbuf(parent, ssk); return true; } - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED); goto err_prohibited; + } /* If we can't acquire msk socket lock here, let the release callback * handle it @@ -3644,16 +3705,15 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * TCP option space. */ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) - mptcp_subflow_early_fallback(msk, subflow); + mptcp_early_fallback(msk, subflow, MPTCP_MIB_MD5SIGFALLBACK); #endif if (subflow->request_mptcp) { - if (mptcp_active_should_disable(sk)) { - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDISABLED); - mptcp_subflow_early_fallback(msk, subflow); - } else if (mptcp_token_new_connect(ssk) < 0) { - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); - mptcp_subflow_early_fallback(msk, subflow); - } + if (mptcp_active_should_disable(sk)) + mptcp_early_fallback(msk, subflow, + MPTCP_MIB_MPCAPABLEACTIVEDISABLED); + else if (mptcp_token_new_connect(ssk) < 0) + mptcp_early_fallback(msk, subflow, + MPTCP_MIB_TOKENFALLBACKINIT); } WRITE_ONCE(msk->write_seq, subflow->idsn); @@ -3725,7 +3785,7 @@ static struct proto mptcp_prot = { .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, - .memory_allocated = &tcp_memory_allocated, + .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d409586b5977f..b15d7fab5c4b6 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -326,6 +326,7 @@ struct mptcp_sock { int keepalive_cnt; int keepalive_idle; int keepalive_intvl; + int maxseg; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -346,10 +347,16 @@ struct mptcp_sock { u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; u8 scaling_ratio; + bool allow_subflows; u32 subflow_id; u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; + + spinlock_t fallback_lock; /* protects fallback, + * allow_infinite_fallback and + * allow_join + */ }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) @@ -479,6 +486,7 @@ mptcp_subflow_rsk(const struct request_sock *rsk) struct mptcp_delegated_action { struct napi_struct napi; + local_lock_t bh_lock; struct list_head head; }; @@ -670,9 +678,11 @@ static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); + 
local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); @@ -684,11 +694,15 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) { struct mptcp_subflow_context *ret; - if (list_empty(&delegated->head)) + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); + if (list_empty(&delegated->head)) { + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return NULL; + } ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); list_del_init(&ret->delegated_node); + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return ret; } @@ -744,6 +758,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); struct mptcp_sched_ops *mptcp_sched_find(const char *name); +int mptcp_validate_scheduler(struct mptcp_sched_ops *sched); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); void mptcp_sched_init(void); @@ -1208,17 +1223,6 @@ static inline bool mptcp_check_fallback(const struct sock *sk) return __mptcp_check_fallback(msk); } -static inline void __mptcp_do_fallback(struct mptcp_sock *msk) -{ - if (__mptcp_check_fallback(msk)) { - pr_debug("TCP fallback already done (msk=%p)\n", msk); - return; - } - if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback))) - return; - set_bit(MPTCP_FALLBACK_DONE, &msk->flags); -} - static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk) { struct sock *ssk = READ_ONCE(msk->first); @@ -1228,14 +1232,17 @@ static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk) TCPF_SYN_RECV | TCPF_LISTEN)); } -static inline void mptcp_do_fallback(struct sock *ssk) +bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib); + +static inline bool mptcp_try_fallback(struct sock *ssk, int fb_mib) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; struct mptcp_sock *msk; msk = mptcp_sk(sk); - __mptcp_do_fallback(msk); + if (!__mptcp_try_fallback(msk, fb_mib)) + return false; if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { gfp_t saved_allocation = ssk->sk_allocation; @@ -1247,16 +1254,15 @@ static inline void mptcp_do_fallback(struct sock *ssk) tcp_shutdown(ssk, SEND_SHUTDOWN); ssk->sk_allocation = saved_allocation; } + return true; } -#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a) - -static inline void mptcp_subflow_early_fallback(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) +static inline void mptcp_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + int fb_mib) { - pr_fallback(msk); subflow->request_mptcp = 0; - __mptcp_do_fallback(msk); + WARN_ON_ONCE(!__mptcp_try_fallback(msk, fb_mib)); } static inline bool mptcp_check_infinite_map(struct sk_buff *skb) diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index c16c6fbd4ba2f..1e59072d478c9 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,8 +16,7 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_send(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_send(struct mptcp_sock *msk) { struct sock *ssk; @@ -29,8 +28,7 @@ static int mptcp_sched_default_get_send(struct mptcp_sock *msk, return 0; } -static int 
mptcp_sched_default_get_retrans(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk) { struct sock *ssk; @@ -84,10 +82,23 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) rcu_read_unlock(); } -int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +int mptcp_validate_scheduler(struct mptcp_sched_ops *sched) { - if (!sched->get_send) + if (!sched->get_send) { + pr_err("%s does not implement required ops\n", sched->name); return -EINVAL; + } + + return 0; +} + +int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +{ + int ret; + + ret = mptcp_validate_scheduler(sched); + if (ret) + return ret; spin_lock(&mptcp_sched_list_lock); if (mptcp_sched_find(sched->name)) { @@ -157,7 +168,6 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -178,14 +188,13 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_send(msk, data); - return msk->sched->get_send(msk, data); + return mptcp_sched_default_get_send(msk); + return msk->sched->get_send(msk); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -199,8 +208,8 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_retrans(msk, data); + return mptcp_sched_default_get_retrans(msk); if (msk->sched->get_retrans) - return msk->sched->get_retrans(msk, data); - return msk->sched->get_send(msk, data); + return msk->sched->get_retrans(msk); + return msk->sched->get_send(msk); } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 3caa0a9d3b388..2c267aff95bec 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -798,6 +798,23 @@ unlock: return ret; } +static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, + int optname, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + int ret = 0; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ret = tcp_setsockopt(ssk, level, optname, optval, optlen); + if (ret) + break; + } + return ret; +} + static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -859,6 +876,11 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, &msk->keepalive_cnt, val); break; + case TCP_MAXSEG: + msk->maxseg = val; + ret = mptcp_setsockopt_all_sf(msk, SOL_TCP, optname, optval, + optlen); + break; default: ret = -ENOPROTOOPT; } @@ -914,10 +936,8 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int lock_sock(sk); ssk = msk->first; - if (ssk) { - ret = tcp_getsockopt(ssk, level, optname, optval, optlen); - goto out; - } + if (ssk) + goto get; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { @@ -925,6 +945,7 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int goto out; } +get: ret = tcp_getsockopt(ssk, level, optname, optval, optlen); out: @@ -1407,6 +1428,9 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat); case TCP_IS_MPTCP: return 
mptcp_put_int_option(msk, optval, optlen, 1); + case TCP_MAXSEG: + return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, + optval, optlen); } return -EOPNOTSUPP; } @@ -1553,6 +1577,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle); tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl); tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt); + tcp_sock_set_maxseg(ssk, msk->maxseg); inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 24c2de1891bdf..3f1b62a9fe889 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -247,6 +247,7 @@ again: if (unlikely(req->syncookie)) { if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } @@ -543,10 +544,13 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) { - MPTCP_INC_STATS(sock_net(sk), - MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); - mptcp_do_fallback(sk); - pr_fallback(msk); + if (!mptcp_try_fallback(sk, + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK)) { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_FALLBACKFAILED); + goto do_reset; + } + goto fallback; } @@ -745,15 +749,11 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ -static bool subflow_hmac_valid(const struct request_sock *req, +static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req, const struct mptcp_options_received *mp_opt) { - const struct mptcp_subflow_request_sock *subflow_req; + struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; - struct mptcp_sock *msk; - - subflow_req = mptcp_subflow_rsk(req); - msk = subflow_req->msk; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), @@ -899,13 +899,14 @@ create_child: goto dispose_child; } - if (!subflow_hmac_valid(req, &mp_opt)) { + if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } if (!mptcp_can_accept_new_subflow(owner)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } @@ -1302,20 +1303,29 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss mptcp_schedule_work(sk); } -static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) +static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); unsigned long fail_tout; + /* we are really failing, prevent any later subflow join */ + spin_lock_bh(&msk->fallback_lock); + if (!msk->allow_infinite_fallback) { + spin_unlock_bh(&msk->fallback_lock); + return false; + } + msk->allow_subflows = false; + spin_unlock_bh(&msk->fallback_lock); + /* graceful failure can happen only on the MPC subflow */ if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) - return; + return false; /* since the close timeout take precedence on the fail one, * no need to start the latter when the first is already set */ if (sock_flag((struct sock *)msk, SOCK_DEAD)) 
- return; + return true; /* we don't need extreme accuracy here, use a zero fail_tout as special * value meaning no fail timeout at all; @@ -1327,6 +1337,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) tcp_send_ack(ssk); mptcp_reset_tout_timer(msk, subflow->fail_tout); + return true; } static bool subflow_check_data_avail(struct sock *ssk) @@ -1387,17 +1398,16 @@ fallback: (subflow->mp_join || subflow->valid_csum_seen)) { subflow->send_mp_fail = 1; - if (!READ_ONCE(msk->allow_infinite_fallback)) { + if (!mptcp_subflow_fail(msk, ssk)) { subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; goto reset; } - mptcp_subflow_fail(msk, ssk); WRITE_ONCE(subflow->data_avail, true); return true; } - if (!READ_ONCE(msk->allow_infinite_fallback)) { + if (!mptcp_try_fallback(ssk, MPTCP_MIB_DSSFALLBACK)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ @@ -1415,8 +1425,6 @@ reset: WRITE_ONCE(subflow->data_avail, false); return false; } - - mptcp_do_fallback(ssk); } skb = skb_peek(&ssk->sk_receive_queue); @@ -1681,7 +1689,6 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); - WRITE_ONCE(msk->allow_infinite_fallback, false); mptcp_stop_tout_timer(sk); return 0; @@ -1847,14 +1854,11 @@ static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; - struct mptcp_sock *msk; __subflow_state_change(sk); - msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { - mptcp_do_fallback(sk); - pr_fallback(msk); + WARN_ON_ONCE(!mptcp_try_fallback(sk, MPTCP_MIB_SIMULTCONNFALLBACK)); subflow->conn_finished = 1; mptcp_propagate_state(parent, sk, subflow, NULL); } diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 4e0842df5234e..adee6dcabdc3f 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -110,7 +110,7 @@ struct ncsi_channel_version { u8 update; /* NCSI version update */ char alpha1; /* NCSI version alpha1 */ char alpha2; /* NCSI version alpha2 */ - u8 fw_name[12]; /* Firmware name string */ + u8 fw_name[12 + 1]; /* Firmware name string */ u32 fw_version; /* Firmware version */ u16 pci_ids[4]; /* PCI identification */ u32 mf_id; /* Manufacture ID */ @@ -143,16 +143,15 @@ struct ncsi_channel_vlan_filter { }; struct ncsi_channel_stats { - u32 hnc_cnt_hi; /* Counter cleared */ - u32 hnc_cnt_lo; /* Counter cleared */ - u32 hnc_rx_bytes; /* Rx bytes */ - u32 hnc_tx_bytes; /* Tx bytes */ - u32 hnc_rx_uc_pkts; /* Rx UC packets */ - u32 hnc_rx_mc_pkts; /* Rx MC packets */ - u32 hnc_rx_bc_pkts; /* Rx BC packets */ - u32 hnc_tx_uc_pkts; /* Tx UC packets */ - u32 hnc_tx_mc_pkts; /* Tx MC packets */ - u32 hnc_tx_bc_pkts; /* Tx BC packets */ + u64 hnc_cnt; /* Counter cleared */ + u64 hnc_rx_bytes; /* Rx bytes */ + u64 hnc_tx_bytes; /* Tx bytes */ + u64 hnc_rx_uc_pkts; /* Rx UC packets */ + u64 hnc_rx_mc_pkts; /* Rx MC packets */ + u64 hnc_rx_bc_pkts; /* Rx BC packets */ + u64 hnc_tx_uc_pkts; /* Tx UC packets */ + u64 hnc_tx_mc_pkts; /* Tx MC packets */ + u64 hnc_tx_bc_pkts; /* Tx BC packets */ u32 hnc_fcs_err; /* FCS errors */ u32 hnc_align_err; /* Alignment errors */ u32 hnc_false_carrier; /* False carrier detection */ @@ -181,7 +180,7 @@ struct ncsi_channel_stats { u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */ u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */ 
u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */ - u32 hnc_rx_valid_bytes; /* Rx valid bytes */ + u64 hnc_rx_valid_bytes; /* Rx valid bytes */ u32 hnc_rx_runt_pkts; /* Rx error runt packets */ u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */ u32 ncsi_rx_cmds; /* Rx NCSI commands */ @@ -323,7 +322,7 @@ struct ncsi_dev_priv { #define NCSI_DEV_RESHUFFLE 4 #define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ - struct sockaddr pending_mac; /* MAC address received from GMA */ + struct sockaddr_storage pending_mac; /* MAC address received from GMA */ spinlock_t lock; /* Protect the NCSI device */ unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index b369470637834..446e4e3b9553a 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -88,7 +88,7 @@ report: static void ncsi_channel_monitor(struct timer_list *t) { - struct ncsi_channel *nc = from_timer(nc, t, monitor.timer); + struct ncsi_channel *nc = timer_container_of(nc, t, monitor.timer); struct ncsi_package *np = nc->package; struct ncsi_dev_priv *ndp = np->ndp; struct ncsi_channel_mode *ncm; @@ -430,7 +430,7 @@ struct ncsi_dev *ncsi_find_dev(struct net_device *dev) static void ncsi_request_timeout(struct timer_list *t) { - struct ncsi_request *nr = from_timer(nr, t, timer); + struct ncsi_request *nr = timer_container_of(nr, t, timer); struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_cmd_pkt *cmd; struct ncsi_package *np; diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index f2f3b5c1b9412..24edb27379724 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -252,16 +252,15 @@ struct ncsi_rsp_gp_pkt { /* Get Controller Packet Statistics */ struct ncsi_rsp_gcps_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ - __be32 cnt_hi; /* Counter cleared */ - __be32 cnt_lo; /* Counter cleared */ - __be32 rx_bytes; /* Rx bytes */ - __be32 tx_bytes; /* Tx bytes */ - __be32 rx_uc_pkts; /* Rx UC packets */ - __be32 rx_mc_pkts; /* Rx MC packets */ - __be32 rx_bc_pkts; /* Rx BC packets */ - __be32 tx_uc_pkts; /* Tx UC packets */ - __be32 tx_mc_pkts; /* Tx MC packets */ - __be32 tx_bc_pkts; /* Tx BC packets */ + __be64 cnt; /* Counter cleared */ + __be64 rx_bytes; /* Rx bytes */ + __be64 tx_bytes; /* Tx bytes */ + __be64 rx_uc_pkts; /* Rx UC packets */ + __be64 rx_mc_pkts; /* Rx MC packets */ + __be64 rx_bc_pkts; /* Rx BC packets */ + __be64 tx_uc_pkts; /* Tx UC packets */ + __be64 tx_mc_pkts; /* Tx MC packets */ + __be64 tx_bc_pkts; /* Tx BC packets */ __be32 fcs_err; /* FCS errors */ __be32 align_err; /* Alignment errors */ __be32 false_carrier; /* False carrier detection */ @@ -290,11 +289,11 @@ struct ncsi_rsp_gcps_pkt { __be32 tx_1023_frames; /* Tx 512-1023 bytes frames */ __be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */ __be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */ - __be32 rx_valid_bytes; /* Rx valid bytes */ + __be64 rx_valid_bytes; /* Rx valid bytes */ __be32 rx_runt_pkts; /* Rx error runt packets */ __be32 rx_jabber_pkts; /* Rx error jabber packets */ __be32 checksum; /* Checksum */ -}; +} __packed __aligned(4); /* Get NCSI Statistics */ struct ncsi_rsp_gns_pkt { diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 4a8ce2949faea..271ec6c3929e8 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -628,7 +628,7 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) static int ncsi_rsp_handler_oem_gma(struct 
ncsi_request *nr, int mfr_id) { struct ncsi_dev_priv *ndp = nr->ndp; - struct sockaddr *saddr = &ndp->pending_mac; + struct sockaddr_storage *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_oem_pkt *rsp; u32 mac_addr_off = 0; @@ -644,11 +644,11 @@ static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id) else if (mfr_id == NCSI_OEM_MFR_INTEL_ID) mac_addr_off = INTEL_MAC_ADDR_OFFSET; - saddr->sa_family = ndev->type; - memcpy(saddr->sa_data, &rsp->data[mac_addr_off], ETH_ALEN); + saddr->ss_family = ndev->type; + memcpy(saddr->__data, &rsp->data[mac_addr_off], ETH_ALEN); if (mfr_id == NCSI_OEM_MFR_BCM_ID || mfr_id == NCSI_OEM_MFR_INTEL_ID) - eth_addr_inc((u8 *)saddr->sa_data); - if (!is_valid_ether_addr((const u8 *)saddr->sa_data)) + eth_addr_inc(saddr->__data); + if (!is_valid_ether_addr(saddr->__data)) return -ENXIO; /* Set the flag for GMA command which should only be called once */ @@ -775,6 +775,7 @@ static int ncsi_rsp_handler_gvi(struct ncsi_request *nr) ncv->alpha1 = rsp->alpha1; ncv->alpha2 = rsp->alpha2; memcpy(ncv->fw_name, rsp->fw_name, 12); + ncv->fw_name[12] = '\0'; ncv->fw_version = ntohl(rsp->fw_version); for (i = 0; i < ARRAY_SIZE(ncv->pci_ids); i++) ncv->pci_ids[i] = ntohs(rsp->pci_ids[i]); @@ -926,16 +927,15 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) /* Update HNC's statistics */ ncs = &nc->stats; - ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi); - ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo); - ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes); - ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes); - ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts); - ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts); - ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts); - ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts); - ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts); - ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts); + ncs->hnc_cnt = be64_to_cpu(rsp->cnt); + ncs->hnc_rx_bytes = be64_to_cpu(rsp->rx_bytes); + ncs->hnc_tx_bytes = be64_to_cpu(rsp->tx_bytes); + ncs->hnc_rx_uc_pkts = be64_to_cpu(rsp->rx_uc_pkts); + ncs->hnc_rx_mc_pkts = be64_to_cpu(rsp->rx_mc_pkts); + ncs->hnc_rx_bc_pkts = be64_to_cpu(rsp->rx_bc_pkts); + ncs->hnc_tx_uc_pkts = be64_to_cpu(rsp->tx_uc_pkts); + ncs->hnc_tx_mc_pkts = be64_to_cpu(rsp->tx_mc_pkts); + ncs->hnc_tx_bc_pkts = be64_to_cpu(rsp->tx_bc_pkts); ncs->hnc_fcs_err = ntohl(rsp->fcs_err); ncs->hnc_align_err = ntohl(rsp->align_err); ncs->hnc_false_carrier = ntohl(rsp->false_carrier); @@ -964,7 +964,7 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames); ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames); ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames); - ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes); + ncs->hnc_rx_valid_bytes = be64_to_cpu(rsp->rx_valid_bytes); ncs->hnc_rx_runt_pkts = ntohl(rsp->rx_runt_pkts); ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts); @@ -1089,7 +1089,7 @@ static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) { struct ncsi_dev_priv *ndp = nr->ndp; - struct sockaddr *saddr = &ndp->pending_mac; + struct sockaddr_storage *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_gmcma_pkt *rsp; int i; @@ -1106,15 +1106,15 @@ static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) rsp->addresses[i][4], rsp->addresses[i][5]); } - saddr->sa_family = ndev->type; + saddr->ss_family = ndev->type; for (i = 0; i < rsp->address_count; i++) { if 
(!is_valid_ether_addr(rsp->addresses[i])) { netdev_warn(ndev, "NCSI: Unable to assign %pM to device\n", rsp->addresses[i]); continue; } - memcpy(saddr->sa_data, rsp->addresses[i], ETH_ALEN); - netdev_warn(ndev, "NCSI: Will set MAC address to %pM\n", saddr->sa_data); + memcpy(saddr->__data, rsp->addresses[i], ETH_ALEN); + netdev_warn(ndev, "NCSI: Will set MAC address to %pM\n", saddr->__data); break; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 047ba81865edf..6cdc994fdc8a2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -195,16 +195,6 @@ config NF_CONNTRACK_LABELS config NF_CONNTRACK_OVS bool -config NF_CT_PROTO_DCCP - bool 'DCCP protocol connection tracking support' - depends on NETFILTER_ADVANCED - default y - help - With this option enabled, the layer 3 independent connection - tracking code will be able to do state tracking on DCCP connections. - - If unsure, say Y. - config NF_CT_PROTO_GRE bool @@ -212,7 +202,7 @@ config NF_CT_PROTO_SCTP bool 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default y - select CRC32 + select NET_CRC32C help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -475,7 +465,7 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK - select CRC32 + select NET_CRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to @@ -516,6 +506,12 @@ config NFT_CT This option adds the "ct" expression that you can use to match connection tracking information such as the flow state. +config NFT_EXTHDR_DCCP + bool "Netfilter nf_tables exthdr DCCP support (DEPRECATED)" + default n + help + This option adds support for matching on DCCP extension headers. + config NFT_FLOW_OFFLOAD depends on NF_CONNTRACK && NF_FLOW_TABLE tristate "Netfilter nf_tables hardware flow offload module" @@ -762,6 +758,16 @@ config NETFILTER_XTABLES_COMPAT If unsure, say N. +config NETFILTER_XTABLES_LEGACY + bool "Netfilter legacy tables support" + depends on !PREEMPT_RT + help + Say Y here if you still require support for legacy tables. This is + required by the legacy tools (iptables-legacy) and is not needed if + you use iptables over nftables (iptables-nft). + Legacy support is not limited to IP, it also includes EBTABLES and + ARPTABLES. + comment "Xtables combined modules" config NETFILTER_XT_MARK @@ -1180,7 +1186,7 @@ config NETFILTER_XT_MATCH_CGROUP tristate '"control group" match support' depends on NETFILTER_ADVANCED depends on CGROUPS - select CGROUP_NET_CLASSID + select SOCK_CGROUP_DATA help Socket/process control group matching allows you to match locally generated packets based on which net_cls control group processes @@ -1278,9 +1284,9 @@ config NETFILTER_XT_MATCH_CPU To compile it as a module, choose M here. If unsure, say N. 
config NETFILTER_XT_MATCH_DCCP - tristate '"dccp" protocol match support' + tristate '"dccp" protocol match support (DEPRECATED)' depends on NETFILTER_ADVANCED - default IP_DCCP + default n help With this option enabled, you will be able to use the iptables `dccp' match in order to match on DCCP source/destination ports diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index f0aa4d7ef4998..e43e20f529f87 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -12,7 +12,6 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o -nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o ifeq ($(CONFIG_NF_CONNTRACK),m) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b9f551f02c813..11a702065bab5 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@ const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); - #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 6ae042f702d20..798c7993635e6 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -264,7 +264,7 @@ out: static void mtype_gc(struct timer_list *t) { - struct mtype *map = from_timer(map, t, gc); + struct mtype *map = timer_container_of(map, t, gc); struct ip_set *set = map->set; void *x; u32 id; diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index db794fe1300e6..13c7a08aa868c 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -571,7 +571,7 @@ static const struct ip_set_type_variant set_variant = { static void list_set_gc(struct timer_list *t) { - struct list_set *map = from_timer(map, t, gc); + struct list_set *map = timer_container_of(map, t, gc); struct ip_set *set = map->set; spin_lock_bh(&set->lock); diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 8c5b1fe12d078..c203252e856d8 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -105,7 +105,7 @@ config IP_VS_PROTO_AH config IP_VS_PROTO_SCTP bool "SCTP load balancing support" - select CRC32 + select NET_CRC32C help This option enables support for load balancing SCTP transport protocol. Say Y if unsure. 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 8699944c0baf3..965f3c8e5089d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -846,7 +846,7 @@ static void ip_vs_conn_del_put(struct ip_vs_conn *cp) static void ip_vs_conn_expire(struct timer_list *t) { - struct ip_vs_conn *cp = from_timer(cp, t, timer); + struct ip_vs_conn *cp = timer_container_of(cp, t, timer); struct netns_ipvs *ipvs = cp->ipvs; /* @@ -926,7 +926,7 @@ static void ip_vs_conn_expire(struct timer_list *t) void ip_vs_conn_expire_now(struct ip_vs_conn *cp) { /* Using mod_timer_pending will ensure the timer is not - * modified after the final del_timer in ip_vs_conn_expire. + * modified after the final timer_delete in ip_vs_conn_expire. */ if (timer_pending(&cp->timer) && time_after(cp->timer.expires, jiffies)) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7d5b7418f8c72..6a6fc44785337 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1331,7 +1331,8 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) static void ip_vs_dest_trash_expire(struct timer_list *t) { - struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); + struct netns_ipvs *ipvs = timer_container_of(ipvs, t, + dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 2423513d701d4..156181a3bacd7 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -292,7 +292,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) */ static void ip_vs_lblc_check_expire(struct timer_list *t) { - struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_lblc_table *tbl = timer_container_of(tbl, t, + periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index cdb1d4bf6761c..a021e6aba3d7b 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -456,7 +456,8 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) */ static void ip_vs_lblcr_check_expire(struct timer_list *t) { - struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); + struct ip_vs_lblcr_table *tbl = timer_container_of(tbl, t, + periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 014f077403695..95af252b29397 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -97,7 +97,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest) if (!dest_dst) return NULL; dst = dest_dst->dst_cache; - if (dst->obsolete && + if (READ_ONCE(dst->obsolete) && dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; return dest_dst; diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 06b0848447003..46e667a50d988 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -17,7 +17,7 @@ static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, .skb = skb, }; - return bpf_prog_run(prog, &ctx); + return bpf_prog_run_pin_on_cpu(prog, &ctx); } struct bpf_nf_link { @@ -225,7 +225,8 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!link) return -ENOMEM; - 
bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog, + attr->link_create.attach_type); link->hook_ops.hook = nf_hook_run_bpf; link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF; @@ -295,6 +296,9 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type, if (off < 0 || off >= sizeof(struct bpf_nf_ctx)) return false; + if (off % size != 0) + return false; + if (type == BPF_WRITE) return false; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7f8b245e287ae..344f88295976d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -136,8 +136,8 @@ static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) } /* return true if we need to recompute hashes (in case hash table was resized) */ -static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, - unsigned int h2, unsigned int sequence) +static bool nf_conntrack_double_lock(unsigned int h1, unsigned int h2, + unsigned int sequence) { h1 %= CONNTRACK_LOCKS; h2 %= CONNTRACK_LOCKS; @@ -329,9 +329,6 @@ nf_ct_get_tuple(const struct sk_buff *skb, #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: -#endif /* fallthrough */ return nf_ct_get_tuple_ports(skb, dataoff, tuple); default: @@ -505,6 +502,11 @@ u32 nf_ct_get_id(const struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_ct_get_id); +static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) +{ + return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); +} + static void clean_from_lists(struct nf_conn *ct) { @@ -531,10 +533,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, p = tmpl; tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); - if (tmpl != p) { - tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); + if (tmpl != p) tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; - } } else { tmpl = kzalloc(sizeof(*tmpl), flags); if (!tmpl) @@ -613,7 +613,7 @@ static void __nf_ct_delete_from_lists(struct nf_conn *ct) reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); clean_from_lists(ct); nf_conntrack_double_unlock(hash, reply_hash); @@ -890,7 +890,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); @@ -1121,6 +1121,12 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); + /* confirmed bit must be set after hlist add, not before: + * loser_ct can still be visible to other cpu due to + * SLAB_TYPESAFE_BY_RCU. 
+ */ + smp_mb__before_atomic(); + set_bit(IPS_CONFIRMED_BIT, &loser_ct->status); NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; @@ -1228,7 +1234,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); - } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related * connections for unconfirmed conns. But packet copies and @@ -1257,8 +1263,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) * user context, else we insert an already 'dead' hash, blocking * further use of that particular connection -JM. */ - ct->status |= IPS_CONFIRMED; - if (unlikely(nf_ct_is_dying(ct))) { NF_CT_STAT_INC(net, insert_failed); goto dying; @@ -1290,7 +1294,7 @@ chaintoolong: } } - /* Timer relative to confirmation time, not original + /* Timeout is relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ ct->timeout += nfct_time_stamp; @@ -1298,11 +1302,21 @@ chaintoolong: __nf_conntrack_insert_prepare(ct); /* Since the lookup is lockless, hash insertion must be done after - * starting the timer and setting the CONFIRMED bit. The RCU barriers - * guarantee that no other CPU can find the conntrack before the above - * stores are visible. + * setting ct->timeout. The RCU barriers guarantee that no other CPU + * can find the conntrack before the above stores are visible. */ __nf_conntrack_hash_insert(ct, hash, reply_hash); + + /* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups + * skip entries that lack this bit. This happens when a CPU is looking + * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU + * or when another CPU encounters this entry right after the insertion + * but before the set-confirm-bit below. This bit must not be set until + * after __nf_conntrack_hash_insert(). 
+ */ + smp_mb__before_atomic(); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); @@ -1659,7 +1673,11 @@ __nf_conntrack_alloc(struct net *net, if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; atomic_dec(&cnet->count); - net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); + if (net == &init_net) + net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); + else + net_warn_ratelimited("nf_conntrack: table full in netns %u, dropping packet\n", + net->ns.inum); return ERR_PTR(-ENOMEM); } } @@ -1979,11 +1997,6 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct, return nf_conntrack_sctp_packet(ct, skb, dataoff, ctinfo, state); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: - return nf_conntrack_dccp_packet(ct, skb, dataoff, - ctinfo, state); -#endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return nf_conntrack_gre_packet(ct, skb, dataoff, @@ -2712,6 +2725,7 @@ static const struct nf_ct_hook nf_conntrack_hook = { .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, .confirm = __nf_conntrack_confirm, + .get_id = nf_conntrack_get_id, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 21d22fa22e4e7..cfc2daa3fc7f3 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { - struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); + struct nf_conntrack_expect *exp = timer_container_of(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 2cc0fde233447..486d52b45fe57 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2036,7 +2036,6 @@ static void ctnetlink_change_mark(struct nf_conn *ct, static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, - [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, }; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index f36727ed91e1a..bc1d96686b9c5 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -100,9 +100,6 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) case IPPROTO_UDP: return &nf_conntrack_l4proto_udp; case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp; case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp; -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp; #endif @@ -681,9 +678,6 @@ void nf_conntrack_proto_pernet_init(struct net *net) #if IS_ENABLED(CONFIG_IPV6) nf_conntrack_icmpv6_init_net(net); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - nf_conntrack_dccp_init_net(net); -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP nf_conntrack_sctp_init_net(net); #endif diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c deleted file mode 100644 index ebc4f733bb2e6..0000000000000 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ /dev/null @@ -1,826 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * DCCP connection tracking protocol helper - * - * 
Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net> - */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/sysctl.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/dccp.h> -#include <linux/slab.h> - -#include <net/net_namespace.h> -#include <net/netns/generic.h> - -#include <linux/netfilter/nfnetlink_conntrack.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_l4proto.h> -#include <net/netfilter/nf_conntrack_ecache.h> -#include <net/netfilter/nf_conntrack_timeout.h> -#include <net/netfilter/nf_log.h> - -/* Timeouts are based on values from RFC4340: - * - * - REQUEST: - * - * 8.1.2. Client Request - * - * A client MAY give up on its DCCP-Requests after some time - * (3 minutes, for example). - * - * - RESPOND: - * - * 8.1.3. Server Response - * - * It MAY also leave the RESPOND state for CLOSED after a timeout of - * not less than 4MSL (8 minutes); - * - * - PARTOPEN: - * - * 8.1.5. Handshake Completion - * - * If the client remains in PARTOPEN for more than 4MSL (8 minutes), - * it SHOULD reset the connection with Reset Code 2, "Aborted". - * - * - OPEN: - * - * The DCCP timestamp overflows after 11.9 hours. If the connection - * stays idle this long the sequence number won't be recognized - * as valid anymore. - * - * - CLOSEREQ/CLOSING: - * - * 8.3. Termination - * - * The retransmission timer should initially be set to go off in two - * round-trip times and should back off to not less than once every - * 64 seconds ... - * - * - TIMEWAIT: - * - * 4.3. States - * - * A server or client socket remains in this state for 2MSL (4 minutes) - * after the connection has been town down, ... - */ - -#define DCCP_MSL (2 * 60 * HZ) - -#ifdef CONFIG_NF_CONNTRACK_PROCFS -static const char * const dccp_state_names[] = { - [CT_DCCP_NONE] = "NONE", - [CT_DCCP_REQUEST] = "REQUEST", - [CT_DCCP_RESPOND] = "RESPOND", - [CT_DCCP_PARTOPEN] = "PARTOPEN", - [CT_DCCP_OPEN] = "OPEN", - [CT_DCCP_CLOSEREQ] = "CLOSEREQ", - [CT_DCCP_CLOSING] = "CLOSING", - [CT_DCCP_TIMEWAIT] = "TIMEWAIT", - [CT_DCCP_IGNORE] = "IGNORE", - [CT_DCCP_INVALID] = "INVALID", -}; -#endif - -#define sNO CT_DCCP_NONE -#define sRQ CT_DCCP_REQUEST -#define sRS CT_DCCP_RESPOND -#define sPO CT_DCCP_PARTOPEN -#define sOP CT_DCCP_OPEN -#define sCR CT_DCCP_CLOSEREQ -#define sCG CT_DCCP_CLOSING -#define sTW CT_DCCP_TIMEWAIT -#define sIG CT_DCCP_IGNORE -#define sIV CT_DCCP_INVALID - -/* - * DCCP state transition table - * - * The assumption is the same as for TCP tracking: - * - * We are the man in the middle. All the packets go through us but might - * get lost in transit to the destination. It is assumed that the destination - * can't receive segments we haven't seen. - * - * The following states exist: - * - * NONE: Initial state, expecting Request - * REQUEST: Request seen, waiting for Response from server - * RESPOND: Response from server seen, waiting for Ack from client - * PARTOPEN: Ack after Response seen, waiting for packet other than Response, - * Reset or Sync from server - * OPEN: Packet other than Response, Reset or Sync seen - * CLOSEREQ: CloseReq from server seen, expecting Close from client - * CLOSING: Close seen, expecting Reset - * TIMEWAIT: Reset seen - * IGNORE: Not determinable whether packet is valid - * - * Some states exist only on one side of the connection: REQUEST, RESPOND, - * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to - * the one it was in before. 
- * - * Packets are marked as ignored (sIG) if we don't know if they're valid - * (for example a reincarnation of a connection we didn't notice is dead - * already) and the server may send back a connection closing Reset or a - * Response. They're also used for Sync/SyncAck packets, which we don't - * care about. - */ -static const u_int8_t -dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { - [CT_DCCP_ROLE_CLIENT] = { - [DCCP_PKT_REQUEST] = { - /* - * sNO -> sRQ Regular Request - * sRQ -> sRQ Retransmitted Request or reincarnation - * sRS -> sRS Retransmitted Request (apparently Response - * got lost after we saw it) or reincarnation - * sPO -> sIG Ignore, conntrack might be out of sync - * sOP -> sIG Ignore, conntrack might be out of sync - * sCR -> sIG Ignore, conntrack might be out of sync - * sCG -> sIG Ignore, conntrack might be out of sync - * sTW -> sRQ Reincarnation - * - * sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */ - sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, - }, - [DCCP_PKT_RESPONSE] = { - /* - * sNO -> sIV Invalid - * sRQ -> sIG Ignore, might be response to ignored Request - * sRS -> sIG Ignore, might be response to ignored Request - * sPO -> sIG Ignore, might be response to ignored Request - * sOP -> sIG Ignore, might be response to ignored Request - * sCR -> sIG Ignore, might be response to ignored Request - * sCG -> sIG Ignore, might be response to ignored Request - * sTW -> sIV Invalid, reincarnation in reverse direction - * goes through sRQ - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, - }, - [DCCP_PKT_ACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) - * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN - * sOP -> sOP Regular ACK, remain in OPEN - * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV - }, - [DCCP_PKT_DATA] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) - * sOP -> sOP Regular Data packet - * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, - }, - [DCCP_PKT_DATAACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) - * sPO -> sPO Remain in PARTOPEN state - * sOP -> sOP Regular DataAck packet in OPEN state - * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV - }, - [DCCP_PKT_CLOSEREQ] = { - /* - * CLOSEREQ may only be sent by the server. - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV - }, - [DCCP_PKT_CLOSE] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sCG Client-initiated close - * sOP -> sCG Client-initiated close - * sCR -> sCG Close in response to CloseReq (8.3.) 
- * sCG -> sCG Retransmit - * sTW -> sIV Late retransmit, already in TIME_WAIT - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV - }, - [DCCP_PKT_RESET] = { - /* - * sNO -> sIV No connection - * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) - * sRS -> sTW Response received without Request - * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) - * sOP -> sTW Connection reset - * sCR -> sTW Connection reset - * sCG -> sTW Connection reset - * sTW -> sIG Ignore (don't refresh timer) - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG - }, - [DCCP_PKT_SYNC] = { - /* - * We currently ignore Sync packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - [DCCP_PKT_SYNCACK] = { - /* - * We currently ignore SyncAck packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - }, - [CT_DCCP_ROLE_SERVER] = { - [DCCP_PKT_REQUEST] = { - /* - * sNO -> sIV Invalid - * sRQ -> sIG Ignore, conntrack might be out of sync - * sRS -> sIG Ignore, conntrack might be out of sync - * sPO -> sIG Ignore, conntrack might be out of sync - * sOP -> sIG Ignore, conntrack might be out of sync - * sCR -> sIG Ignore, conntrack might be out of sync - * sCG -> sIG Ignore, conntrack might be out of sync - * sTW -> sRQ Reincarnation, must reverse roles - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ - }, - [DCCP_PKT_RESPONSE] = { - /* - * sNO -> sIV Response without Request - * sRQ -> sRS Response to clients Request - * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT) - * sPO -> sIG Response to an ignored Request or late retransmit - * sOP -> sIG Ignore, might be response to ignored Request - * sCR -> sIG Ignore, might be response to ignored Request - * sCG -> sIG Ignore, might be response to ignored Request - * sTW -> sIV Invalid, Request from client in sTW moves to sRQ - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV - }, - [DCCP_PKT_ACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular Ack in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_DATA] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular Data packet in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_DATAACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular DataAck in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_CLOSEREQ] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) 
- * sOP -> sCR CloseReq in OPEN state - * sCR -> sCR Retransmit - * sCG -> sCR Simultaneous close, client sends another Close - * sTW -> sIV Already closed - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV - }, - [DCCP_PKT_CLOSE] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP -> sCG Move direcly to CLOSING - * sOP -> sCG Move to CLOSING - * sCR -> sIV Close after CloseReq is invalid - * sCG -> sCG Retransmit - * sTW -> sIV Already closed - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV - }, - [DCCP_PKT_RESET] = { - /* - * sNO -> sIV No connection - * sRQ -> sTW Reset in response to Request - * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) - * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) - * sOP -> sTW - * sCR -> sTW - * sCG -> sTW - * sTW -> sIG Ignore (don't refresh timer) - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */ - sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG - }, - [DCCP_PKT_SYNC] = { - /* - * We currently ignore Sync packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - [DCCP_PKT_SYNCACK] = { - /* - * We currently ignore SyncAck packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - }, -}; - -static noinline bool -dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - const struct dccp_hdr *dh, - const struct nf_hook_state *hook_state) -{ - struct net *net = nf_ct_net(ct); - struct nf_dccp_net *dn; - const char *msg; - u_int8_t state; - - state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; - switch (state) { - default: - dn = nf_dccp_pernet(net); - if (dn->dccp_loose == 0) { - msg = "not picking up existing connection "; - goto out_invalid; - } - break; - case CT_DCCP_REQUEST: - break; - case CT_DCCP_INVALID: - msg = "invalid state transition "; - goto out_invalid; - } - - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.state = CT_DCCP_NONE; - ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; - ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; - ct->proto.dccp.handshake_seq = 0; - return true; - -out_invalid: - nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg); - return false; -} - -static u64 dccp_ack_seq(const struct dccp_hdr *dh) -{ - const struct dccp_hdr_ack_bits *dhack; - - dhack = (void *)dh + __dccp_basic_hdr_len(dh); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + - ntohl(dhack->dccph_ack_nr_low); -} - -static bool dccp_error(const struct dccp_hdr *dh, - struct sk_buff *skb, unsigned int dataoff, - const struct nf_hook_state *state) -{ - static const unsigned long require_seq48 = 1 << DCCP_PKT_REQUEST | - 1 << DCCP_PKT_RESPONSE | - 1 << DCCP_PKT_CLOSEREQ | - 1 << DCCP_PKT_CLOSE | - 1 << DCCP_PKT_RESET | - 1 << DCCP_PKT_SYNC | - 1 << DCCP_PKT_SYNCACK; - unsigned int dccp_len = skb->len - dataoff; - unsigned int cscov; - const char *msg; - u8 type; - - BUILD_BUG_ON(DCCP_PKT_INVALID >= BITS_PER_LONG); - - if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || - dh->dccph_doff * 4 > dccp_len) { - msg = "nf_ct_dccp: truncated/malformed packet "; - goto out_invalid; - } - - cscov = dccp_len; - if (dh->dccph_cscov) { - cscov = (dh->dccph_cscov - 1) * 4; - if (cscov > dccp_len) { - msg = "nf_ct_dccp: bad checksum coverage "; - goto out_invalid; - } - } - - if (state->hook == NF_INET_PRE_ROUTING && - 
state->net->ct.sysctl_checksum && - nf_checksum_partial(skb, state->hook, dataoff, cscov, - IPPROTO_DCCP, state->pf)) { - msg = "nf_ct_dccp: bad checksum "; - goto out_invalid; - } - - type = dh->dccph_type; - if (type >= DCCP_PKT_INVALID) { - msg = "nf_ct_dccp: reserved packet type "; - goto out_invalid; - } - - if (test_bit(type, &require_seq48) && !dh->dccph_x) { - msg = "nf_ct_dccp: type lacks 48bit sequence numbers"; - goto out_invalid; - } - - return false; -out_invalid: - nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg); - return true; -} - -struct nf_conntrack_dccp_buf { - struct dccp_hdr dh; /* generic header part */ - struct dccp_hdr_ext ext; /* optional depending dh->dccph_x */ - union { /* depends on header type */ - struct dccp_hdr_ack_bits ack; - struct dccp_hdr_request req; - struct dccp_hdr_response response; - struct dccp_hdr_reset rst; - } u; -}; - -static struct dccp_hdr * -dccp_header_pointer(const struct sk_buff *skb, int offset, const struct dccp_hdr *dh, - struct nf_conntrack_dccp_buf *buf) -{ - unsigned int hdrlen = __dccp_hdr_len(dh); - - if (hdrlen > sizeof(*buf)) - return NULL; - - return skb_header_pointer(skb, offset, hdrlen, buf); -} - -int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct nf_conntrack_dccp_buf _dh; - u_int8_t type, old_state, new_state; - enum ct_dccp_roles role; - unsigned int *timeouts; - struct dccp_hdr *dh; - - dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); - if (!dh) - return -NF_ACCEPT; - - if (dccp_error(dh, skb, dataoff, state)) - return -NF_ACCEPT; - - /* pull again, including possible 48 bit sequences and subtype header */ - dh = dccp_header_pointer(skb, dataoff, dh, &_dh); - if (!dh) - return -NF_ACCEPT; - - type = dh->dccph_type; - if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) - return -NF_ACCEPT; - - if (type == DCCP_PKT_RESET && - !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - /* Tear down connection immediately if only reply is a RESET */ - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_ACCEPT; - } - - spin_lock_bh(&ct->lock); - - role = ct->proto.dccp.role[dir]; - old_state = ct->proto.dccp.state; - new_state = dccp_state_table[role][type][old_state]; - - switch (new_state) { - case CT_DCCP_REQUEST: - if (old_state == CT_DCCP_TIMEWAIT && - role == CT_DCCP_ROLE_SERVER) { - /* Reincarnation in the reverse direction: reopen and - * reverse client/server roles. */ - ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER; - } - break; - case CT_DCCP_RESPOND: - if (old_state == CT_DCCP_REQUEST) - ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); - break; - case CT_DCCP_PARTOPEN: - if (old_state == CT_DCCP_RESPOND && - type == DCCP_PKT_ACK && - dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq) - set_bit(IPS_ASSURED_BIT, &ct->status); - break; - case CT_DCCP_IGNORE: - /* - * Connection tracking might be out of sync, so we ignore - * packets that might establish a new connection and resync - * if the server responds with a valid Response. 
- */ - if (ct->proto.dccp.last_dir == !dir && - ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && - type == DCCP_PKT_RESPONSE) { - ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); - new_state = CT_DCCP_RESPOND; - break; - } - ct->proto.dccp.last_dir = dir; - ct->proto.dccp.last_pkt = type; - - spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet"); - return NF_ACCEPT; - case CT_DCCP_INVALID: - spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition"); - return -NF_ACCEPT; - } - - ct->proto.dccp.last_dir = dir; - ct->proto.dccp.last_pkt = type; - ct->proto.dccp.state = new_state; - spin_unlock_bh(&ct->lock); - - if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, ct); - - timeouts = nf_ct_timeout_lookup(ct); - if (!timeouts) - timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; - nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - - return NF_ACCEPT; -} - -static bool dccp_can_early_drop(const struct nf_conn *ct) -{ - switch (ct->proto.dccp.state) { - case CT_DCCP_CLOSEREQ: - case CT_DCCP_CLOSING: - case CT_DCCP_TIMEWAIT: - return true; - default: - break; - } - - return false; -} - -#ifdef CONFIG_NF_CONNTRACK_PROCFS -static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) -{ - seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) -static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct, bool destroy) -{ - struct nlattr *nest_parms; - - spin_lock_bh(&ct->lock); - nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); - if (!nest_parms) - goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state)) - goto nla_put_failure; - - if (destroy) - goto skip_state; - - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || - nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, - cpu_to_be64(ct->proto.dccp.handshake_seq), - CTA_PROTOINFO_DCCP_PAD)) - goto nla_put_failure; -skip_state: - nla_nest_end(skb, nest_parms); - spin_unlock_bh(&ct->lock); - - return 0; - -nla_put_failure: - spin_unlock_bh(&ct->lock); - return -1; -} - -static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { - [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, - [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, -}; - -#define DCCP_NLATTR_SIZE ( \ - NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \ - NLA_ALIGN(NLA_HDRLEN + 0)) - -static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) -{ - struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; - struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; - int err; - - if (!attr) - return 0; - - err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr, - dccp_nla_policy, NULL); - if (err < 0) - return err; - - if (!tb[CTA_PROTOINFO_DCCP_STATE] || - !tb[CTA_PROTOINFO_DCCP_ROLE] || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { - return -EINVAL; - } - - spin_lock_bh(&ct->lock); - ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); - if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { - 
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; - } else { - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; - } - if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { - ct->proto.dccp.handshake_seq = - be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); - } - spin_unlock_bh(&ct->lock); - return 0; -} -#endif - -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - -#include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/nfnetlink_cttimeout.h> - -static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - struct nf_dccp_net *dn = nf_dccp_pernet(net); - unsigned int *timeouts = data; - int i; - - if (!timeouts) - timeouts = dn->dccp_timeout; - - /* set default DCCP timeouts. */ - for (i=0; i<CT_DCCP_MAX; i++) - timeouts[i] = dn->dccp_timeout[i]; - - /* there's a 1:1 mapping between attributes and protocol states. */ - for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { - if (tb[i]) { - timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ; - } - } - - timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST]; - return 0; -} - -static int -dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) -{ - const unsigned int *timeouts = data; - int i; - - for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) { - if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ))) - goto nla_put_failure; - } - return 0; - -nla_put_failure: - return -ENOSPC; -} - -static const struct nla_policy -dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = { - [CTA_TIMEOUT_DCCP_REQUEST] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_RESPOND] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_PARTOPEN] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_OPEN] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_CLOSEREQ] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_CLOSING] = { .type = NLA_U32 }, - [CTA_TIMEOUT_DCCP_TIMEWAIT] = { .type = NLA_U32 }, -}; -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - -void nf_conntrack_dccp_init_net(struct net *net) -{ - struct nf_dccp_net *dn = nf_dccp_pernet(net); - - /* default values */ - dn->dccp_loose = 1; - dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; - - /* timeouts[0] is unused, make it same as SYN_SENT so - * ->timeouts[0] contains 'new' timeout, like udp or icmp. 
- */ - dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { - .l4proto = IPPROTO_DCCP, - .can_early_drop = dccp_can_early_drop, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = dccp_print_conntrack, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = DCCP_NLATTR_SIZE, - .to_nlattr = dccp_to_nlattr, - .from_nlattr = nlattr_to_dccp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = dccp_timeout_nlattr_to_obj, - .obj_to_nlattr = dccp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_DCCP_MAX, - .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, - .nla_policy = dccp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -}; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 2f666751c7e7c..9b8b10a852339 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -14,6 +14,7 @@ #include <linux/sysctl.h> #endif +#include <net/netfilter/nf_log.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> @@ -67,11 +68,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, ntohs(tuple->dst.u.udp.port)); break; - case IPPROTO_DCCP: - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.dccp.port), - ntohs(tuple->dst.u.dccp.port)); - break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), @@ -98,69 +94,87 @@ struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; + unsigned int skip_elems; unsigned int bucket; u_int64_t time_now; }; -static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) +static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net, + struct ct_iter_state *st) { - struct ct_iter_state *st = seq->private; + struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int i; - for (st->bucket = 0; - st->bucket < st->htable_size; - st->bucket++) { - n = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - if (!is_a_nulls(n)) - return n; - } - return NULL; -} + for (i = st->bucket; i < st->htable_size; i++) { + unsigned int skip = 0; -static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, - struct hlist_nulls_node *head) -{ - struct ct_iter_state *st = seq->private; +restart: + hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct hlist_nulls_node *tmp = n; + + if (!net_eq(net, nf_ct_net(ct))) + continue; + + if (++skip <= st->skip_elems) + continue; + + /* h should be returned, skip to nulls marker. 
*/ + while (!is_a_nulls(tmp)) + tmp = rcu_dereference(hlist_nulls_next_rcu(tmp)); - head = rcu_dereference(hlist_nulls_next_rcu(head)); - while (is_a_nulls(head)) { - if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= st->htable_size) - return NULL; + /* check if h is still linked to hash[i] */ + if (get_nulls_value(tmp) != i) { + skip = 0; + goto restart; + } + + st->skip_elems = skip; + st->bucket = i; + return h; } - head = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - } - return head; -} -static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_nulls_node *head = ct_get_first(seq); + skip = 0; + if (get_nulls_value(n) != i) + goto restart; + + st->skip_elems = 0; + } - if (head) - while (pos && (head = ct_get_next(seq, head))) - pos--; - return pos ? NULL : head; + st->bucket = i; + return NULL; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; + struct net *net = seq_file_net(seq); st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); - return ct_get_idx(seq, *pos); + + if (*pos == 0) { + st->skip_elems = 0; + st->bucket = 0; + } else if (st->skip_elems) { + /* resume from last dumped entry */ + st->skip_elems--; + } + + return ct_get_next(net, st); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct ct_iter_state *st = s->private; + struct net *net = seq_file_net(s); + (*pos)++; - return ct_get_next(s, v); + return ct_get_next(net, st); } static void ct_seq_stop(struct seq_file *s, void *v) @@ -261,7 +275,6 @@ static const char* l4proto_name(u16 proto) case IPPROTO_ICMP: return "icmp"; case IPPROTO_TCP: return "tcp"; case IPPROTO_UDP: return "udp"; - case IPPROTO_DCCP: return "dccp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; @@ -543,6 +556,29 @@ nf_conntrack_hash_sysctl(const struct ctl_table *table, int write, return ret; } +static int +nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, i; + + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret < 0 || !write) + return ret; + + if (*(u8 *)table->data == 0) + return ret; + + /* Load nf_log_syslog only if no logger is currently registered */ + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + if (nf_log_is_registered(i)) + return ret; + } + request_module("%s", "nf_log_syslog"); + + return ret; +} + static struct ctl_table_header *nf_ct_netfilter_header; enum nf_ct_sysctl_index { @@ -594,16 +630,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT, - NF_SYSCTL_CT_PROTO_DCCP_LOOSE, -#endif #ifdef CONFIG_NF_CT_PROTO_GRE NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, @@ -649,7 +675,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dou8vec_minmax, + .proc_handler = nf_conntrack_log_invalid_sysctl, }, [NF_SYSCTL_CT_EXPECT_MAX] 
= { .procname = "nf_conntrack_expect_max", @@ -877,58 +903,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { - .procname = "nf_conntrack_dccp_timeout_request", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = { - .procname = "nf_conntrack_dccp_timeout_respond", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = { - .procname = "nf_conntrack_dccp_timeout_partopen", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = { - .procname = "nf_conntrack_dccp_timeout_open", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = { - .procname = "nf_conntrack_dccp_timeout_closereq", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = { - .procname = "nf_conntrack_dccp_timeout_closing", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = { - .procname = "nf_conntrack_dccp_timeout_timewait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { - .procname = "nf_conntrack_dccp_loose", - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_NF_CT_PROTO_GRE [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { .procname = "nf_conntrack_gre_timeout", @@ -1014,29 +988,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, #endif } -static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net, - struct ctl_table *table) -{ -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct nf_dccp_net *dn = nf_dccp_pernet(net); - -#define XASSIGN(XNAME, dn) \ - table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \ - &(dn)->dccp_timeout[CT_DCCP_ ## XNAME] - - XASSIGN(REQUEST, dn); - XASSIGN(RESPOND, dn); - XASSIGN(PARTOPEN, dn); - XASSIGN(OPEN, dn); - XASSIGN(CLOSEREQ, dn); - XASSIGN(CLOSING, dn); - XASSIGN(TIMEWAIT, dn); -#undef XASSIGN - - table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose; -#endif -} - static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, struct ctl_table *table) { @@ -1082,7 +1033,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); - nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); /* Don't allow non-init_net ns to alter global sysctls */ diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a8e2425e43b0d..fab8b9011098f 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -15,12 +15,26 @@ #define NF_RECURSION_LIMIT 2 -static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return &current->net_xmit.nf_dup_skb_recursion;
+} + +#endif static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { - if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) goto err; if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { @@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb_clear_tstamp(skb); - __this_cpu_inc(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)++; dev_queue_xmit(skb); - __this_cpu_dec(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)--; return; err: kfree_skb(skb); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 6dd0de33eebd8..74cef8bf554c5 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -125,6 +125,32 @@ void nf_log_unregister(struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_unregister); +/** + * nf_log_is_registered - Check if any logger is registered for a given + * protocol family. + * + * @pf: Protocol family + * + * Returns: true if at least one logger is active for @pf, false otherwise. + */ +bool nf_log_is_registered(u_int8_t pf) +{ + int i; + + if (pf >= NFPROTO_NUMPROTO) { + WARN_ON_ONCE(1); + return false; + } + + for (i = 0; i < NF_LOG_TYPE_MAX; i++) { + if (rcu_access_pointer(loggers[pf][i])) + return true; + } + + return false; +} +EXPORT_SYMBOL(nf_log_is_registered); + int nf_log_bind_pf(struct net *net, u_int8_t pf, const struct nf_logger *logger) { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index aad84aabd7f1d..78a61dac4ade8 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -69,7 +69,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } @@ -81,7 +80,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } @@ -102,7 +100,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } @@ -114,7 +111,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } @@ -248,7 +244,7 @@ static noinline bool nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_ct) { - static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT; + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST; const struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conn *ct; @@ -287,8 +283,14 @@ nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, zone = nf_ct_zone(ignored_ct); thash = nf_conntrack_find_get(net, zone, tuple); - if (unlikely(!thash)) /* clashing entry went away */ - return false; + if (unlikely(!thash)) { + struct 
nf_conntrack_tuple reply; + + nf_ct_invert_tuple(&reply, tuple); + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) /* clashing entry went away */ + return false; + } ct = nf_ct_tuplehash_to_ctrack(thash); @@ -426,7 +428,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: - case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; @@ -626,7 +627,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: - case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index dc450cc81222b..b14a434b95612 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -180,46 +180,6 @@ tcp_manip_pkt(struct sk_buff *skb, } static bool -dccp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct dccp_hdr *hdr; - __be16 *portptr, oldport, newport; - int hdrsize = 8; /* DCCP connection tracking guarantees this much */ - - if (skb->len >= hdroff + sizeof(struct dccp_hdr)) - hdrsize = sizeof(struct dccp_hdr); - - if (skb_ensure_writable(skb, hdroff + hdrsize)) - return false; - - hdr = (struct dccp_hdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - newport = tuple->src.u.dccp.port; - portptr = &hdr->dccph_sport; - } else { - newport = tuple->dst.u.dccp.port; - portptr = &hdr->dccph_dport; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return true; - - nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); - inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - false); -#endif - return true; -} - -static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, @@ -338,9 +298,6 @@ static bool l4proto_manip_pkt(struct sk_buff *skb, case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); - case IPPROTO_DCCP: - return dccp_manip_pkt(skb, iphdroff, hdroff, - tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a133e1c175ce9..13d0ed9d18954 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -300,40 +300,75 @@ void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { + struct nf_hook_ops *ops; struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) - goto err_register; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nf_register_net_hook(net, ops); + if (err < 0) + goto err_register; - j++; + j++; + } } return 0; err_register: list_for_each_entry(hook, hook_list, list) { - if (j-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (j-- <= 0) + break; - nf_unregister_net_hook(net, &hook->ops); + nf_unregister_net_hook(net, ops); + } } return err; } +static void nft_netdev_hook_free_ops(struct nft_hook *hook) +{ + struct nf_hook_ops *ops, *next; + + list_for_each_entry_safe(ops, 
next, &hook->ops_list, list) { + list_del(&ops->list); + kfree(ops); + } +} + +static void nft_netdev_hook_free(struct nft_hook *hook) +{ + nft_netdev_hook_free_ops(hook); + kfree(hook); +} + +static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu) +{ + struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu); + + nft_netdev_hook_free(hook); +} + +static void nft_netdev_hook_free_rcu(struct nft_hook *hook) +{ + call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu); +} + static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } } @@ -1118,9 +1153,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, { struct nlmsghdr *nlh; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, - NFNETLINK_V0, nft_base_seq(net)); + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; @@ -1130,7 +1165,8 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, NFTA_TABLE_PAD)) goto nla_put_failure; - if (event == NFT_MSG_DELTABLE) { + if (event == NFT_MSG_DELTABLE || + event == NFT_MSG_DESTROYTABLE) { nlmsg_end(skb, nlh); return 0; } @@ -1981,9 +2017,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, { struct nlmsghdr *nlh; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, - NFNETLINK_V0, nft_base_seq(net)); + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; @@ -1993,7 +2029,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, NFTA_CHAIN_PAD)) goto nla_put_failure; - if (event == NFT_MSG_DELCHAIN && !hook_list) { + if (!hook_list && + (event == NFT_MSG_DELCHAIN || + event == NFT_MSG_DESTROYCHAIN)) { nlmsg_end(skb, nlh); return 0; } @@ -2253,7 +2291,7 @@ void nf_tables_chain_destroy(struct nft_chain *chain) list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } module_put(basechain->type->owner); @@ -2274,19 +2312,20 @@ void nf_tables_chain_destroy(struct nft_chain *chain) static struct nft_hook *nft_netdev_hook_alloc(struct net *net, const struct nlattr *attr) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; int err; hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); - if (!hook) { - err = -ENOMEM; - goto err_hook_alloc; - } + if (!hook) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&hook->ops_list); err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); if (err < 0) - goto err_hook_dev; + goto err_hook_free; hook->ifnamelen = nla_len(attr); @@ -2294,18 +2333,22 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, * indirectly serializing all the other holders of the commit_mutex with * the rtnl_mutex. 
*/ - dev = __dev_get_by_name(net, hook->ifname); - if (!dev) { - err = -ENOENT; - goto err_hook_dev; - } - hook->ops.dev = dev; + for_each_netdev(net, dev) { + if (strncmp(dev->name, hook->ifname, hook->ifnamelen)) + continue; + ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); + if (!ops) { + err = -ENOMEM; + goto err_hook_free; + } + ops->dev = dev; + list_add_tail(&ops->list, &hook->ops_list); + } return hook; -err_hook_dev: - kfree(hook); -err_hook_alloc: +err_hook_free: + nft_netdev_hook_free(hook); return ERR_PTR(err); } @@ -2315,7 +2358,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { - if (!strcmp(hook->ifname, this->ifname)) + if (!strncmp(hook->ifname, this->ifname, + min(hook->ifnamelen, this->ifnamelen))) return hook; } @@ -2345,7 +2389,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, } if (nft_hook_list_find(hook_list, hook)) { NL_SET_BAD_ATTR(extack, tmp); - kfree(hook); + nft_netdev_hook_free(hook); err = -EEXIST; goto err_hook; } @@ -2363,7 +2407,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, err_hook: list_for_each_entry_safe(hook, next, hook_list, list) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } return err; } @@ -2506,7 +2550,7 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) list_for_each_entry_safe(h, next, &hook->list, list) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } module_put(hook->type->owner); } @@ -2559,6 +2603,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; + struct nf_hook_ops *ops; struct nft_hook *h; basechain->type = hook->type; @@ -2567,8 +2612,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); - list_for_each_entry(h, &basechain->hook_list, list) - nft_basechain_hook_init(&h->ops, family, hook, chain); + list_for_each_entry(h, &basechain->hook_list, list) { + list_for_each_entry(ops, &h->ops_list, list) + nft_basechain_hook_init(ops, family, hook, chain); + } } nft_basechain_hook_init(&basechain->ops, family, hook, chain); @@ -2787,15 +2834,17 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { list_for_each_entry_safe(h, next, &hook.list, list) { - h->ops.pf = basechain->ops.pf; - h->ops.hooknum = basechain->ops.hooknum; - h->ops.priority = basechain->ops.priority; - h->ops.priv = basechain->ops.priv; - h->ops.hook = basechain->ops.hook; + list_for_each_entry(ops, &h->ops_list, list) { + ops->pf = basechain->ops.pf; + ops->hooknum = basechain->ops.hooknum; + ops->priority = basechain->ops.priority; + ops->priv = basechain->ops.priv; + ops->hook = basechain->ops.hook; + } if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } } } else { @@ -2913,10 +2962,12 @@ err_trans: err_hooks: if (nla[NFTA_CHAIN_HOOK]) { list_for_each_entry_safe(h, next, &hook.list, list) { - if (unregister) - nf_unregister_net_hook(ctx->net, &h->ops); + if (unregister) { + list_for_each_entry(ops, &h->ops_list, list) + nf_unregister_net_hook(ctx->net, ops); + } list_del(&h->list); - kfree_rcu(h, rcu); + nft_netdev_hook_free_rcu(h); } module_put(hook.type->owner); } @@ -3991,7 +4042,7 @@ void 
nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) /* can only be used if rule is no longer visible to dumps */ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { - lockdep_commit_lock_is_held(ctx->net); + WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net)); nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); @@ -4569,6 +4620,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), + [NFTA_SET_TYPE] = { .type = NLA_REJECT }, + [NFTA_SET_COUNT] = { .type = NLA_REJECT }, }; static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { @@ -4763,6 +4816,27 @@ static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) return size; } +static noinline_for_stack int +nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set) +{ + unsigned int nelems; + char str[40]; + int ret; + + ret = snprintf(str, sizeof(str), "%ps", set->ops); + + /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped + * to userspace purely for informational/debug purposes. + */ + DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str)); + + if (nla_put_string(skb, NFTA_SET_TYPE, str)) + return -EMSGSIZE; + + nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems)); + return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -4774,9 +4848,10 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, u32 seq = ctx->seq; int i; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, ctx->family, NFNETLINK_V0, + nft_base_seq(ctx->net)); if (!nlh) goto nla_put_failure; @@ -4788,7 +4863,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, NFTA_SET_PAD)) goto nla_put_failure; - if (event == NFT_MSG_DELSET) { + if (event == NFT_MSG_DELSET || + event == NFT_MSG_DESTROYSET) { nlmsg_end(skb, nlh); return 0; } @@ -4843,6 +4919,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); + if (nf_tables_fill_set_info(skb, set)) + goto nla_put_failure; + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) @@ -5785,7 +5864,7 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { - lockdep_commit_lock_is_held(ctx->net); + WARN_ON_ONCE(!lockdep_commit_lock_is_held(ctx->net)); switch (phase) { case NFT_TRANS_PREPARE_ERROR: @@ -8264,25 +8343,26 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, { struct nlmsghdr *nlh; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, - NFNETLINK_V0, nft_base_seq(net)); + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || nla_put_string(skb, 
NFTA_OBJ_NAME, obj->key.name) || + nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle), NFTA_OBJ_PAD)) goto nla_put_failure; - if (event == NFT_MSG_DELOBJ) { + if (event == NFT_MSG_DELOBJ || + event == NFT_MSG_DESTROYOBJ) { nlmsg_end(skb, nlh); return 0; } - if (nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || - nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || + if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) || nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset)) goto nla_put_failure; @@ -8759,6 +8839,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, struct netlink_ext_ack *extack, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; + struct nf_hook_ops *ops; struct nft_hook *hook; int hooknum, priority; int err; @@ -8813,11 +8894,14 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, } list_for_each_entry(hook, &flowtable_hook->list, list) { - hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = flowtable_hook->num; - hook->ops.priority = flowtable_hook->priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + list_for_each_entry(ops, &hook->ops_list, list) { + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable_hook->num; + ops->priority = flowtable_hook->priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->hook_ops_type = NF_HOOK_OP_NFT_FT; + } } return err; @@ -8859,12 +8943,12 @@ nft_flowtable_type_get(struct net *net, u8 family) } /* Only called from error and netdev event paths. */ -static void nft_unregister_flowtable_hook(struct net *net, - struct nft_flowtable *flowtable, - struct nft_hook *hook) +static void nft_unregister_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + nf_unregister_net_hook(net, ops); + flowtable->data.type->setup(&flowtable->data, ops->dev, FLOW_BLOCK_UNBIND); } @@ -8874,14 +8958,14 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } } @@ -8893,6 +8977,26 @@ static void nft_unregister_flowtable_net_hooks(struct net *net, __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } +static int nft_register_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) +{ + int err; + + err = flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_BIND); + if (err < 0) + return err; + + err = nf_register_net_hook(net, ops); + if (!err) + return 0; + + flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_UNBIND); + return err; +} + static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, struct list_head *hook_list, @@ -8900,6 +9004,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, { struct nft_hook *hook, *next; struct nft_flowtable *ft; + struct nf_hook_ops *ops; int err, i = 0; list_for_each_entry(hook, 
hook_list, list) { @@ -8913,33 +9018,27 @@ static int nft_register_flowtable_net_hooks(struct net *net, } } - err = flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_BIND); - if (err < 0) - goto err_unregister_net_hooks; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nft_register_flowtable_ops(net, flowtable, ops); + if (err < 0) + goto err_unregister_net_hooks; - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) { - flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_UNBIND); - goto err_unregister_net_hooks; + i++; } - - i++; } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - nft_unregister_flowtable_hook(net, flowtable, hook); + nft_unregister_flowtable_ops(net, flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -8951,7 +9050,7 @@ static void nft_hooks_destroy(struct list_head *hook_list) list_for_each_entry_safe(hook, next, hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } @@ -8962,6 +9061,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; struct nft_hook *hook, *next; + struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; u32 flags; @@ -8975,7 +9075,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } } @@ -9019,10 +9119,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { - if (unregister) - nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + if (unregister) { + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(ctx->net, + flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -9168,7 +9271,7 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { list_del(&this->list); - kfree(this); + nft_netdev_hook_free(this); } } @@ -9286,9 +9389,9 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, struct nft_hook *hook; struct nlmsghdr *nlh; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, - NFNETLINK_V0, nft_base_seq(net)); + nlh = nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), + flags, family, NFNETLINK_V0, nft_base_seq(net)); if (!nlh) goto nla_put_failure; @@ -9298,7 +9401,9 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, NFTA_FLOWTABLE_PAD)) goto nla_put_failure; - if (event == NFT_MSG_DELFLOWTABLE && !hook_list) { + if (!hook_list && + (event == NFT_MSG_DELFLOWTABLE || + event == NFT_MSG_DESTROYFLOWTABLE)) { nlmsg_end(skb, nlh); return 0; } @@ -9531,7 +9636,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); 
list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } kfree(flowtable->name); module_put(flowtable->data.type->owner); @@ -9564,46 +9669,132 @@ nla_put_failure: return -EMSGSIZE; } -static void nft_flowtable_event(unsigned long event, struct net_device *dev, - struct nft_flowtable *flowtable) +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev) +{ + struct nf_hook_ops *ops; + + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops); + +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev) { + struct nf_hook_ops *ops; + + list_for_each_entry_rcu(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); + +static int nft_flowtable_event(unsigned long event, struct net_device *dev, + struct nft_flowtable *flowtable, bool changename) +{ + struct nf_hook_ops *ops; struct nft_hook *hook; + bool match; list_for_each_entry(hook, &flowtable->hook_list, list) { - if (hook->ops.dev != dev) - continue; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - /* flow_offload_netdev_event() cleans up entries for us. */ - nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + /* flow_offload_netdev_event() cleans up entries for us. */ + nft_unregister_flowtable_ops(dev_net(dev), + flowtable, ops); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; + + ops = kzalloc(sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable->hooknum; + ops->priority = flowtable->data.priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->hook_ops_type = NF_HOOK_OP_NFT_FT; + ops->dev = dev; + if (nft_register_flowtable_ops(dev_net(dev), + flowtable, ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } break; } + return 0; +} + +static int __nf_tables_flowtable_event(unsigned long event, + struct net_device *dev, + bool changename) +{ + struct nftables_pernet *nft_net = nft_pernet(dev_net(dev)); + struct nft_flowtable *flowtable; + struct nft_table *table; + + list_for_each_entry(table, &nft_net->tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { + if (nft_flowtable_event(event, dev, + flowtable, changename)) + return 1; + } + } + return 0; } static int nf_tables_flowtable_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_flowtable *flowtable; struct nftables_pernet *nft_net; - struct nft_table *table; + int ret = NOTIFY_DONE; struct net *net; - if (event != NETDEV_UNREGISTER) - return 0; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; net = dev_net(dev); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); - list_for_each_entry(table, 
&nft_net->tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - nft_flowtable_event(event, dev, flowtable); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; } + __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_flowtable_event(event, dev, false)) { + ret = NOTIFY_BAD; } +out_unlock: mutex_unlock(&nft_net->commit_mutex); - - return NOTIFY_DONE; + return ret; } static struct notifier_block nf_tables_flowtable_notifier = { diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 64675f1c7f295..fd30e205de849 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -220,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain) bool nft_chain_offload_support(const struct nft_base_chain *basechain) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; @@ -227,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain) return false; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.pf != NFPROTO_NETDEV || - hook->ops.hooknum != NF_NETDEV_INGRESS) - return false; - - dev = hook->ops.dev; - if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) - return false; + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->pf != NFPROTO_NETDEV || + ops->hooknum != NF_NETDEV_INGRESS) + return false; + + dev = ops->dev; + if (!dev->netdev_ops->ndo_setup_tc && + !flow_indr_dev_exists()) + return false; + } } return true; @@ -455,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { - struct net_device *dev; + struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { - dev = hook->ops.dev; - if (this_dev && this_dev != dev) - continue; + list_for_each_entry(ops, &hook->ops_list, list) { + if (this_dev && this_dev != ops->dev) + continue; - err = nft_chain_offload_cmd(basechain, dev, cmd); - if (err < 0 && cmd == FLOW_BLOCK_BIND) { - if (!this_dev) - goto err_flow_block; + err = nft_chain_offload_cmd(basechain, ops->dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; - return err; + return err; + } + i++; } - i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - dev = hook->ops.dev; - nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + nft_chain_offload_cmd(basechain, ops->dev, + FLOW_BLOCK_UNBIND); + } } return err; } @@ -638,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops(hook, dev)) continue; found = hook; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 580c55268f657..a88abae5a9de2 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -15,6 +15,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ 
-90,6 +91,52 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb, return 0; } +static int nf_trace_fill_ct_info(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + const struct nf_ct_hook *ct_hook; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + u32 state; + + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) + return 0; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { + if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */ + return 0; + + state = NF_CT_STATE_UNTRACKED_BIT; + } else { + state = NF_CT_STATE_BIT(ctinfo); + } + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) + return -1; + + if (ct) { + u32 id = ct_hook->get_id(&ct->ct_general); + u32 status = READ_ONCE(ct->status); + u8 dir = CTINFO2DIR(ctinfo); + + if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir)) + return -1; + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) + return -1; + + /* Kernel implementation detail, withhold this from userspace for now */ + status &= ~IPS_NAT_CLASH; + + if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) + return -1; + } + + return 0; +} + static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { @@ -210,7 +257,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(sizeof(u32)) + /* ct id */ + nla_total_size(sizeof(u8)) + /* ct direction */ + nla_total_size(sizeof(u32)) + /* ct state */ + nla_total_size(sizeof(u32)) + /* ct status */ + nla_total_size(sizeof(u32)) + /* trace id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + @@ -291,6 +342,10 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, if (nf_trace_fill_pkt_info(skb, pkt)) goto nla_put_failure; + + if (nf_trace_fill_ct_info(skb, pkt->skb)) + goto nla_put_failure; + info->packet_dumped = true; } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index eab4f476b47fc..38d75484e531c 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -461,11 +461,6 @@ static int cttimeout_default_get(struct sk_buff *skb, case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(info->net)->timeouts; break; - case IPPROTO_DCCP: -#ifdef CONFIG_NF_CT_PROTO_DCCP - timeouts = nf_dccp_pernet(info->net)->dccp_timeout; -#endif - break; case IPPROTO_ICMPV6: timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index ade8ee1988b1e..92d869317cba5 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -109,13 +109,30 @@ cancel_nest: return -EMSGSIZE; } +static int nfnl_hook_put_nft_info_desc(struct sk_buff *nlskb, const char *tname, + const char *name, u8 family) +{ + struct nlattr *nest; + + nest = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); + if (!nest || + nla_put_string(nlskb, NFNLA_CHAIN_TABLE, tname) || + nla_put_string(nlskb, NFNLA_CHAIN_NAME, name) || + nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, family)) { + nla_nest_cancel(nlskb, nest); + return -EMSGSIZE; + } + nla_nest_end(nlskb, nest); + return 0; +} + static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, const struct nfnl_dump_hook_data *ctx, unsigned int seq, struct nft_chain *chain) { struct net *net = 
sock_net(nlskb->sk); - struct nlattr *nest, *nest2; + struct nlattr *nest; int ret = 0; if (WARN_ON_ONCE(!chain)) @@ -128,29 +145,47 @@ static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, if (!nest) return -EMSGSIZE; - nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC); - if (!nest2) - goto cancel_nest; + ret = nfnl_hook_put_nft_info_desc(nlskb, chain->table->name, + chain->name, chain->table->family); + if (ret) { + nla_nest_cancel(nlskb, nest); + return ret; + } - ret = nla_put_string(nlskb, NFNLA_CHAIN_TABLE, chain->table->name); - if (ret) - goto cancel_nest; + nla_nest_end(nlskb, nest); + return 0; +} - ret = nla_put_string(nlskb, NFNLA_CHAIN_NAME, chain->name); - if (ret) - goto cancel_nest; +static int nfnl_hook_put_nft_ft_info(struct sk_buff *nlskb, + const struct nfnl_dump_hook_data *ctx, + unsigned int seq, + struct nf_flowtable *nf_ft) +{ + struct nft_flowtable *ft = + container_of(nf_ft, struct nft_flowtable, data); + struct net *net = sock_net(nlskb->sk); + struct nlattr *nest; + int ret = 0; - ret = nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, chain->table->family); - if (ret) - goto cancel_nest; + if (WARN_ON_ONCE(!nf_ft)) + return 0; - nla_nest_end(nlskb, nest2); - nla_nest_end(nlskb, nest); - return ret; + if (!nft_is_active(net, ft)) + return 0; -cancel_nest: - nla_nest_cancel(nlskb, nest); - return -EMSGSIZE; + nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFT_FLOWTABLE); + if (!nest) + return -EMSGSIZE; + + ret = nfnl_hook_put_nft_info_desc(nlskb, ft->table->name, + ft->name, ft->table->family); + if (ret) { + nla_nest_cancel(nlskb, nest); + return ret; + } + + nla_nest_end(nlskb, nest); + return 0; } static int nfnl_hook_dump_one(struct sk_buff *nlskb, @@ -220,6 +255,9 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb, case NF_HOOK_OP_BPF: ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv); break; + case NF_HOOK_OP_NFT_FT: + ret = nfnl_hook_put_nft_ft_info(nlskb, ctx, seq, ops->priv); + break; case NF_HOOK_OP_UNDEFINED: break; default: diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 882962f3c84db..bfcb9cd335bff 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -390,7 +390,7 @@ __nfulnl_flush(struct nfulnl_instance *inst) static void nfulnl_timer(struct timer_list *t) { - struct nfulnl_instance *inst = from_timer(inst, t, timer); + struct nfulnl_instance *inst = timer_container_of(inst, t, timer); spin_lock_bh(&inst->lock); if (inst->skb) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 19a553550c769..b16185e9a6dd7 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -318,38 +318,66 @@ static const struct nft_chain_type nft_chain_filter_netdev = { }, }; -static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_base_chain *basechain) +static int nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_base_chain *basechain, bool changename) { + struct nft_table *table = basechain->chain.table; + struct nf_hook_ops *ops; struct nft_hook *hook; + bool match; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) - continue; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); + + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + if (!(table->flags & NFT_TABLE_F_DORMANT)) + 
nf_unregister_net_hook(dev_net(dev), ops); - if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT)) - nf_unregister_net_hook(dev_net(dev), &hook->ops); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + ops = kmemdup(&basechain->ops, + sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->dev = dev; + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + nf_register_net_hook(dev_net(dev), ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } break; } + return 0; } -static int nf_tables_netdev_event(struct notifier_block *this, - unsigned long event, void *ptr) +static int __nf_tables_netdev_event(unsigned long event, + struct net_device *dev, + bool changename) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_base_chain *basechain; struct nftables_pernet *nft_net; struct nft_chain *chain; struct nft_table *table; - if (event != NETDEV_UNREGISTER) - return NOTIFY_DONE; - nft_net = nft_pernet(dev_net(dev)); - mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV && table->family != NFPROTO_INET) @@ -364,12 +392,40 @@ static int nf_tables_netdev_event(struct notifier_block *this, basechain->ops.hooknum != NF_INET_INGRESS) continue; - nft_netdev_event(event, dev, basechain); + if (nft_netdev_event(event, dev, basechain, changename)) + return 1; } } - mutex_unlock(&nft_net->commit_mutex); + return 0; +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; + int ret = NOTIFY_DONE; - return NOTIFY_DONE; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; + + nft_net = nft_pernet(dev_net(dev)); + mutex_lock(&nft_net->commit_mutex); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; + } + __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_netdev_event(event, dev, false)) { + ret = NOTIFY_BAD; + } +out_unlock: + mutex_unlock(&nft_net->commit_mutex); + return ret; } static struct notifier_block nf_tables_netdev_notifier = { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 88922e0e8e837..7807d81296646 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -44,9 +44,9 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, return 0; } -static struct nft_elem_priv *nft_dynset_new(struct nft_set *set, - const struct nft_expr *expr, - struct nft_regs *regs) +struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs) { const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set_ext *ext; @@ -91,8 +91,8 @@ void nft_dynset_eval(const struct nft_expr *expr, return; } - if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, - expr, regs, &ext)) { + ext = set->ops->update(set, ®s->data[priv->sreg_key], expr, regs); + if (ext) { if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) { diff --git 
a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index c74012c991255..7eedf4e3ae9c7 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -407,6 +407,7 @@ err: regs->verdict.code = NFT_BREAK; } +#ifdef CONFIG_NFT_EXTHDR_DCCP static void nft_exthdr_dccp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -482,6 +483,7 @@ static void nft_exthdr_dccp_eval(const struct nft_expr *expr, err: *dest = 0; } +#endif static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, @@ -634,6 +636,7 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, return 0; } +#ifdef CONFIG_NFT_EXTHDR_DCCP static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -649,6 +652,7 @@ static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, return 0; } +#endif static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { @@ -779,6 +783,7 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = { .reduce = nft_exthdr_reduce, }; +#ifdef CONFIG_NFT_EXTHDR_DCCP static const struct nft_expr_ops nft_exthdr_dccp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -787,6 +792,7 @@ static const struct nft_expr_ops nft_exthdr_dccp_ops = { .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; +#endif static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, @@ -822,10 +828,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_sctp_ops; break; +#ifdef CONFIG_NFT_EXTHDR_DCCP case NFT_EXTHDR_OP_DCCP: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_dccp_ops; break; +#endif } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 221d502230181..225ff293cd500 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -175,7 +175,7 @@ static bool nft_flowtable_find_dev(const struct net_device *dev, bool found = false; list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops_rcu(hook, dev)) continue; found = true; diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 817ab978d24a1..c4569d4b92285 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -23,7 +23,14 @@ #include <linux/ip.h> #include <linux/ipv6.h> -static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); +struct nft_inner_tun_ctx_locked { + struct nft_inner_tun_ctx ctx; + local_lock_t bh_lock; +}; + +static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /* Same layout as nft_expr but it embeds the private expression data area. 
*/ struct __nft_expr { @@ -237,12 +244,15 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); - this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { local_bh_enable(); + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); return false; } *tun_ctx = *this_cpu_tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); local_bh_enable(); return true; @@ -254,9 +264,11 @@ static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); - this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); if (this_cpu_tun_ctx->cookie != tun_ctx->cookie) *this_cpu_tun_ctx = *tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); local_bh_enable(); } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 63ef832b8aa71..40c602ffbcba7 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -25,32 +25,33 @@ struct nft_lookup { }; #ifdef CONFIG_MITIGATION_RETPOLINE -bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { if (set->ops == &nft_set_hash_fast_type.ops) - return nft_hash_lookup_fast(net, set, key, ext); + return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) - return nft_hash_lookup(net, set, key, ext); + return nft_hash_lookup(net, set, key); if (set->ops == &nft_set_rhash_type.ops) - return nft_rhash_lookup(net, set, key, ext); + return nft_rhash_lookup(net, set, key); if (set->ops == &nft_set_bitmap_type.ops) - return nft_bitmap_lookup(net, set, key, ext); + return nft_bitmap_lookup(net, set, key); if (set->ops == &nft_set_pipapo_type.ops) - return nft_pipapo_lookup(net, set, key, ext); + return nft_pipapo_lookup(net, set, key); #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) if (set->ops == &nft_set_pipapo_avx2_type.ops) - return nft_pipapo_avx2_lookup(net, set, key, ext); + return nft_pipapo_avx2_lookup(net, set, key); #endif if (set->ops == &nft_set_rbtree_type.ops) - return nft_rbtree_lookup(net, set, key, ext); + return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); - return set->ops->lookup(net, set, key, ext); + return set->ops->lookup(net, set, key); } EXPORT_SYMBOL_GPL(nft_set_do_lookup); #endif @@ -61,12 +62,12 @@ void nft_lookup_eval(const struct nft_expr *expr, { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; - const struct nft_set_ext *ext = NULL; const struct net *net = nft_net(pkt); + const struct nft_set_ext *ext; bool found; - found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext) ^ - priv->invert; + ext = nft_set_do_lookup(net, set, ®s->data[priv->sreg]); + found = !!ext ^ priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 09da7a3f9f967..8ee66a86c3bc7 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -111,10 +111,9 @@ void nft_objref_map_eval(const struct nft_expr *expr, struct net *net = nft_net(pkt); const struct 
nft_set_ext *ext; struct nft_object *obj; - bool found; - found = nft_set_do_lookup(net, set, ®s->data[priv->sreg], &ext); - if (!found) { + ext = nft_set_do_lookup(net, set, ®s->data[priv->sreg]); + if (!ext) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 9b2d7463d3d32..df0798da2329b 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -19,10 +19,16 @@ struct nft_quota { }; static inline bool nft_overquota(struct nft_quota *priv, - const struct sk_buff *skb) + const struct sk_buff *skb, + bool *report) { - return atomic64_add_return(skb->len, priv->consumed) >= - atomic64_read(&priv->quota); + u64 consumed = atomic64_add_return(skb->len, priv->consumed); + u64 quota = atomic64_read(&priv->quota); + + if (report) + *report = consumed >= quota; + + return consumed > quota; } static inline bool nft_quota_invert(struct nft_quota *priv) @@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv)) + if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; } @@ -51,13 +57,13 @@ static void nft_quota_obj_eval(struct nft_object *obj, const struct nft_pktinfo *pkt) { struct nft_quota *priv = nft_obj_data(obj); - bool overquota; + bool overquota, report; - overquota = nft_overquota(priv, pkt->skb); + overquota = nft_overquota(priv, pkt->skb, &report); if (overquota ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; - if (overquota && + if (report && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 12390d2e994fc..c24c922f895d8 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -75,16 +75,21 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) } INDIRECT_CALLABLE_SCOPE -bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { const struct nft_bitmap *priv = nft_set_priv(set); + static const struct nft_set_ext found; u8 genmask = nft_genmask_cur(net); u32 idx, off; nft_bitmap_location(set, key, &idx, &off); - return nft_bitmap_active(priv->bitmap, idx, off, genmask); + if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) + return &found; + + return NULL; } static struct nft_bitmap_elem * diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index abb0c8ec63719..266d0c637225c 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -81,8 +81,9 @@ static const struct rhashtable_params nft_rhash_params = { }; INDIRECT_CALLABLE_SCOPE -bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_rhash *priv = nft_set_priv(set); const struct nft_rhash_elem *he; @@ -95,9 +96,9 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) - *ext = &he->ext; + 
return &he->ext; - return !!he; + return NULL; } static struct nft_elem_priv * @@ -120,14 +121,9 @@ nft_rhash_get(const struct net *net, const struct nft_set *set, return ERR_PTR(-ENOENT); } -static bool nft_rhash_update(struct nft_set *set, const u32 *key, - struct nft_elem_priv * - (*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *regs), - const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext) +static const struct nft_set_ext * +nft_rhash_update(struct nft_set *set, const u32 *key, + const struct nft_expr *expr, struct nft_regs *regs) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he, *prev; @@ -143,7 +139,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, if (he != NULL) goto out; - elem_priv = new(set, expr, regs); + elem_priv = nft_dynset_new(set, expr, regs); if (!elem_priv) goto err1; @@ -161,14 +157,13 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, } out: - *ext = &he->ext; - return true; + return &he->ext; err2: nft_set_elem_destroy(set, &he->priv, true); atomic_dec(&set->nelems); err1: - return false; + return NULL; } static int nft_rhash_insert(const struct net *net, const struct nft_set *set, @@ -507,8 +502,9 @@ struct nft_hash_elem { }; INDIRECT_CALLABLE_SCOPE -bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -519,12 +515,10 @@ bool nft_hash_lookup(const struct net *net, const struct nft_set *set, hash = reciprocal_scale(hash, priv->buckets); hlist_for_each_entry_rcu(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) && - nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; - return true; - } + nft_set_elem_active(&he->ext, genmask)) + return &he->ext; } - return false; + return NULL; } static struct nft_elem_priv * @@ -547,9 +541,9 @@ nft_hash_get(const struct net *net, const struct nft_set *set, } INDIRECT_CALLABLE_SCOPE -bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -562,12 +556,10 @@ bool nft_hash_lookup_fast(const struct net *net, hlist_for_each_entry_rcu(he, &priv->table[hash], node) { k2 = *(u32 *)nft_set_ext_key(&he->ext)->data; if (k1 == k2 && - nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; - return true; - } + nft_set_elem_active(&he->ext, genmask)) + return &he->ext; } - return false; + return NULL; } static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 7be342b495f5f..1a19649c28511 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -397,34 +397,36 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, } /** - * nft_pipapo_lookup() - Lookup function - * @net: Network namespace - * @set: nftables API set representation - * @key: nftables API element representation containing key data - * @ext: nftables API extension pointer, filled with matching reference + * pipapo_get() - Get matching 
element reference given key data + * @m: storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: timestamp to check for expired elements * * For more details, see DOC: Theory of Operation. * - * Return: true on match, false otherwise. + * This is the main lookup function. It matches key data against either + * the working match set or the uncommitted copy, depending on what the + * caller passed to us. + * nft_pipapo_get (lookup from userspace/control plane) and nft_pipapo_lookup + * (datapath lookup) pass the active copy. + * The insertion path will pass the uncommitted working copy. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; unsigned long *res_map, *fill_map; - u8 genmask = nft_genmask_cur(net); - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; - const u8 *rp = (const u8 *)key; bool map_index; int i; local_bh_disable(); - m = rcu_dereference(priv->match); - - if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + if (unlikely(!raw_cpu_ptr(m->scratch))) goto out; scratch = *raw_cpu_ptr(m->scratch); @@ -444,12 +446,12 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, * packet bytes value, then AND bucket value */ if (likely(f->bb == 8)) - pipapo_and_field_buckets_8bit(f, res_map, rp); + pipapo_and_field_buckets_8bit(f, res_map, data); else - pipapo_and_field_buckets_4bit(f, res_map, rp); + pipapo_and_field_buckets_4bit(f, res_map, data); NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; - rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); + data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' @@ -465,13 +467,15 @@ next_match: scratch->map_index = map_index; local_bh_enable(); - return false; + return NULL; } if (last) { - *ext = &f->mt[b].e->ext; - if (unlikely(nft_set_elem_expired(*ext) || - !nft_set_elem_active(*ext, genmask))) + struct nft_pipapo_elem *e; + + e = f->mt[b].e; + if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || + !nft_set_elem_active(&e->ext, genmask))) goto next_match; /* Last field: we're just returning the key without @@ -481,8 +485,7 @@ next_match: */ scratch->map_index = map_index; local_bh_enable(); - - return true; + return e; } /* Swap bitmap indices: res_map is the initial bitmap for the @@ -492,112 +495,38 @@ next_match: map_index = !map_index; swap(res_map, fill_map); - rp += NFT_PIPAPO_GROUPS_PADDING(f); + data += NFT_PIPAPO_GROUPS_PADDING(f); } out: local_bh_enable(); - return false; + return NULL; } /** - * pipapo_get() - Get matching element reference given key data + * nft_pipapo_lookup() - Dataplane fronted for main lookup function * @net: Network namespace * @set: nftables API set representation - * @m: storage containing active/existing elements - * @data: Key data to be matched against existing elements - * @genmask: If set, check that element is active in given genmask - * @tstamp: timestamp to check for expired elements - * @gfp: the type of memory to allocate (see kmalloc). 
+ * @key: pointer to nft registers containing key data * - * This is essentially the same as the lookup function, except that it matches - * key data against the uncommitted copy and doesn't use preallocated maps for - * bitmap results. + * This function is called from the data path. It will search for + * an element matching the given key in the current active copy. * - * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + * Return: nftables API extension pointer or NULL if no match. */ -static struct nft_pipapo_elem *pipapo_get(const struct net *net, - const struct nft_set *set, - const struct nft_pipapo_match *m, - const u8 *data, u8 genmask, - u64 tstamp, gfp_t gfp) +const struct nft_set_ext * +nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { - struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); - unsigned long *res_map, *fill_map = NULL; - const struct nft_pipapo_field *f; - int i; - - if (m->bsize_max == 0) - return ret; - - res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), gfp); - if (!res_map) { - ret = ERR_PTR(-ENOMEM); - goto out; - } - - fill_map = kcalloc(m->bsize_max, sizeof(*res_map), gfp); - if (!fill_map) { - ret = ERR_PTR(-ENOMEM); - goto out; - } - - pipapo_resmap_init(m, res_map); - - nft_pipapo_for_each_field(f, i, m) { - bool last = i == m->field_count - 1; - int b; - - /* For each bit group: select lookup table bucket depending on - * packet bytes value, then AND bucket value - */ - if (f->bb == 8) - pipapo_and_field_buckets_8bit(f, res_map, data); - else if (f->bb == 4) - pipapo_and_field_buckets_4bit(f, res_map, data); - else - BUG(); - - data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); - - /* Now populate the bitmap for the next field, unless this is - * the last field, in which case return the matched 'ext' - * pointer if any. - * - * Now res_map contains the matching bitmap, and fill_map is the - * bitmap for the next field. - */ -next_match: - b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, - last); - if (b < 0) - goto out; - - if (last) { - if (__nft_set_elem_expired(&f->mt[b].e->ext, tstamp)) - goto next_match; - if ((genmask && - !nft_set_elem_active(&f->mt[b].e->ext, genmask))) - goto next_match; - - ret = f->mt[b].e; - goto out; - } - - data += NFT_PIPAPO_GROUPS_PADDING(f); + struct nft_pipapo *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + const struct nft_pipapo_match *m; + const struct nft_pipapo_elem *e; - /* Swap bitmap indices: fill_map will be the initial bitmap for - * the next field (i.e. the new res_map), and res_map is - * guaranteed to be all-zeroes at this point, ready to be filled - * according to the next mapping table. - */ - swap(res_map, fill_map); - } + m = rcu_dereference(priv->match); + e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64()); -out: - kfree(fill_map); - kfree(res_map); - return ret; + return e ? &e->ext : NULL; } /** @@ -606,6 +535,11 @@ out: * @set: nftables API set representation * @elem: nftables API element representation containing key data * @flags: Unused + * + * This function is called from the control plane path under + * RCU read lock. + * + * Return: set element private pointer or ERR_PTR(-ENOENT).
*/ static struct nft_elem_priv * nft_pipapo_get(const struct net *net, const struct nft_set *set, @@ -615,11 +549,10 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set, struct nft_pipapo_match *m = rcu_dereference(priv->match); struct nft_pipapo_elem *e; - e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, - nft_genmask_cur(net), get_jiffies_64(), - GFP_ATOMIC); - if (IS_ERR(e)) - return ERR_CAST(e); + e = pipapo_get(m, (const u8 *)elem->key.val.data, + nft_genmask_cur(net), get_jiffies_64()); + if (!e) + return ERR_PTR(-ENOENT); return &e->priv; } @@ -663,6 +596,9 @@ static int pipapo_realloc_mt(struct nft_pipapo_field *f, check_add_overflow(rules, extra, &rules_alloc)) return -EOVERFLOW; + if (rules_alloc > (INT_MAX / sizeof(*new_mt))) + return -ENOMEM; + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); if (!new_mt) return -ENOMEM; @@ -683,6 +619,30 @@ out_free: return 0; } + +/** + * lt_calculate_size() - Get storage size for lookup table with overflow check + * @groups: Amount of bit groups + * @bb: Number of bits grouped together in lookup table buckets + * @bsize: Size of each bucket in lookup table, in longs + * + * Return: allocation size including alignment overhead, negative on overflow + */ +static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb, + unsigned int bsize) +{ + ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long); + + if (check_mul_overflow(ret, bsize, &ret)) + return -1; + if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret)) + return -1; + if (ret > INT_MAX) + return -1; + + return ret; +} + /** * pipapo_resize() - Resize lookup or mapping table, or both * @f: Field containing lookup and mapping tables @@ -701,6 +661,7 @@ static int pipapo_resize(struct nft_pipapo_field *f, long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; unsigned int new_bucket_size, copy; int group, bucket, err; + ssize_t lt_size; if (rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; @@ -719,10 +680,11 @@ static int pipapo_resize(struct nft_pipapo_field *f, else copy = new_bucket_size; - new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * - new_bucket_size * sizeof(*new_lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size); + if (lt_size < 0) + return -ENOMEM; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return -ENOMEM; @@ -907,7 +869,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) { unsigned int groups, bb; unsigned long *new_lt; - size_t lt_size; + ssize_t lt_size; lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * sizeof(*f->lt); @@ -917,15 +879,17 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) groups = f->groups * 2; bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && lt_size < NFT_PIPAPO_LT_SIZE_LOW) { groups = f->groups / 2; bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; /* Don't increase group width if the resulting lookup table size * would exceed the upper size threshold for a "small" set. 
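The pipapo hunks in this file replace open-coded lookup-table size arithmetic with lt_calculate_size(), which returns a negative value as soon as an intermediate multiplication or addition wraps, or once the total (including alignment headroom) exceeds INT_MAX, so resize and clone fail with -ENOMEM instead of handing a wrapped size to kvzalloc(). A minimal standalone sketch of the same pattern, using the GCC/Clang builtins that the kernel's check_mul_overflow()/check_add_overflow() helpers wrap; the demo_* names and the 32-byte headroom constant are illustrative assumptions, not taken from the patch:

	/* demo.c - illustrative only; build with: cc -o demo demo.c */
	#include <limits.h>
	#include <stdio.h>
	#include <sys/types.h>	/* ssize_t */

	#define DEMO_ALIGN_HEADROOM 32	/* hypothetical stand-in for NFT_PIPAPO_ALIGN_HEADROOM */

	/* Overflow-checked size calculation: a wrapped multiplication, a wrapped
	 * addition, or a result above INT_MAX all yield -1 instead of a bogus size.
	 */
	static ssize_t demo_lt_calculate_size(unsigned int groups, unsigned int buckets,
					      unsigned int bsize)
	{
		ssize_t ret = (ssize_t)groups * buckets * (ssize_t)sizeof(long);

		if (__builtin_mul_overflow(ret, (ssize_t)bsize, &ret))
			return -1;
		if (__builtin_add_overflow(ret, (ssize_t)DEMO_ALIGN_HEADROOM, &ret))
			return -1;
		if (ret > INT_MAX)
			return -1;

		return ret;
	}

	int main(void)
	{
		/* A plausible request succeeds; an oversized one is rejected, not wrapped. */
		printf("small request: %zd\n", demo_lt_calculate_size(16, 256, 64));
		printf("huge request:  %zd\n", demo_lt_calculate_size(1024, 256, 4096));
		return 0;
	}

The related INT_MAX / sizeof(*new_mt) cap on rules_alloc before kvmalloc_array() in pipapo_realloc_mt() and pipapo_clone() serves the same purpose: the size passed to the allocator is never the product of silent wraparound.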
@@ -936,7 +900,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) return; } - new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT); + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return; @@ -1188,7 +1152,7 @@ static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int c mem = s; mem -= s->align_off; - kfree(mem); + kvfree(mem); } /** @@ -1209,10 +1173,9 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, void *scratch_aligned; u32 align_off; #endif - scratch = kzalloc_node(struct_size(scratch, map, - bsize_max * 2) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL_ACCOUNT, cpu_to_node(i)); + scratch = kvzalloc_node(struct_size(scratch, map, bsize_max * 2) + + NFT_PIPAPO_ALIGN_HEADROOM, + GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous * allocations: this means that some scratch maps have @@ -1314,8 +1277,8 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, else end = start; - dup = pipapo_get(net, set, m, start, genmask, tstamp, GFP_KERNEL); - if (!IS_ERR(dup)) { + dup = pipapo_get(m, start, genmask, tstamp); + if (dup) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; @@ -1334,15 +1297,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, return -ENOTEMPTY; } - if (PTR_ERR(dup) == -ENOENT) { - /* Look for partially overlapping entries */ - dup = pipapo_get(net, set, m, end, nft_genmask_next(net), tstamp, - GFP_KERNEL); - } - - if (PTR_ERR(dup) != -ENOENT) { - if (IS_ERR(dup)) - return PTR_ERR(dup); + /* Look for partially overlapping entries */ + dup = pipapo_get(m, end, nft_genmask_next(net), tstamp); + if (dup) { *elem_priv = &dup->priv; return -ENOTEMPTY; } @@ -1451,13 +1408,15 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) for (i = 0; i < old->field_count; i++) { unsigned long *new_lt; + ssize_t lt_size; memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); - new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * - src->bsize * sizeof(*dst->lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL_ACCOUNT); + lt_size = lt_calculate_size(src->groups, src->bb, src->bsize); + if (lt_size < 0) + goto out_lt; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) goto out_lt; @@ -1469,6 +1428,9 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) src->groups * NFT_PIPAPO_BUCKETS(src->bb)); if (src->rules > 0) { + if (src->rules_alloc > (INT_MAX / sizeof(*src->mt))) + goto out_mt; + dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL_ACCOUNT); @@ -1878,9 +1840,9 @@ nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, if (!m) return NULL; - e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, - nft_genmask_next(net), nft_net_tstamp(net), GFP_KERNEL); - if (IS_ERR(e)) + e = pipapo_get(m, (const u8 *)elem->key.val.data, + nft_genmask_next(net), nft_net_tstamp(net)); + if (!e) return NULL; nft_set_elem_change_active(net, set, &e->ext); diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index c15db28c5ebc4..db5d367e43c46 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1114,11 +1114,29 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, } /** + * pipapo_resmap_init_avx2() - Initialise result map before first use + * @m: Matching data, 
including mapping table + * @res_map: Result map + * + * Like pipapo_resmap_init() but do not set start map bits covered by the first field. + */ +static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + /* Starting map doesn't need to be set to all-ones for this implementation, + * but we do need to zero the remaining bits, if any. + */ + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} + +/** * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation * @key: nftables API element representation containing key data - * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. * @@ -1127,8 +1145,9 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, * * Return: true on match, false otherwise. */ -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; @@ -1136,17 +1155,18 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; + const struct nft_set_ext *ext; unsigned long *res, *fill; bool map_index; - int i, ret = 0; + int i; local_bh_disable(); if (unlikely(!irq_fpu_usable())) { - bool fallback_res = nft_pipapo_lookup(net, set, key, ext); + ext = nft_pipapo_lookup(net, set, key); local_bh_enable(); - return fallback_res; + return ext; } m = rcu_dereference(priv->match); @@ -1163,7 +1183,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, if (unlikely(!scratch)) { kernel_fpu_end(); local_bh_enable(); - return false; + return NULL; } map_index = scratch->map_index; @@ -1171,13 +1191,14 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, res = scratch->map + (map_index ? m->bsize_max : 0); fill = scratch->map + (map_index ? 
0 : m->bsize_max); - /* Starting map doesn't need to be set for this implementation */ + pipapo_resmap_init_avx2(m, res); nft_pipapo_avx2_prepare(); next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; + int ret = 0; #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ @@ -1225,10 +1246,10 @@ next_match: goto out; if (last) { - *ext = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(*ext) || - !nft_set_elem_active(*ext, genmask))) { - ret = 0; + ext = &f->mt[ret].e->ext; + if (unlikely(nft_set_elem_expired(ext) || + !nft_set_elem_active(ext, genmask))) { + ext = NULL; goto next_match; } @@ -1245,5 +1266,5 @@ out: kernel_fpu_end(); local_bh_enable(); - return ret >= 0; + return ext; } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2e8ef16ff191d..938a257c069e2 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -52,9 +52,9 @@ static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) return nft_set_elem_expired(&rbe->ext); } -static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext, - unsigned int seq) +static const struct nft_set_ext * +__nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, unsigned int seq) { struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; @@ -65,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(priv->root.rb_node); while (parent != NULL) { if (read_seqcount_retry(&priv->count, seq)) - return false; + return NULL; rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -87,50 +87,48 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set } if (nft_rbtree_elem_expired(rbe)) - return false; + return NULL; if (nft_rbtree_interval_end(rbe)) { if (nft_set_is_anonymous(set)) - return false; + return NULL; parent = rcu_dereference_raw(parent->rb_left); interval = NULL; continue; } - *ext = &rbe->ext; - return true; + return &rbe->ext; } } if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && !nft_rbtree_elem_expired(interval) && - nft_rbtree_interval_start(interval)) { - *ext = &interval->ext; - return true; - } + nft_rbtree_interval_start(interval)) + return &interval->ext; - return false; + return NULL; } INDIRECT_CALLABLE_SCOPE -bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); - bool ret; + const struct nft_set_ext *ext; - ret = __nft_rbtree_lookup(net, set, key, ext, seq); - if (ret || !read_seqcount_retry(&priv->count, seq)) - return ret; + ext = __nft_rbtree_lookup(net, set, key, seq); + if (ext || !read_seqcount_retry(&priv->count, seq)) + return ext; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); - ret = __nft_rbtree_lookup(net, set, key, ext, seq); + ext = __nft_rbtree_lookup(net, set, key, seq); read_unlock_bh(&priv->lock); - return ret; + return ext; } static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, diff --git a/net/netfilter/nft_tunnel.c 
b/net/netfilter/nft_tunnel.c index 0c63d1367cf7a..a12486ae089d6 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -621,10 +621,10 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct geneve_opt *opt; int offset = 0; - inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); - if (!inner) - goto failure; while (opts->len > offset) { + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); + if (!inner) + goto failure; opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || @@ -634,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, opt->length * 4, opt->opt_data)) goto inner_failure; offset += sizeof(*opt) + opt->length * 4; + nla_nest_end(skb, inner); } - nla_nest_end(skb, inner); } nla_nest_end(skb, nest); return 0; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 709840612f0df..90b7630421c44 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1317,12 +1317,13 @@ void xt_compat_unlock(u_int8_t af) EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif -DEFINE_PER_CPU(seqcount_t, xt_recseq); -EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); - struct static_key xt_tee_enabled __read_mostly; EXPORT_SYMBOL_GPL(xt_tee_enabled); +#ifdef CONFIG_NETFILTER_XTABLES_LEGACY +DEFINE_PER_CPU(seqcount_t, xt_recseq); +EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); + static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; @@ -1514,6 +1515,7 @@ void *xt_unregister_table(struct xt_table *table) return private; } EXPORT_SYMBOL_GPL(xt_unregister_table); +#endif #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) @@ -1897,6 +1899,7 @@ void xt_proto_fini(struct net *net, u_int8_t af) } EXPORT_SYMBOL_GPL(xt_proto_fini); +#ifdef CONFIG_NETFILTER_XTABLES_LEGACY /** * xt_percpu_counter_alloc - allocate x_tables rule counter * @@ -1951,6 +1954,7 @@ void xt_percpu_counter_free(struct xt_counters *counters) free_percpu((void __percpu *)pcnt); } EXPORT_SYMBOL_GPL(xt_percpu_counter_free); +#endif static int __net_init xt_net_init(struct net *net) { @@ -1983,8 +1987,10 @@ static int __init xt_init(void) unsigned int i; int rv; - for_each_possible_cpu(i) { - seqcount_init(&per_cpu(xt_recseq, i)); + if (IS_ENABLED(CONFIG_NETFILTER_XTABLES_LEGACY)) { + for_each_possible_cpu(i) { + seqcount_init(&per_cpu(xt_recseq, i)); + } } xt = kcalloc(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 9f54819eb52ca..d73957592c9d9 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -100,7 +100,7 @@ static void idletimer_tg_work(struct work_struct *work) static void idletimer_tg_expired(struct timer_list *t) { - struct idletimer_tg *timer = from_timer(timer, t, timer); + struct idletimer_tg *timer = timer_container_of(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); @@ -168,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return 0; @@ -229,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + 
jiffies); } return 0; @@ -254,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, info->label, info->timeout); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } @@ -275,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; @@ -320,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -382,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) } } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 8a80fd76fe45b..90dcf088071a6 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -72,8 +72,9 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par) static void led_timeout_callback(struct timer_list *t) { - struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t, - timer); + struct xt_led_info_internal *ledinternal = timer_container_of(ledinternal, + t, + timer); led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF); } diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 30e99464171b7..93f064306901c 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_tcpoptstrip_target_info), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TCPOPTSTRIP", .family = NFPROTO_IPV6, diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index c0f5e9a4f3c65..c437fbd59ec13 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); +#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n" + static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v0 *info = par->matchinfo; @@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) if (info->invert & ~1) return -EINVAL; + if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + return 0; } @@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + 
pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -83,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -100,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { +#ifdef CONFIG_CGROUP_NET_CLASSID const struct xt_cgroup_info_v0 *info = par->matchinfo; struct sock *sk = skb->sk; @@ -108,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ info->invert; +#endif + return false; } static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) @@ -123,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) @@ -141,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 65b965ca40ea7..59b9d04400cac 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -48,7 +48,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) +#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP) { .name = "MARK", .revision = 2, diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 7c6bf1c168131..0ca1cdfc4095b 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -38,8 +38,8 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par) nfacct = nfnl_acct_find_get(par->net, info->name); if (nfacct == NULL) { - pr_info_ratelimited("accounting object `%s' does not exists\n", - info->name); + pr_info_ratelimited("accounting object `%.*s' does not exist\n", + NFACCT_NAME_MAX, info->name); return -ENOENT; } info->nfacct = nfacct; diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index cd9160bbc9197..33b77084a4e5f 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1165,6 +1165,11 @@ int netlbl_conn_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + if (sk->sk_family != AF_INET6) { + ret_val = -EAFNOSUPPORT; + goto conn_setattr_return; + } + addr6 = (struct sockaddr_in6 *)addr; entry = netlbl_domhsh_getentry_af6(secattr->domain, &addr6->sin6_addr); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e8972a857e51e..e2f7080dd5d7c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -387,7 +387,6 @@ static void 
netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) WARN_ON(skb->sk != NULL); skb->sk = sk; skb->destructor = netlink_skb_destructor; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } @@ -1212,41 +1211,48 @@ struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { + DECLARE_WAITQUEUE(wait, current); struct netlink_sock *nlk; + unsigned int rmem; nlk = nlk_sk(sk); + rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); - if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_S_CONGESTED, &nlk->state))) { - DECLARE_WAITQUEUE(wait, current); - if (!*timeo) { - if (!ssk || netlink_is_kernel(ssk)) - netlink_overrun(sk); - sock_put(sk); - kfree_skb(skb); - return -EAGAIN; - } - - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&nlk->wait, &wait); + if ((rmem == skb->truesize || rmem <= READ_ONCE(sk->sk_rcvbuf)) && + !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { + netlink_skb_set_owner_r(skb, sk); + return 0; + } - if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_S_CONGESTED, &nlk->state)) && - !sock_flag(sk, SOCK_DEAD)) - *timeo = schedule_timeout(*timeo); + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nlk->wait, &wait); + if (!*timeo) { + if (!ssk || netlink_is_kernel(ssk)) + netlink_overrun(sk); sock_put(sk); + kfree_skb(skb); + return -EAGAIN; + } - if (signal_pending(current)) { - kfree_skb(skb); - return sock_intr_errno(*timeo); - } - return 1; + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&nlk->wait, &wait); + rmem = atomic_read(&sk->sk_rmem_alloc); + + if (((rmem && rmem + skb->truesize > READ_ONCE(sk->sk_rcvbuf)) || + test_bit(NETLINK_S_CONGESTED, &nlk->state)) && + !sock_flag(sk, SOCK_DEAD)) + *timeo = schedule_timeout(*timeo); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nlk->wait, &wait); + sock_put(sk); + + if (signal_pending(current)) { + kfree_skb(skb); + return sock_intr_errno(*timeo); } - netlink_skb_set_owner_r(skb, sk); - return 0; + + return 1; } static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) @@ -1307,6 +1313,7 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; netlink_deliver_tap_kernel(sk, ssk, skb); @@ -1383,13 +1390,19 @@ EXPORT_SYMBOL_GPL(netlink_strict_get_check); static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); + unsigned int rmem, rcvbuf; - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + + if ((rmem == skb->truesize || rmem <= rcvbuf) && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); - return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); + return rmem > (rcvbuf >> 1); } + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); return -1; } @@ -2245,6 +2258,7 @@ static int netlink_dump(struct sock *sk, bool lock_taken) struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; + unsigned int rmem, rcvbuf; size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; @@ 
-2258,9 +2272,6 @@ static int netlink_dump(struct sock *sk, bool lock_taken) goto errout_skb; } - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - goto errout_skb; - /* NLMSG_GOODSIZE is small to avoid high order allocations being * required, but it makes sense to _attempt_ a 32KiB allocation * to reduce number of system calls on dump operations, if user @@ -2283,6 +2294,13 @@ static int netlink_dump(struct sock *sk, bool lock_taken) if (!skb) goto errout_skb; + rcvbuf = READ_ONCE(sk->sk_rcvbuf); + rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); + if (rmem != skb->truesize && rmem >= rcvbuf) { + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + goto errout_skb; + } + /* Trim skb to allocated size. User is expected to provide buffer as * large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as @@ -2455,7 +2473,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, unsigned int flags = 0; size_t tlvlen; - /* Error messages get the original request appened, unless the user + /* Error messages get the original request appended, unless the user * requests to cap the error message, and get extra error data if * requested. */ @@ -2869,8 +2887,7 @@ static const struct rhashtable_params netlink_rhashtable_params = { }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) -BTF_ID_LIST(btf_netlink_sock_id) -BTF_ID(struct, netlink_sock) +BTF_ID_LIST_SINGLE(btf_netlink_sock_id, struct, netlink_sock) static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, diff --git a/net/netlink/policy.c b/net/netlink/policy.c index 1f8909c16f14c..99458da6be322 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -311,6 +311,8 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, NL_POLICY_TYPE_ATTR_PAD)) goto nla_put_failure; break; + } else if (pt->validation_type == NLA_VALIDATE_FUNCTION) { + break; } nla_get_range_unsigned(pt, &range); @@ -340,6 +342,9 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, else type = NL_ATTR_TYPE_SINT; + if (pt->validation_type == NLA_VALIDATE_FUNCTION) + break; + nla_get_range_signed(pt, &range); if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 6ee148f0e6d04..3331669d8e33a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -240,7 +240,7 @@ void nr_destroy_socket(struct sock *); */ static void nr_destroy_timer(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); bh_lock_sock(sk); sock_hold(sk); nr_destroy_socket(sk); diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 5e3ca068f04e0..b3a62b1f3a09e 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -111,7 +111,7 @@ int nr_t1timer_running(struct sock *sk) static void nr_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); struct nr_sock *nr = nr_sk(sk); bh_lock_sock(sk); @@ -152,7 +152,7 @@ out: static void nr_t2timer_expiry(struct timer_list *t) { - struct nr_sock *nr = from_timer(nr, t, t2timer); + struct nr_sock *nr = timer_container_of(nr, t, t2timer); struct sock *sk = &nr->sock; bh_lock_sock(sk); @@ -166,7 +166,7 @@ static void nr_t2timer_expiry(struct timer_list *t) static void nr_t4timer_expiry(struct timer_list *t) { - struct nr_sock *nr = 
from_timer(nr, t, t4timer); + struct nr_sock *nr = timer_container_of(nr, t, t4timer); struct sock *sk = &nr->sock; bh_lock_sock(sk); @@ -177,7 +177,7 @@ static void nr_t4timer_expiry(struct timer_list *t) static void nr_idletimer_expiry(struct timer_list *t) { - struct nr_sock *nr = from_timer(nr, t, idletimer); + struct nr_sock *nr = timer_container_of(nr, t, idletimer); struct sock *sk = &nr->sock; bh_lock_sock(sk); @@ -206,7 +206,7 @@ static void nr_idletimer_expiry(struct timer_list *t) static void nr_t1timer_expiry(struct timer_list *t) { - struct nr_sock *nr = from_timer(nr, t, t1timer); + struct nr_sock *nr = timer_container_of(nr, t, t1timer); struct sock *sk = &nr->sock; bh_lock_sock(sk); diff --git a/net/nfc/core.c b/net/nfc/core.c index 75ed8a9146baa..ae1c842f9c642 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1010,7 +1010,7 @@ exit: static void nfc_check_pres_timeout(struct timer_list *t) { - struct nfc_dev *dev = from_timer(dev, t, check_pres_timer); + struct nfc_dev *dev = timer_container_of(dev, t, check_pres_timer); schedule_work(&dev->check_pres_work); } diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index aa493344d93e3..8618d57c23dae 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -441,7 +441,7 @@ exit_noskb: static void nfc_hci_cmd_timeout(struct timer_list *t) { - struct nfc_hci_dev *hdev = from_timer(hdev, t, cmd_timer); + struct nfc_hci_dev *hdev = timer_container_of(hdev, t, cmd_timer); schedule_work(&hdev->msg_tx_work); } diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index ce9c683a3ead7..4fc37894860c9 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -564,14 +564,14 @@ static void llc_shdlc_handle_send_queue(struct llc_shdlc *shdlc) static void llc_shdlc_connect_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = from_timer(shdlc, t, connect_timer); + struct llc_shdlc *shdlc = timer_container_of(shdlc, t, connect_timer); schedule_work(&shdlc->sm_work); } static void llc_shdlc_t1_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = from_timer(shdlc, t, t1_timer); + struct llc_shdlc *shdlc = timer_container_of(shdlc, t, t1_timer); pr_debug("SoftIRQ: need to send ack\n"); @@ -580,7 +580,7 @@ static void llc_shdlc_t1_timeout(struct timer_list *t) static void llc_shdlc_t2_timeout(struct timer_list *t) { - struct llc_shdlc *shdlc = from_timer(shdlc, t, t2_timer); + struct llc_shdlc *shdlc = timer_container_of(shdlc, t, t2_timer); pr_debug("SoftIRQ: need to retransmit\n"); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 27e863f96ed1e..beeb3b4d28cab 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -243,7 +243,8 @@ static void nfc_llcp_timeout_work(struct work_struct *work) static void nfc_llcp_symm_timer(struct timer_list *t) { - struct nfc_llcp_local *local = from_timer(local, t, link_timer); + struct nfc_llcp_local *local = timer_container_of(local, t, + link_timer); pr_err("SYMM timeout\n"); @@ -286,7 +287,8 @@ static void nfc_llcp_sdreq_timeout_work(struct work_struct *work) static void nfc_llcp_sdreq_timer(struct timer_list *t) { - struct nfc_llcp_local *local = from_timer(local, t, sdreq_timer); + struct nfc_llcp_local *local = timer_container_of(local, t, + sdreq_timer); schedule_work(&local->sdreq_timeout_work); } diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 0171bf3c70162..fc921cd2cdffa 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -610,7 +610,7 @@ static int nci_close_device(struct nci_dev *ndev) /* NCI command timer function */ static 
void nci_cmd_timer(struct timer_list *t) { - struct nci_dev *ndev = from_timer(ndev, t, cmd_timer); + struct nci_dev *ndev = timer_container_of(ndev, t, cmd_timer); atomic_set(&ndev->cmd_cnt, 1); queue_work(ndev->cmd_wq, &ndev->cmd_work); @@ -619,7 +619,7 @@ static void nci_cmd_timer(struct timer_list *t) /* NCI data exchange timer function */ static void nci_data_timer(struct timer_list *t) { - struct nci_dev *ndev = from_timer(ndev, t, data_timer); + struct nci_dev *ndev = timer_container_of(ndev, t, data_timer); set_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); queue_work(ndev->rx_wq, &ndev->rx_work); diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index ed1508a9e093e..aab107727f186 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -119,22 +119,22 @@ static int nci_uart_set_driver(struct tty_struct *tty, unsigned int driver) memcpy(nu, nci_uart_drivers[driver], sizeof(struct nci_uart)); nu->tty = tty; - tty->disc_data = nu; skb_queue_head_init(&nu->tx_q); INIT_WORK(&nu->write_work, nci_uart_write_work); spin_lock_init(&nu->rx_lock); ret = nu->ops.open(nu); if (ret) { - tty->disc_data = NULL; kfree(nu); + return ret; } else if (!try_module_get(nu->owner)) { nu->ops.close(nu); - tty->disc_data = NULL; kfree(nu); return -ENOENT; } - return ret; + tty->disc_data = nu; + + return 0; } /* ------ LDISC part ------ */ diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 6a40b8d0350d9..a18e2c503da61 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1192,7 +1192,7 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) continue; uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]); - if (uri == NULL || *uri == 0) + if (*uri == 0) continue; tid = local->sdreq_next_tid++; @@ -1540,10 +1540,6 @@ static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) } apdu = nla_data(info->attrs[NFC_ATTR_SE_APDU]); - if (!apdu) { - rc = -EINVAL; - goto put_dev; - } ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); if (!ctx) { diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 5481bd561eb41..e6aaee92dba48 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -11,8 +11,8 @@ config OPENVSWITCH (!NF_NAT || NF_NAT) && \ (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT))) depends on PSAMPLE || !PSAMPLE - select CRC32 select MPLS + select NET_CRC32C select NET_MPLS_GSO select DST_CACHE select NET_NSH diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2f22ca59586f2..2832e07941971 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -39,56 +39,16 @@ #include "flow_netlink.h" #include "openvswitch_trace.h" -struct deferred_action { - struct sk_buff *skb; - const struct nlattr *actions; - int actions_len; - - /* Store pkt_key clone when creating deferred action. */ - struct sw_flow_key pkt_key; -}; - -#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) -struct ovs_frag_data { - unsigned long dst; - struct vport *vport; - struct ovs_skb_cb cb; - __be16 inner_protocol; - u16 network_offset; /* valid only for MPLS */ - u16 vlan_tci; - __be16 vlan_proto; - unsigned int l2_len; - u8 mac_proto; - u8 l2_data[MAX_L2_LEN]; -}; - -static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); - -#define DEFERRED_ACTION_FIFO_SIZE 10 -#define OVS_RECURSION_LIMIT 5 -#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) -struct action_fifo { - int head; - int tail; - /* Deferred action fifo queue storage. 
*/ - struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; -}; - -struct action_flow_keys { - struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; -}; - -static struct action_fifo __percpu *action_fifos; -static struct action_flow_keys __percpu *flow_keys; -static DEFINE_PER_CPU(int, exec_actions_level); +struct ovs_pcpu_storage __percpu *ovs_pcpu_storage; /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' * space. Return NULL if out of key spaces. */ static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) { - struct action_flow_keys *keys = this_cpu_ptr(flow_keys); - int level = this_cpu_read(exec_actions_level); + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage); + struct action_flow_keys *keys = &ovs_pcpu->flow_keys; + int level = ovs_pcpu->exec_level; struct sw_flow_key *key = NULL; if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { @@ -132,10 +92,9 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, const struct nlattr *actions, const int actions_len) { - struct action_fifo *fifo; + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage->action_fifos); struct deferred_action *da; - fifo = this_cpu_ptr(action_fifos); da = action_fifo_put(fifo); if (da) { da->skb = skb; @@ -794,7 +753,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage->frag_data); struct vport *vport = data->vport; if (skb_cow_head(skb, data->l2_len) < 0) { @@ -846,7 +805,7 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb, unsigned int hlen = skb_network_offset(skb); struct ovs_frag_data *data; - data = this_cpu_ptr(&ovs_frag_data_storage); + data = this_cpu_ptr(&ovs_pcpu_storage->frag_data); data->dst = skb->_skb_refdst; data->vport = vport; data->cb = *OVS_CB(skb); @@ -982,8 +941,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; case OVS_USERSPACE_ATTR_PID: - if (dp->user_features & - OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + if (OVS_CB(skb)->upcall_pid) + upcall.portid = OVS_CB(skb)->upcall_pid; + else if (dp->user_features & + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); @@ -1605,16 +1566,15 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, clone = clone_flow_key ? clone_key(key) : key; if (clone) { int err = 0; - if (actions) { /* Sample action */ if (clone_flow_key) - __this_cpu_inc(exec_actions_level); + __this_cpu_inc(ovs_pcpu_storage->exec_level); err = do_execute_actions(dp, skb, clone, actions, len); if (clone_flow_key) - __this_cpu_dec(exec_actions_level); + __this_cpu_dec(ovs_pcpu_storage->exec_level); } else { /* Recirc action */ clone->recirc_id = recirc_id; ovs_dp_process_packet(skb, clone); @@ -1650,7 +1610,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, static void process_deferred_actions(struct datapath *dp) { - struct action_fifo *fifo = this_cpu_ptr(action_fifos); + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage->action_fifos); /* Do not touch the FIFO in case there is no deferred actions. 
*/ if (action_fifo_is_empty(fifo)) @@ -1681,7 +1641,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, { int err, level; - level = __this_cpu_inc_return(exec_actions_level); + level = __this_cpu_inc_return(ovs_pcpu_storage->exec_level); if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); @@ -1698,27 +1658,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, process_deferred_actions(dp); out: - __this_cpu_dec(exec_actions_level); + __this_cpu_dec(ovs_pcpu_storage->exec_level); return err; } - -int action_fifos_init(void) -{ - action_fifos = alloc_percpu(struct action_fifo); - if (!action_fifos) - return -ENOMEM; - - flow_keys = alloc_percpu(struct action_flow_keys); - if (!flow_keys) { - free_percpu(action_fifos); - return -ENOMEM; - } - - return 0; -} - -void action_fifos_exit(void) -{ - free_percpu(action_fifos); - free_percpu(flow_keys); -} diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 5d548eda742df..d5b6e2002bc1f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -244,11 +244,13 @@ void ovs_dp_detach_port(struct vport *p) /* Must be called with rcu_read_lock. */ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage); const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; + bool ovs_pcpu_locked = false; u64 *stats_counter; u32 n_mask_hit; u32 n_cache_hit; @@ -265,7 +267,9 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + if (OVS_CB(skb)->upcall_pid) + upcall.portid = OVS_CB(skb)->upcall_pid; + else if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id()); else @@ -290,10 +294,26 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); + /* This path can be invoked recursively: Use the current task to + * identify recursive invocation - the lock must be acquired only once. + * Even with disabled bottom halves this can be preempted on PREEMPT_RT. + * Limit the locking to RT to avoid assigning `owner' if it can be + * avoided. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { + local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); + ovs_pcpu->owner = current; + ovs_pcpu_locked = true; + } + error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", ovs_dp_name(dp), error); + if (ovs_pcpu_locked) { + ovs_pcpu->owner = NULL; + local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); + } stats_counter = &stats->n_hit; @@ -633,6 +653,9 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) !!(hash & OVS_PACKET_HASH_L4_BIT)); } + OVS_CB(packet)->upcall_pid = + nla_get_u32_default(a[OVS_PACKET_ATTR_UPCALL_PID], 0); + /* Build an sw_flow for sending this packet. 
*/ flow = ovs_flow_alloc(); err = PTR_ERR(flow); @@ -671,7 +694,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); + local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage->owner, current); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage->owner, NULL); + local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); local_bh_enable(); rcu_read_unlock(); @@ -695,6 +724,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, + [OVS_PACKET_ATTR_UPCALL_PID] = { .type = NLA_U32 }, }; static const struct genl_small_ops dp_packet_genl_ops[] = { @@ -2720,6 +2750,28 @@ static struct drop_reason_list drop_reason_list_ovs = { .n_reasons = ARRAY_SIZE(ovs_drop_reasons), }; +static int __init ovs_alloc_percpu_storage(void) +{ + unsigned int cpu; + + ovs_pcpu_storage = alloc_percpu(*ovs_pcpu_storage); + if (!ovs_pcpu_storage) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct ovs_pcpu_storage *ovs_pcpu; + + ovs_pcpu = per_cpu_ptr(ovs_pcpu_storage, cpu); + local_lock_init(&ovs_pcpu->bh_lock); + } + return 0; +} + +static void ovs_free_percpu_storage(void) +{ + free_percpu(ovs_pcpu_storage); +} + static int __init dp_init(void) { int err; @@ -2729,13 +2781,13 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = action_fifos_init(); + err = ovs_alloc_percpu_storage(); if (err) goto error; err = ovs_internal_dev_rtnl_link_register(); if (err) - goto error_action_fifos_exit; + goto error; err = ovs_flow_init(); if (err) @@ -2778,9 +2830,8 @@ error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); -error_action_fifos_exit: - action_fifos_exit(); error: + ovs_free_percpu_storage(); return err; } @@ -2795,7 +2846,7 @@ static void dp_cleanup(void) ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); - action_fifos_exit(); + ovs_free_percpu_storage(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 384ca77f4e794..db0c3e69d66ca 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -13,6 +13,7 @@ #include <linux/skbuff.h> #include <linux/u64_stats_sync.h> #include <net/ip_tunnels.h> +#include <net/mpls.h> #include "conntrack.h" #include "flow.h" @@ -120,6 +121,8 @@ struct datapath { * @cutlen: The number of bytes from the packet end to be removed. * @probability: The sampling probability that was applied to this skb; 0 means * no sampling has occurred; U32_MAX means 100% probability. + * @upcall_pid: Netlink socket PID to use for sending this packet to userspace; + * 0 means "not set" and default per-CPU or per-vport dispatch should be used. 
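
Illustrative sketch, not part of the patch: with OVS_PACKET_ATTR_UPCALL_PID now carried in OVS_CB(skb)->upcall_pid, upcall destination selection follows the order below. The helper name example_pick_upcall_portid() is hypothetical; the final branch mirrors the pre-existing per-vport lookup.

	static u32 example_pick_upcall_portid(const struct datapath *dp,
					      const struct sk_buff *skb)
	{
		/* 1. A per-packet PID supplied via OVS_PACKET_ATTR_UPCALL_PID wins. */
		if (OVS_CB(skb)->upcall_pid)
			return OVS_CB(skb)->upcall_pid;

		/* 2. Otherwise use per-CPU dispatch if the datapath enabled it. */
		if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
			return ovs_dp_get_upcall_portid(dp, smp_processor_id());

		/* 3. Fall back to the per-vport portid lookup, as before. */
		return ovs_vport_find_upcall_portid(OVS_CB(skb)->input_vport, skb);
	}
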
*/ struct ovs_skb_cb { struct vport *input_vport; @@ -127,6 +130,7 @@ struct ovs_skb_cb { u16 acts_origlen; u32 cutlen; u32 probability; + u32 upcall_pid; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -173,6 +177,55 @@ struct ovs_net { bool xt_label; }; +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + u16 network_offset; /* valid only for MPLS */ + u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 mac_proto; + u8 l2_data[MAX_L2_LEN]; +}; + +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + int actions_len; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +#define OVS_RECURSION_LIMIT 5 +#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) + +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +struct action_flow_keys { + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; +}; + +struct ovs_pcpu_storage { + struct action_fifo action_fifos; + struct action_flow_keys flow_keys; + struct ovs_frag_data frag_data; + int exec_level; + struct task_struct *owner; + local_lock_t bh_lock; +}; + +extern struct ovs_pcpu_storage __percpu *ovs_pcpu_storage; + /** * enum ovs_pkt_hash_types - hash info to include with a packet * to send to userspace. @@ -281,9 +334,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, void ovs_dp_notify_wq(struct work_struct *work); -int action_fifos_init(void); -void action_fifos_exit(void); - /* 'KEY' must not have any bits set outside of the 'MASK' */ #define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) #define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8a848ce72e291..b80bd3a907739 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -788,7 +788,7 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (eth_p_mpls(key->eth.type)) { - u8 label_count = 1; + size_t label_count = 1; memset(&key->mpls, 0, sizeof(key->mpls)); skb_set_inner_network_header(skb, skb->mac_len); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 518be23e48ea9..ad64bb9ab5e25 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -3049,7 +3049,8 @@ static int validate_userspace(const struct nlattr *attr) struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; int error; - error = nla_parse_nested_deprecated(a, OVS_USERSPACE_ATTR_MAX, attr, + error = nla_parse_deprecated_strict(a, OVS_USERSPACE_ATTR_MAX, + nla_data(attr), nla_len(attr), userspace_policy, NULL); if (error) return error; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 8732f6e51ae5a..6bbbc16ab7780 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -501,6 +501,7 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, OVS_CB(skb)->mru = 0; OVS_CB(skb)->cutlen = 0; OVS_CB(skb)->probability = 0; + OVS_CB(skb)->upcall_pid = 0; if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { u32 mark; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d4dba06297c33..a7017d7f09272 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ 
-722,7 +722,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) static void prb_retire_rx_blk_timer_expired(struct timer_list *t) { struct packet_sock *po = - from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer); + timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; @@ -2785,7 +2785,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; - long timeo = 0; + long timeo; mutex_lock(&po->pg_vec_lock); @@ -2839,22 +2839,28 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; + timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { - if (need_wait && skb) { - timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); + /* Note: packet_read_pending() might be slow if we + * have to call it as it's per_cpu variable, but in + * fast-path we don't have to call it, only when ph + * is NULL, we need to check the pending_refcnt. + */ + if (need_wait && packet_read_pending(&po->tx_ring)) { timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; goto out_put; } - } - /* check for additional frames */ - continue; + /* check for additional frames */ + continue; + } else + break; } skb = NULL; @@ -2943,14 +2949,7 @@ tpacket_error: } packet_increment_head(&po->tx_ring); len_sum += tp_len; - } while (likely((ph != NULL) || - /* Note: packet_read_pending() might be slow if we have - * to call it as it's per_cpu variable, but in fast-path - * we already short-circuit the loop with the first - * condition, and luckily don't have to go that path - * anyway. 
- */ - (need_wait && packet_read_pending(&po->tx_ring)))); + } while (1); err = len_sum; goto out_put; @@ -3713,15 +3712,15 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, } static void packet_dev_mclist_delete(struct net_device *dev, - struct packet_mclist **mlp) + struct packet_mclist **mlp, + struct list_head *list) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { - packet_dev_mc(dev, ml, -1); + list_add(&ml->remove_list, list); *mlp = ml->next; - kfree(ml); } else mlp = &ml->next; } @@ -3769,6 +3768,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; + INIT_LIST_HEAD(&i->remove_list); i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); @@ -4233,9 +4233,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { - struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct packet_mclist *ml, *tmp; + LIST_HEAD(mclist); + struct sock *sk; rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { @@ -4244,7 +4246,8 @@ static int packet_notifier(struct notifier_block *this, switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) - packet_dev_mclist_delete(dev, &po->mclist); + packet_dev_mclist_delete(dev, &po->mclist, + &mclist); fallthrough; case NETDEV_DOWN: @@ -4277,6 +4280,13 @@ static int packet_notifier(struct notifier_block *this, } } rcu_read_unlock(); + + /* packet_dev_mc might grab instance locks so can't run under rcu */ + list_for_each_entry_safe(ml, tmp, &mclist, remove_list) { + packet_dev_mc(dev, ml, -1); + kfree(ml); + } + return NOTIFY_DONE; } @@ -4563,10 +4573,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; - if (was_running) { - WRITE_ONCE(po->num, 0); + WRITE_ONCE(po->num, 0); + if (was_running) __unregister_prot_hook(sk, false); - } + spin_unlock(&po->bind_lock); synchronize_net(); @@ -4598,10 +4608,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); - if (was_running) { - WRITE_ONCE(po->num, num); + WRITE_ONCE(po->num, num); + if (was_running) register_prot_hook(sk); - } + spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ @@ -4772,7 +4782,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), + from_kuid_munged(seq_user_ns(seq), sk_uid(s)), sock_i_ino(s)); } diff --git a/net/packet/diag.c b/net/packet/diag.c index 47f69f3dbf73e..6ce1dcc284d92 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -153,7 +153,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if ((req->pdiag_show & PACKET_SHOW_INFO) && nla_put_u32(skb, PACKET_DIAG_UID, - from_kuid_munged(user_ns, sock_i_uid(sk)))) + from_kuid_munged(user_ns, sk_uid(sk)))) goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_MCLIST) && diff --git a/net/packet/internal.h b/net/packet/internal.h index d5d70712007ad..1e743d0316fdd 100644 --- a/net/packet/internal.h 
+++ b/net/packet/internal.h @@ -11,6 +11,7 @@ struct packet_mclist { unsigned short type; unsigned short alen; unsigned char addr[MAX_ADDR_LEN]; + struct list_head remove_list; }; /* kbdq - kernel block descriptor queue */ diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 53a858478e22f..62527e1ebb883 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -826,6 +826,7 @@ static struct sock *pep_sock_accept(struct sock *sk, } /* Check for duplicate pipe handle */ + pn_skb_get_dst_sockaddr(skb, &dst); newsk = pep_find_pipe(&pn->hlist, &dst, pipe_handle); if (unlikely(newsk)) { __sock_put(newsk); @@ -850,7 +851,6 @@ static struct sock *pep_sock_accept(struct sock *sk, newsk->sk_destruct = pipe_destruct; newpn = pep_sk(newsk); - pn_skb_get_dst_sockaddr(skb, &dst); pn_skb_get_src_sockaddr(skb, &src); newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst); newpn->pn_sk.dobject = pn_sockaddr_get_object(&src); diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 5ce0b3ee5def8..ea4d5e6533dba 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -584,7 +584,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) sk->sk_protocol, pn->sobject, pn->dobject, pn->resource, sk->sk_state, sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, atomic_read(&sk->sk_drops)); @@ -755,7 +755,7 @@ static int pn_res_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%02X %5u %lu", (int) (psk - pnres.sk), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk)); } seq_pad(seq, '\n'); diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 8435a20968ef5..086a13170e096 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -598,7 +598,7 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, } if (addr_type & IPV6_ADDR_LINKLOCAL) { - /* If socket is arleady bound to a link local address, + /* If socket is already bound to a link local address, * the peer address must be on the same link. 
*/ if (sin6->sin6_scope_id == 0 || diff --git a/net/rds/connection.c b/net/rds/connection.c index c749c5525b40f..d62f486ab29fa 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -749,8 +749,7 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) cinfo->laddr = conn->c_laddr.s6_addr32[3]; cinfo->faddr = conn->c_faddr.s6_addr32[3]; cinfo->tos = conn->c_tos; - strncpy(cinfo->transport, conn->c_trans->t_name, - sizeof(cinfo->transport)); + strscpy_pad(cinfo->transport, conn->c_trans->t_name); cinfo->flags = 0; rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), @@ -775,8 +774,7 @@ static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) cinfo6->next_rx_seq = cp->cp_next_rx_seq; cinfo6->laddr = conn->c_laddr; cinfo6->faddr = conn->c_faddr; - strncpy(cinfo6->transport, conn->c_trans->t_name, - sizeof(cinfo6->transport)); + strscpy_pad(cinfo6->transport, conn->c_trans->t_name); cinfo6->flags = 0; rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), diff --git a/net/rds/page.c b/net/rds/page.c index 7cc57e098ddb9..afb151eac271c 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -40,10 +40,12 @@ struct rds_page_remainder { struct page *r_page; unsigned long r_offset; + local_lock_t bh_lock; }; -static -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /** * rds_page_remainder_alloc - build up regions of a message. @@ -69,7 +71,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp) { struct rds_page_remainder *rem; - unsigned long flags; struct page *page; int ret; @@ -87,8 +88,9 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, goto out; } - rem = &per_cpu(rds_page_remainders, get_cpu()); - local_irq_save(flags); + local_bh_disable(); + local_lock_nested_bh(&rds_page_remainders.bh_lock); + rem = this_cpu_ptr(&rds_page_remainders); while (1) { /* avoid a tiny region getting stuck by tossing it */ @@ -116,13 +118,14 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } /* alloc if there is nothing for us to use */ - local_irq_restore(flags); - put_cpu(); + local_unlock_nested_bh(&rds_page_remainders.bh_lock); + local_bh_enable(); page = alloc_page(gfp); - rem = &per_cpu(rds_page_remainders, get_cpu()); - local_irq_save(flags); + local_bh_disable(); + local_lock_nested_bh(&rds_page_remainders.bh_lock); + rem = this_cpu_ptr(&rds_page_remainders); if (!page) { ret = -ENOMEM; @@ -140,8 +143,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem->r_offset = 0; } - local_irq_restore(flags); - put_cpu(); + local_unlock_nested_bh(&rds_page_remainders.bh_lock); + local_bh_enable(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, diff --git a/net/rds/send.c b/net/rds/send.c index 09a2801106549..42d991bc8543c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -232,7 +232,7 @@ restart: * If not already working on one, grab the next message. * * cp_xmit_rm holds a ref while we're sending this message down - * the connction. We can use this ref while holding the + * the connection. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. 
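
Illustrative sketch, not part of the patch: the rds/page.c hunks above replace local_irq_save()/get_cpu() with a BH-disabled local_lock around the per-CPU remainder, so the access pattern in rds_page_remainder_alloc() now looks roughly like this (helper name is hypothetical):

	static void example_remainder_access(void)
	{
		struct rds_page_remainder *rem;

		local_bh_disable();
		local_lock_nested_bh(&rds_page_remainders.bh_lock);
		rem = this_cpu_ptr(&rds_page_remainders);

		/* ... carve the requested bytes out of rem->r_page at rem->r_offset ... */

		local_unlock_nested_bh(&rds_page_remainders.bh_lock);
		local_bh_enable();
	}
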
*/ if (!rm) { diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index d89bd8d0c3545..91e34af3fe5d5 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -105,10 +105,6 @@ int rds_tcp_accept_one(struct socket *sock) int conn_state; struct rds_conn_path *cp; struct in6_addr *my_addr, *peer_addr; - struct proto_accept_arg arg = { - .flags = O_NONBLOCK, - .kern = true, - }; #if !IS_ENABLED(CONFIG_IPV6) struct in6_addr saddr, daddr; #endif @@ -117,25 +113,9 @@ int rds_tcp_accept_one(struct socket *sock) if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; - ret = sock_create_lite(sock->sk->sk_family, - sock->sk->sk_type, sock->sk->sk_protocol, - &new_sock); + ret = kernel_accept(sock, &new_sock, O_NONBLOCK); if (ret) - goto out; - - ret = sock->ops->accept(sock, new_sock, &arg); - if (ret < 0) - goto out; - - /* sock_create_lite() does not get a hold on the owner module so we - * need to do it here. Note that sock_release() uses sock->ops to - * determine if it needs to decrement the reference count. So set - * sock->ops after calling accept() in case that fails. And there's - * no need to do try_module_get() as the listener should have a hold - * already. - */ - new_sock->ops = sock->ops; - __module_get(new_sock->ops->owner); + return ret; rds_tcp_keepalive(new_sock); if (!rds_tcp_tune(new_sock)) { @@ -298,15 +278,15 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) sin6 = (struct sockaddr_in6 *)&ss; sin6->sin6_family = PF_INET6; sin6->sin6_addr = in6addr_any; - sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_port = htons(RDS_TCP_PORT); sin6->sin6_scope_id = 0; sin6->sin6_flowinfo = 0; addr_len = sizeof(*sin6); } else { sin = (struct sockaddr_in *)&ss; sin->sin_family = PF_INET; - sin->sin_addr.s_addr = INADDR_ANY; - sin->sin_port = (__force u16)htons(RDS_TCP_PORT); + sin->sin_addr.s_addr = htonl(INADDR_ANY); + sin->sin_port = htons(RDS_TCP_PORT); addr_len = sizeof(*sin); } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index a4a668b88a8f2..4e72b636a46a5 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -345,7 +345,7 @@ void rose_destroy_socket(struct sock *); */ static void rose_destroy_timer(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); rose_destroy_socket(sk); } diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 4d67f36dce1b4..3e99181e759f9 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -101,6 +101,7 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety */ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m) { + enum skb_drop_reason dr; /* ignored */ struct rose_sock *rose = rose_sk(sk); int queued = 0; @@ -162,7 +163,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_frames_acked(sk, nr); if (ns == rose->vr) { rose_start_idletimer(sk); - if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 && + if (!sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN, &dr) && __sock_queue_rcv_skb(sk, skb) == 0) { rose->vr = (rose->vr + 1) % ROSE_MODULUS; queued = 1; diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index 9f9629e6fdaea..7746229fdc8cf 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -78,7 +78,7 @@ static void rose_ftimer_expiry(struct timer_list *t) static void rose_t0timer_expiry(struct timer_list *t) { - struct rose_neigh *neigh = from_timer(neigh, t, 
t0timer); + struct rose_neigh *neigh = timer_container_of(neigh, t, t0timer); rose_transmit_restart_request(neigh); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 2dd6bd3a3011f..b72bf8a08d489 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -497,22 +497,15 @@ void rose_rt_device_down(struct net_device *dev) t = rose_node; rose_node = rose_node->next; - for (i = 0; i < t->count; i++) { + for (i = t->count - 1; i >= 0; i--) { if (t->neighbour[i] != s) continue; t->count--; - switch (i) { - case 0: - t->neighbour[0] = t->neighbour[1]; - fallthrough; - case 1: - t->neighbour[1] = t->neighbour[2]; - break; - case 2: - break; - } + memmove(&t->neighbour[i], &t->neighbour[i + 1], + sizeof(t->neighbour[0]) * + (t->count - i)); } if (t->count <= 0) diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 1525773e94aa1..020369c49587f 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -118,7 +118,7 @@ void rose_stop_idletimer(struct sock *sk) static void rose_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); struct rose_sock *rose = rose_sk(sk); bh_lock_sock(sk); @@ -163,7 +163,7 @@ out: static void rose_timer_expiry(struct timer_list *t) { - struct rose_sock *rose = from_timer(rose, t, timer); + struct rose_sock *rose = timer_container_of(rose, t, timer); struct sock *sk = &rose->sock; bh_lock_sock(sk); @@ -198,7 +198,7 @@ out: static void rose_idletimer_expiry(struct timer_list *t) { - struct rose_sock *rose = from_timer(rose, t, idletimer); + struct rose_sock *rose = timer_container_of(rose, t, idletimer); struct sock *sk = &rose->sock; bh_lock_sock(sk); diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index a20986806fea9..f60b81c660789 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -67,6 +67,29 @@ config RXKAD See Documentation/networking/rxrpc.rst. +config RXGK + bool "RxRPC GSSAPI security" + select CRYPTO_KRB5 + select CRYPTO_MANAGER + select CRYPTO_KRB5ENC + select CRYPTO_AUTHENC + select CRYPTO_SKCIPHER + select CRYPTO_HASH_INFO + select CRYPTO_HMAC + select CRYPTO_CMAC + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_AES + select CRYPTO_CAMELLIA + help + Provide the GSSAPI-based RxGK security class for AFS. Keys are added + with add_key(). + + See Documentation/networking/rxrpc.rst. + config RXPERF tristate "RxRPC test service" help diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 210b75e3179e9..c0542bae719e3 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -24,6 +24,7 @@ rxrpc-y := \ local_object.o \ misc.o \ net_ns.o \ + oob.o \ output.o \ peer_event.o \ peer_object.o \ @@ -39,6 +40,9 @@ rxrpc-y := \ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o - +rxrpc-$(CONFIG_RXGK) += \ + rxgk.o \ + rxgk_app.o \ + rxgk_kdf.o obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 86873399f7d50..36df0274d7b74 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -265,7 +265,10 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: Allocation flags * * Lookup or create a remote transport endpoint record for the specified - * address and return it with a ref held. + * address. + * + * Return: The peer record found with a reference, %NULL if no record is found + * or a negative error code if the address is invalid or unsupported. 
*/ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) @@ -283,9 +286,11 @@ EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); /** * rxrpc_kernel_get_peer - Get a reference on a peer - * @peer: The peer to get a reference on. + * @peer: The peer to get a reference on (may be NULL). + * + * Get a reference for a remote peer record (if not NULL). * - * Get a record for the remote peer in a call. + * Return: The @peer argument. */ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) { @@ -296,6 +301,8 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference * @peer: The peer to drop a ref on + * + * Drop a reference on a peer record. */ void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) { @@ -320,10 +327,12 @@ EXPORT_SYMBOL(rxrpc_kernel_put_peer); * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and - * call IDs as appropriate. The call to be used is returned. + * call IDs as appropriate. * * The default socket destination address and security may be overridden by * supplying @srx and @key. + * + * Return: The new call or an error code. */ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, @@ -437,6 +446,8 @@ EXPORT_SYMBOL(rxrpc_kernel_put_call); * * Allow a kernel service to find out whether a call is still alive - whether * it has completed successfully and all received data has been consumed. + * + * Return: %true if the call is still ongoing and %false if it has completed. */ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) @@ -450,63 +461,20 @@ bool rxrpc_kernel_check_life(const struct socket *sock, EXPORT_SYMBOL(rxrpc_kernel_check_life); /** - * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. - * @sock: The socket the call is on - * @call: The call to query - * - * Allow a kernel service to retrieve the epoch value from a service call to - * see if the client at the other end rebooted. - */ -u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) -{ - return call->conn->proto.epoch; -} -EXPORT_SYMBOL(rxrpc_kernel_get_epoch); - -/** - * rxrpc_kernel_new_call_notification - Get notifications of new calls - * @sock: The socket to intercept received messages on - * @notify_new_call: Function to be called when new calls appear - * @discard_new_call: Function to discard preallocated calls + * rxrpc_kernel_set_notifications - Set table of callback operations + * @sock: The socket to install table upon + * @app_ops: Callback operation table to set * - * Allow a kernel service to be given notifications about new calls. + * Allow a kernel service to set a table of event notifications on a socket. 
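The notification-table API replaces the individual callback setters. As a sketch (assuming the rxrpc_kernel_ops structure declared in <net/af_rxrpc.h> carries the four callbacks this patch dereferences, and with all my_* names hypothetical), a kernel service would install its table once at socket setup:

#include <net/af_rxrpc.h>

/* Hypothetical kernel-service callbacks; only the callbacks that this patch
 * actually invokes (notify_new_call, discard_new_call, user_attach_call and
 * notify_oob) are shown.
 */
static void my_notify_new_call(struct sock *sk, struct rxrpc_call *call,
			       unsigned long user_call_ID) {}
static void my_discard_new_call(struct rxrpc_call *call,
				unsigned long user_call_ID) {}
static void my_user_attach_call(struct rxrpc_call *call,
				unsigned long user_call_ID) {}
static void my_notify_oob(struct sock *sk, struct sk_buff *oob) {}

static const struct rxrpc_kernel_ops my_rxrpc_ops = {
	.notify_new_call	= my_notify_new_call,
	.discard_new_call	= my_discard_new_call,
	.user_attach_call	= my_user_attach_call,
	.notify_oob		= my_notify_oob,
};

/* Install the table on a freshly created AF_RXRPC socket. */
static void my_install_ops(struct socket *rxsock)
{
	rxrpc_kernel_set_notifications(rxsock, &my_rxrpc_ops);
}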
*/ -void rxrpc_kernel_new_call_notification( - struct socket *sock, - rxrpc_notify_new_call_t notify_new_call, - rxrpc_discard_new_call_t discard_new_call) +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - rx->notify_new_call = notify_new_call; - rx->discard_new_call = discard_new_call; + rx->app_ops = app_ops; } -EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); - -/** - * rxrpc_kernel_set_max_life - Set maximum lifespan on a call - * @sock: The socket the call is on - * @call: The call to configure - * @hard_timeout: The maximum lifespan of the call in ms - * - * Set the maximum lifespan of a call. The call will end with ETIME or - * ETIMEDOUT if it takes longer than this. - */ -void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call, - unsigned long hard_timeout) -{ - ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by; - - mutex_lock(&call->user_mutex); - - expect_term_by = ktime_add(ktime_get_real(), delay); - WRITE_ONCE(call->expect_term_by, expect_term_by); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); - rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); - - mutex_unlock(&call->user_mutex); -} -EXPORT_SYMBOL(rxrpc_kernel_set_max_life); +EXPORT_SYMBOL(rxrpc_kernel_set_notifications); /* * connect an RxRPC socket @@ -624,7 +592,10 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) fallthrough; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: - ret = rxrpc_do_sendmsg(rx, m, len); + if (m->msg_flags & MSG_OOB) + ret = rxrpc_sendmsg_oob(rx, m, len); + else + ret = rxrpc_do_sendmsg(rx, m, len); /* The socket has been unlocked */ goto out; default: @@ -659,7 +630,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - unsigned int min_sec_level; + unsigned int min_sec_level, val; u16 service_upgrade[2]; int ret; @@ -740,6 +711,26 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, rx->service_upgrade.to = service_upgrade[1]; goto success; + case RXRPC_MANAGE_RESPONSE: + ret = -EINVAL; + if (optlen != sizeof(unsigned int)) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNBOUND) + goto error; + ret = copy_safe_from_sockptr(&val, sizeof(val), + optval, optlen); + if (ret) + goto error; + ret = -EINVAL; + if (val > 1) + goto error; + if (val) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + goto success; + default: break; } @@ -846,6 +837,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; spin_lock_init(&rx->incoming_lock); + skb_queue_head_init(&rx->recvmsg_oobq); + rx->pending_oobq = RB_ROOT; INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); @@ -879,8 +872,10 @@ static int rxrpc_shutdown(struct socket *sock, int flags) lock_sock(sk); if (sk->sk_state < RXRPC_CLOSE) { + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; + spin_unlock_irq(&rx->recvmsg_lock); } else { ret = -ESHUTDOWN; } @@ -892,12 +887,30 @@ static int rxrpc_shutdown(struct socket *sock, int flags) } /* + * Purge the out-of-band queue. 
+ */ +static void rxrpc_purge_oob_queue(struct sock *sk) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&rx->recvmsg_oobq))) + rxrpc_kernel_free_oob(skb); + while (!RB_EMPTY_ROOT(&rx->pending_oobq)) { + skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode); + rb_erase(&skb->rbnode, &rx->pending_oobq); + rxrpc_kernel_free_oob(skb); + } +} + +/* * RxRPC socket destructor */ static void rxrpc_sock_destructor(struct sock *sk) { _enter("%p", sk); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); @@ -936,7 +949,9 @@ static int rxrpc_release_sock(struct sock *sk) break; } + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; + spin_unlock_irq(&rx->recvmsg_lock); if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); @@ -948,6 +963,7 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3cc3af15086ff..5b7342d434869 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -31,6 +31,7 @@ struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; struct rxrpc_txqueue; +struct rxgk_context; /* * Mark applied to socket buffers in skb->mark. skb->priority is used @@ -39,9 +40,11 @@ struct rxrpc_txqueue; enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ + RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */ RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ + RXRPC_SKB_MARK_REJECT_CONN_ABORT, /* Reject with connection ABORT (code in skb->priority) */ }; /* @@ -146,10 +149,12 @@ struct rxrpc_backlog { struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; - rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ - rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ + const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */ + struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */ + u64 oob_id_counter; /* OOB message ID counter */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ @@ -160,6 +165,7 @@ struct rxrpc_sock { struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ +#define RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT @@ -203,7 +209,7 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { union { - struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ + struct rxrpc_connection 
*poke_conn; /* Conn referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ @@ -217,6 +223,19 @@ struct rxrpc_skb_priv { u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ } ack; + struct { + struct rxrpc_connection *conn; /* Connection referred to */ + union { + u32 rxkad_nonce; + }; + } chall; + struct { + rxrpc_serial_t challenge_serial; + u32 kvno; + u32 version; + u16 len; + u16 ticket_len; + } resp; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; @@ -270,9 +289,24 @@ struct rxrpc_security { /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); + /* Validate a challenge packet */ + bool (*validate_challenge)(struct rxrpc_connection *conn, + struct sk_buff *skb); + + /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace. + * The security class gets to add additional information. + */ + int (*challenge_to_recvmsg)(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg); + + /* Parse sendmsg() control message and respond to challenge. */ + int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge, + struct msghdr *msg); + /* respond to a challenge */ - int (*respond_to_challenge)(struct rxrpc_connection *, - struct sk_buff *); + int (*respond_to_challenge)(struct rxrpc_connection *conn, + struct sk_buff *challenge); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, @@ -280,6 +314,11 @@ struct rxrpc_security { /* clear connection security */ void (*clear)(struct rxrpc_connection *); + + /* Default ticket -> key decoder */ + int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); }; /* @@ -323,12 +362,15 @@ struct rxrpc_local { struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ - /* Provide a kvec table sufficiently large to manage either a DATA - * packet with a maximum set of jumbo subpackets or a PING ACK padded - * out to 64K with zeropages for PMTUD. - */ - struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? - 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; + union { + /* Provide a kvec table sufficiently large to manage either a + * DATA packet with a maximum set of jumbo subpackets or a PING + * ACK padded out to 64K with zeropages for PMTUD. + */ + struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? 
+ 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; + struct bio_vec bvec[3 + 16]; + }; }; /* @@ -526,7 +568,17 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ u32 nonce; /* response re-use preventer */ } rxkad; + struct { + struct rxgk_context *keys[4]; /* (Re-)keying buffer */ + u64 start_time; /* The start time for TK derivation */ + u8 nonce[20]; /* Response re-use preventer */ + u32 enctype; /* Kerberos 5 encoding type */ + u32 key_number; /* Current key number */ + } rxgk; }; + rwlock_t security_use_lock; /* Security use/modification lock */ + struct sk_buff *tx_response; /* Response packet to be transmitted */ + unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ @@ -692,6 +744,7 @@ struct rxrpc_call { u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 security_level; /* Security level selected */ + u32 security_enctype; /* Security-specific encoding type (or 0) */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ @@ -867,6 +920,8 @@ struct rxrpc_txbuf { unsigned short len; /* Amount of data in buffer */ unsigned short space; /* Remaining data space */ unsigned short offset; /* Offset of fill point */ + unsigned short crypto_header; /* Size of crypto header */ + unsigned short sec_header; /* Size of security header */ unsigned short pkt_len; /* Size of packet content */ unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; @@ -1001,7 +1056,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct rxrpc_call_params *, gfp_t, - unsigned int); + unsigned int) + __releases(&rx->sk.sk_lock) + __acquires(&call->user_mutex); void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); @@ -1197,9 +1254,14 @@ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); void rxrpc_error_report(struct sock *); bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); +bool rxrpc_direct_conn_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, + s32 abort_code, int err); int rxrpc_io_thread(void *data); +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { + if (!local->io_thread) + return; wake_up_process(READ_ONCE(local->io_thread)); } @@ -1289,8 +1351,16 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) } /* + * out_of_band.c + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb); +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); + +/* * output.c */ +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len); void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); @@ -1299,6 +1369,7 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void 
rxrpc_send_keepalive(struct rxrpc_peer *); +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb); /* * peer_event.c @@ -1315,6 +1386,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, const struct sockaddr_rxrpc *); struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp); +void rxrpc_assess_MTU_size(struct rxrpc_local *local, struct rxrpc_peer *peer); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t, enum rxrpc_peer_trace); void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer); @@ -1363,6 +1435,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); void rxrpc_call_init_rtt(struct rxrpc_call *call); /* + * rxgk.c + */ +extern const struct rxrpc_security rxgk_yfs; + +/* * rxkad.c */ #ifdef CONFIG_RXKAD @@ -1433,7 +1510,6 @@ static inline void rxrpc_sysctl_exit(void) {} extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index e685034ce4f7c..00982a030744b 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -34,7 +34,6 @@ static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -123,9 +122,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; - if (user_attach_call) { + if (rx->app_ops && + rx->app_ops->user_attach_call) { rxrpc_get_call(call, rxrpc_call_get_kernel_service); - user_attach_call(call, user_call_ID); + rx->app_ops->user_attach_call(call, user_call_ID); } rxrpc_get_call(call, rxrpc_call_get_userid); @@ -149,6 +149,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, id_in_use: write_unlock(&rx->call_lock); + rxrpc_prefail_call(call, RXRPC_CALL_LOCAL_ERROR, -EBADSLT); rxrpc_cleanup_call(call); _leave(" = -EBADSLT"); return -EBADSLT; @@ -218,10 +219,12 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->call_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; + rxrpc_see_call(call, rxrpc_call_see_discard); rcu_assign_pointer(call->socket, rx); - if (rx->discard_new_call) { + if (rx->app_ops && + rx->app_ops->discard_new_call) { _debug("discard %lx", call->user_call_ID); - rx->discard_new_call(call, call->user_call_ID); + rx->app_ops->discard_new_call(call, call->user_call_ID); if (call->notify_rx) call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); @@ -253,6 +256,9 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, unsigned short call_tail, conn_tail, peer_tail; unsigned short call_count, conn_count; + if (!b) + return NULL; + /* #calls >= #conns >= #peers must hold true. 
 */ call_head = smp_load_acquire(&b->call_backlog_head); call_tail = b->call_backlog_tail; @@ -368,8 +374,8 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { - rxrpc_direct_abort(skb, rxrpc_abort_shut_down, - RX_INVALID_OPERATION, -ESHUTDOWN); + rxrpc_direct_conn_abort(skb, rxrpc_abort_shut_down, + RX_INVALID_OPERATION, -ESHUTDOWN); goto no_call; } @@ -387,8 +393,9 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_incoming_call(rx, call, skb); conn = call->conn; - if (rx->notify_new_call) - rx->notify_new_call(&rx->sk, call, call->user_call_ID); + if (rx->app_ops && + rx->app_ops->notify_new_call) + rx->app_ops->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { @@ -400,6 +407,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_unlock(&rx->incoming_lock); read_unlock_irq(&local->services_lock); + rxrpc_assess_MTU_size(local, call->peer); if (hlist_unhashed(&call->error_link)) { spin_lock_irq(&call->peer->lock); @@ -414,12 +422,12 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, unsupported_service: read_unlock_irq(&local->services_lock); - return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, - RX_INVALID_OPERATION, -EOPNOTSUPP); + return rxrpc_direct_conn_abort(skb, rxrpc_abort_service_not_offered, + RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: read_unlock_irq(&local->services_lock); - return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, - RX_INVALID_OPERATION, -EKEYREJECTED); + return rxrpc_direct_conn_abort(skb, rxrpc_abort_service_not_offered, + RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); read_unlock_irq(&local->services_lock); @@ -440,8 +448,7 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) if (rx->sk.sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, - GFP_KERNEL, + return rxrpc_service_prealloc_one(rx, b, NULL, user_call_ID, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); } @@ -449,20 +456,18 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls * @sock: The socket on which to preallocate * @notify_rx: Event notification function for the call - * @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. * @debug_id: The tracing debug ID. * - * Charge up the socket with preallocated calls, each with a user ID. A - * function should be provided to effect the attachment from the user's side. - * The user is given a ref to hold on the call. + * Charge up the socket with preallocated calls, each with a user ID. The + * ->user_attach_call() callback function should be provided to effect the + * attachment from the user's side. The user is given a ref to hold on the + * call. * * Note that the call may become connected before this function returns. 
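With the @user_attach_call argument gone, attachment now happens through the ops table, so charging a service socket reduces to the sketch below; struct my_call, my_notify_rx() and my_debug_ids are hypothetical placeholders:

#include <linux/slab.h>
#include <net/af_rxrpc.h>

struct my_call {
	struct rxrpc_call *rxcall;	/* per-call state of the hypothetical service */
};

static void my_notify_rx(struct sock *sk, struct rxrpc_call *call,
			 unsigned long user_call_ID) {}

static atomic_t my_debug_ids;

/* Preallocate nr incoming calls; the user_call_ID handed in here is what
 * ->user_attach_call() in the installed ops table will later be given.
 */
static int my_charge_socket(struct socket *rxsock, int nr)
{
	int ret;

	while (nr-- > 0) {
		struct my_call *mc = kzalloc(sizeof(*mc), GFP_KERNEL);

		if (!mc)
			return -ENOMEM;
		ret = rxrpc_kernel_charge_accept(rxsock, my_notify_rx,
						 (unsigned long)mc, GFP_KERNEL,
						 atomic_inc_return(&my_debug_ids));
		if (ret < 0) {
			kfree(mc);
			return ret;
		}
	}
	return 0;
}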
*/ -int rxrpc_kernel_charge_accept(struct socket *sock, - rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -472,8 +477,7 @@ int rxrpc_kernel_charge_accept(struct socket *sock, if (sock->sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, notify_rx, - user_attach_call, user_call_ID, + return rxrpc_service_prealloc_one(rx, b, notify_rx, user_call_ID, gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fce58be65e7cf..918f41d97a2f9 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -64,7 +64,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) static void rxrpc_call_timer_expired(struct timer_list *t) { - struct rxrpc_call *call = from_timer(call, t, timer); + struct rxrpc_call *call = timer_container_of(call, t, timer); _enter("%d", call->debug_id); @@ -145,8 +145,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); @@ -322,7 +322,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; @@ -561,7 +561,7 @@ static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; - bool put = false, putu = false; + bool putu = false; _enter("{%d,%d}", call->debug_id, refcount_read(&call->ref)); @@ -573,23 +573,13 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_put_call_slot(call); - /* Make sure we don't get any more notifications */ + /* Note that at this point, the call may still be on or may have been + * added back on to the socket receive queue. recvmsg() must discard + * released calls. The CALL_RELEASED flag should prevent further + * notifications. 
+ */ spin_lock_irq(&rx->recvmsg_lock); - - if (!list_empty(&call->recvmsg_link)) { - _debug("unlinking once-pending call %p { e=%lx f=%lx }", - call, call->events, call->flags); - list_del(&call->recvmsg_link); - put = true; - } - - /* list_empty() must return false in rxrpc_notify_socket() */ - call->recvmsg_link.next = NULL; - call->recvmsg_link.prev = NULL; - spin_unlock_irq(&rx->recvmsg_lock); - if (put) - rxrpc_put_call(call, rxrpc_call_put_unnotify); write_lock(&rx->call_lock); @@ -638,6 +628,12 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) rxrpc_put_call(call, rxrpc_call_put_release_sock); } + while ((call = list_first_entry_or_null(&rx->recvmsg_q, + struct rxrpc_call, recvmsg_link))) { + list_del_init(&call->recvmsg_link); + rxrpc_put_call(call, rxrpc_call_put_release_recvmsg_q); + } + _leave(""); } @@ -760,3 +756,23 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } + +/** + * rxrpc_kernel_query_call_security - Query call's security parameters + * @call: The call to query + * @_service_id: Where to return the service ID + * @_enctype: Where to return the "encoding type" + * + * This queries the security parameters of a call, setting *@_service_id and + * *@_enctype and returning the security class. + * + * Return: The security class protocol number. + */ +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype) +{ + *_service_id = call->dest_srx.srx_service; + *_enctype = call->security_enctype; + return call->security_ix; +} +EXPORT_SYMBOL(rxrpc_kernel_query_call_security); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 4d9c5e21ba785..232b6986da83e 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -19,7 +19,7 @@ /* * Set the completion state on an aborted connection. 
*/ -static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb, +static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, s32 abort_code, int err, enum rxrpc_call_completion compl) { @@ -49,12 +49,20 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, s32 abort_code, int err, enum rxrpc_abort_reason why) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (rxrpc_set_conn_aborted(conn, skb, abort_code, err, + u32 cid = conn->proto.cid, call = 0, seq = 0; + + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + cid = sp->hdr.cid; + call = sp->hdr.callNumber; + seq = sp->hdr.seq; + } + + if (rxrpc_set_conn_aborted(conn, abort_code, err, RXRPC_CALL_LOCALLY_ABORTED)) { - trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, - sp->hdr.seq, abort_code, err); + trace_rxrpc_abort(0, why, cid, call, seq, abort_code, err); rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort); } return -EPROTO; @@ -67,7 +75,7 @@ static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { trace_rxrpc_rx_conn_abort(conn, skb); - rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + rxrpc_set_conn_aborted(conn, skb->priority, -ECONNABORTED, RXRPC_CALL_REMOTELY_ABORTED); } @@ -248,7 +256,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb); + ret = conn->security->respond_to_challenge(conn, skb); + sp->chall.conn = NULL; + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + return ret; case RXRPC_PACKET_TYPE_RESPONSE: ret = conn->security->verify_response(conn, skb); @@ -270,7 +281,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, * we've already received the packet, put it on the * front of the queue. */ - sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured); + sp->poke_conn = rxrpc_get_connection( + conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -392,6 +404,61 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, } /* + * Post a CHALLENGE packet to the socket of one of a connection's calls so that + * it can get application data to include in the packet, possibly querying + * userspace. 
+ */ +static bool rxrpc_post_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = NULL; + struct rxrpc_sock *rx; + bool respond = false; + + sp->chall.conn = + rxrpc_get_connection(conn, rxrpc_conn_get_challenge_input); + + if (!conn->security->challenge_to_recvmsg) { + rxrpc_post_packet_to_conn(conn, skb); + return true; + } + + rcu_read_lock(); + + for (int i = 0; i < ARRAY_SIZE(conn->channels); i++) { + if (conn->channels[i].call) { + call = conn->channels[i].call; + rx = rcu_dereference(call->socket); + if (!rx) { + call = NULL; + continue; + } + + respond = true; + if (test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags)) + break; + call = NULL; + } + } + + if (!respond) { + rcu_read_unlock(); + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + sp->chall.conn = NULL; + return false; + } + + if (call) + rxrpc_notify_socket_oob(call, skb); + rcu_read_unlock(); + + if (!call) + rxrpc_post_packet_to_conn(conn, skb); + return true; +} + +/* * Input a connection-level packet. */ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) @@ -411,6 +478,16 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) return true; case RXRPC_PACKET_TYPE_CHALLENGE: + rxrpc_see_skb(skb, rxrpc_skb_see_oob_challenge); + if (rxrpc_is_conn_aborted(conn)) { + if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) + rxrpc_send_conn_abort(conn); + return true; + } + if (!conn->security->validate_challenge(conn, skb)) + return false; + return rxrpc_post_challenge(conn, skb); + case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_is_conn_aborted(conn)) { if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) @@ -436,6 +513,19 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); + if (conn->tx_response) { + struct sk_buff *skb; + + spin_lock_irq(&conn->local->lock); + skb = conn->tx_response; + conn->tx_response = NULL; + spin_unlock_irq(&conn->local->lock); + + if (conn->state != RXRPC_CONN_ABORTED) + rxrpc_send_response(conn, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_response); + } + if (skb) { switch (skb->mark) { case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: @@ -452,3 +542,31 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, false); } + +/* + * Post a RESPONSE message to the I/O thread for transmission. + */ +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_local *local = conn->local; + struct sk_buff *old; + + _enter("%x", sp->resp.challenge_serial); + + spin_lock_irq(&local->lock); + old = conn->tx_response; + if (old) { + struct rxrpc_skb_priv *osp = rxrpc_skb(skb); + + /* Always go with the response to the most recent challenge. 
*/ + if (after(sp->resp.challenge_serial, osp->resp.challenge_serial)) + conn->tx_response = old; + else + old = skb; + } else { + conn->tx_response = skb; + } + spin_unlock_irq(&local->lock); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8ac22dde8b39b..37340becb224d 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -73,6 +73,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; + rwlock_init(&conn->security_use_lock); spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->idle_timestamp = jiffies; @@ -329,6 +330,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work) } rxrpc_purge_queue(&conn->rx_queue); + rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response); rxrpc_kill_client_conn(conn); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index e068f9b79d023..0a260df45d25a 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -42,11 +42,18 @@ static void none_free_call_crypto(struct rxrpc_call *call) { } -static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool none_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxrpc_eproto_rxnull_challenge); + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxrpc_eproto_rxnull_challenge); + return true; +} + +static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; } static int none_verify_response(struct rxrpc_connection *conn, @@ -82,7 +89,8 @@ const struct rxrpc_security rxrpc_no_security = { .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, - .respond_to_challenge = none_respond_to_challenge, + .validate_challenge = none_validate_challenge, + .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, }; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 64f8d77b87312..e939ecf417c4b 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -97,6 +97,20 @@ bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, return false; } +/* + * Directly produce a connection abort from a packet. + */ +bool rxrpc_direct_conn_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, + s32 abort_code, int err) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + trace_rxrpc_abort(0, why, sp->hdr.cid, 0, sp->hdr.seq, abort_code, err); + skb->mark = RXRPC_SKB_MARK_REJECT_CONN_ABORT; + skb->priority = abort_code; + return false; +} + static bool rxrpc_bad_message(struct sk_buff *skb, enum rxrpc_abort_reason why) { return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EBADMSG); @@ -489,8 +503,8 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - rxrpc_input_conn_event(sp->conn, skb); - rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); + rxrpc_input_conn_event(sp->poke_conn, skb); + rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke); rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: @@ -501,9 +515,11 @@ int rxrpc_io_thread(void *data) } /* Deal with connections that want immediate attention. 
*/ - spin_lock_irq(&local->lock); - list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); - spin_unlock_irq(&local->lock); + if (!list_empty_careful(&local->conn_attend_q)) { + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_irq(&local->lock); + } while ((conn = list_first_entry_or_null(&conn_attend_q, struct rxrpc_connection, diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 33e8302a79e33..9fdc1f031c9da 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -129,6 +129,160 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return 0; } +static u64 xdr_dec64(const __be32 *xdr) +{ + return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); +} + +static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) +{ + bool neg = false; + u64 tmp = time_in_100ns; + + if (time_in_100ns < 0) { + tmp = -time_in_100ns; + neg = true; + } + do_div(tmp, 10000000); + return neg ? -tmp : tmp; +} + +/* + * Parse a YFS-RxGK type XDR format token + * - the caller guarantees we have at least 4 words + * + * struct token_rxgk { + * opr_time begintime; + * opr_time endtime; + * afs_int64 level; + * afs_int64 lifetime; + * afs_int64 bytelife; + * afs_int64 enctype; + * opaque key<>; + * opaque ticket<>; + * }; + */ +static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + time64_t expiry; + size_t plen; + const __be32 *ticket, *key; + s64 tmp; + u32 tktlen, keylen; + + _enter(",{%x,%x,%x,%x},%x", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (6 * 2 + 2 > toklen / 4) + goto reject; + + key = xdr + (6 * 2 + 1); + keylen = ntohl(key[-1]); + _debug("keylen: %x", keylen); + keylen = round_up(keylen, 4); + if ((6 * 2 + 2) * 4 + keylen > toklen) + goto reject; + + ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); + tktlen = ntohl(ticket[-1]); + _debug("tktlen: %x", tktlen); + tktlen = round_up(tktlen, 4); + if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { + kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", + (6 * 2 + 2) * 4 + keylen + tktlen, toklen, + keylen, tktlen); + goto reject; + } + + plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; + prep->quotalen = datalen + plen; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto nomem; + + token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); + if (!token->rxgk) + goto nomem_token; + + token->security_index = RXRPC_SECURITY_YFS_RXGK; + token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); + token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); + token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); + if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) + goto reject_token; + token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); + token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); + token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); + if (tmp < 0 || tmp > UINT_MAX) + goto reject_token; + token->rxgk->key.len = ntohl(key[-1]); + token->rxgk->key.data = token->rxgk->_key; + token->rxgk->ticket.len = ntohl(ticket[-1]); + + if (token->rxgk->endtime != 0) { + expiry = rxrpc_s64_to_time64(token->rxgk->endtime); + if (expiry < 0) + goto expired; + if (expiry < prep->expiry) + prep->expiry = expiry; + } + + memcpy(token->rxgk->key.data, key, token->rxgk->key.len); + + /* Pad the ticket so that we can use it directly in XDR */ + token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), + 
GFP_KERNEL); + if (!token->rxgk->ticket.data) + goto nomem_yrxgk; + memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); + + _debug("SCIX: %u", token->security_index); + _debug("EXPY: %llx", token->rxgk->endtime); + _debug("LIFE: %llx", token->rxgk->lifetime); + _debug("BYTE: %llx", token->rxgk->bytelife); + _debug("ENC : %u", token->rxgk->enctype); + _debug("LEVL: %u", token->rxgk->level); + _debug("KLEN: %u", token->rxgk->key.len); + _debug("TLEN: %u", token->rxgk->ticket.len); + _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); + _debug("TICK: %*phN", + min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + + _leave(" = 0"); + return 0; + +nomem_yrxgk: + kfree(token->rxgk); +nomem_token: + kfree(token); +nomem: + return -ENOMEM; +reject_token: + kfree(token); +reject: + return -EKEYREJECTED; +expired: + kfree(token->rxgk); + kfree(token); + return -EKEYEXPIRED; +} + /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words @@ -228,6 +382,9 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; + case RXRPC_SECURITY_YFS_RXGK: + ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); + break; default: ret2 = -EPROTONOSUPPORT; break; @@ -390,6 +547,10 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; + case RXRPC_SECURITY_YFS_RXGK: + kfree(token->rxgk->ticket.data); + kfree(token->rxgk); + break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); @@ -433,6 +594,9 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; + case RXRPC_SECURITY_YFS_RXGK: + seq_puts(m, "ygk"); + break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; @@ -531,6 +695,8 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key); * * Generate a null RxRPC key that can be used to indicate anonymous security is * required for a particular domain. + * + * Return: The new key or a negative error code. 
*/ struct key *rxrpc_get_null_key(const char *keyname) { @@ -595,6 +761,13 @@ static long rxrpc_read(const struct key *key, toksize += RND(token->kad->ticket_len); break; + case RXRPC_SECURITY_YFS_RXGK: + toksize += 6 * 8 + 2 * 4; + if (!token->no_leak_key) + toksize += RND(token->rxgk->key.len); + toksize += RND(token->rxgk->ticket.len); + break; + default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); @@ -674,6 +847,20 @@ static long rxrpc_read(const struct key *key, ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; + case RXRPC_SECURITY_YFS_RXGK: + ENCODE64(token->rxgk->begintime); + ENCODE64(token->rxgk->endtime); + ENCODE64(token->rxgk->level); + ENCODE64(token->rxgk->lifetime); + ENCODE64(token->rxgk->bytelife); + ENCODE64(token->rxgk->enctype); + if (token->no_leak_key) + ENCODE(0); + else + ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); + ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); + break; + default: pr_err("Unsupported key token type (%u)\n", token->security_index); diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c new file mode 100644 index 0000000000000..05ca9c1faa577 --- /dev/null +++ b/net/rxrpc/oob.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Out of band message handling (e.g. challenge-response) + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/gfp.h> +#include <linux/skbuff.h> +#include <linux/export.h> +#include <linux/sched/signal.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +enum rxrpc_oob_command { + RXRPC_OOB_CMD_UNSET, + RXRPC_OOB_CMD_RESPOND, +} __mode(byte); + +struct rxrpc_oob_params { + u64 oob_id; /* ID number of message if reply */ + s32 abort_code; + enum rxrpc_oob_command command; + bool have_oob_id:1; +}; + +/* + * Post an out-of-band message for attention by the socket or kernel service + * associated with a reference call. + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; + struct sock *sk; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + if (rx) { + sk = &rx->sk; + spin_lock_irq(&rx->recvmsg_lock); + + if (sk->sk_state < RXRPC_CLOSE) { + skb->skb_mstamp_ns = rx->oob_id_counter++; + rxrpc_get_skb(skb, rxrpc_skb_get_post_oob); + skb_queue_tail(&rx->recvmsg_oobq, skb); + + trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); + if (rx->app_ops) + rx->app_ops->notify_oob(sk, skb); + } + + spin_unlock_irq(&rx->recvmsg_lock); + if (!rx->app_ops && !sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + } + + rcu_read_unlock(); +} + +/* + * Locate the OOB message to respond to by its ID. + */ +static struct sk_buff *rxrpc_find_pending_oob(struct rxrpc_sock *rx, u64 oob_id) +{ + struct rb_node *p; + struct sk_buff *skb; + + p = rx->pending_oobq.rb_node; + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); + + if (oob_id < skb->skb_mstamp_ns) + p = p->rb_left; + else if (oob_id > skb->skb_mstamp_ns) + p = p->rb_right; + else + return skb; + } + + return NULL; +} + +/* + * Add an OOB message into the pending-response set. We always assign the next + * value from a 64-bit counter to the oob_id, so just assume we're always going + * to be on the right-hand edge of the tree and that the counter won't wrap. 
+ * The tree is also given a ref to the message. + */ +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb) +{ + struct rb_node **pp = &rx->pending_oobq.rb_node, *p = NULL; + + while (*pp) { + p = *pp; + pp = &(*pp)->rb_right; + } + + rb_link_node(&skb->rbnode, p, pp); + rb_insert_color(&skb->rbnode, &rx->pending_oobq); +} + +/* + * Extract control messages from the sendmsg() control buffer. + */ +static int rxrpc_sendmsg_oob_cmsg(struct msghdr *msg, struct rxrpc_oob_params *p) +{ + struct cmsghdr *cmsg; + int len; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - sizeof(struct cmsghdr); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_OOB_ID: + if (len != sizeof(p->oob_id) || p->have_oob_id) + return -EINVAL; + memcpy(&p->oob_id, CMSG_DATA(cmsg), sizeof(p->oob_id)); + p->have_oob_id = true; + break; + case RXRPC_RESPOND: + if (p->command != RXRPC_OOB_CMD_UNSET) + return -EINVAL; + p->command = RXRPC_OOB_CMD_RESPOND; + break; + case RXRPC_ABORT: + if (len != sizeof(p->abort_code) || p->abort_code) + return -EINVAL; + memcpy(&p->abort_code, CMSG_DATA(cmsg), sizeof(p->abort_code)); + if (p->abort_code == 0) + return -EINVAL; + break; + case RXRPC_RESP_RXGK_APPDATA: + if (p->command != RXRPC_OOB_CMD_RESPOND) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + if (!p->have_oob_id) + return -EBADSLT; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Allow userspace to respond to an OOB using sendmsg(). + */ +static int rxrpc_respond_to_oob(struct rxrpc_sock *rx, + struct rxrpc_oob_params *p, + struct msghdr *msg) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + skb = rxrpc_find_pending_oob(rx, p->oob_id); + if (skb) + rb_erase(&skb->rbnode, &rx->pending_oobq); + release_sock(&rx->sk); + if (!skb) + return -EBADSLT; + + sp = rxrpc_skb(skb); + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + ret = -EPROTO; + if (skb->mark != RXRPC_OOB_CHALLENGE) + break; + conn = sp->chall.conn; + ret = -EOPNOTSUPP; + if (!conn->security->sendmsg_respond_to_challenge) + break; + if (p->abort_code) { + rxrpc_abort_conn(conn, NULL, p->abort_code, -ECONNABORTED, + rxrpc_abort_response_sendmsg); + ret = 0; + } else { + ret = conn->security->sendmsg_respond_to_challenge(skb, msg); + } + break; + default: + ret = -EINVAL; + break; + } + + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* + * Send an out-of-band message or respond to a received out-of-band message. 
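For a userspace client that opted in with setsockopt(SOL_RXRPC, RXRPC_MANAGE_RESPONSE, ...) while the socket was still unbound, responding to a challenge might look like the sketch below; it assumes the 64-bit OOB ID was taken from the RXRPC_OOB_ID control message that recvmsg() attached to the challenge, and it omits any security-class-specific appdata (e.g. RXRPC_RESP_RXGK_APPDATA):

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rxrpc.h>

static int respond_to_challenge(int fd, uint64_t oob_id)
{
	unsigned char control[CMSG_SPACE(sizeof(oob_id)) + CMSG_SPACE(0)];
	struct msghdr msg = {
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cmsg;

	memset(control, 0, sizeof(control));

	/* Identify which pending OOB message is being answered. */
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_RXRPC;
	cmsg->cmsg_type  = RXRPC_OOB_ID;
	cmsg->cmsg_len   = CMSG_LEN(sizeof(oob_id));
	memcpy(CMSG_DATA(cmsg), &oob_id, sizeof(oob_id));

	/* Ask the kernel to generate and transmit the RESPONSE packet. */
	cmsg = CMSG_NXTHDR(&msg, cmsg);
	cmsg->cmsg_level = SOL_RXRPC;
	cmsg->cmsg_type  = RXRPC_RESPOND;
	cmsg->cmsg_len   = CMSG_LEN(0);

	return sendmsg(fd, &msg, MSG_OOB) < 0 ? -1 : 0;
}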
+ * - caller gives us the socket lock + * - the socket may be either a client socket or a server socket + */ +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + struct rxrpc_oob_params p = {}; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_oob_cmsg(msg, &p); + if (ret < 0) + goto error_release_sock; + + if (p.have_oob_id) + return rxrpc_respond_to_oob(rx, &p, msg); + + release_sock(&rx->sk); + + switch (p.command) { + default: + ret = -EINVAL; + break; + } + + _leave(" = %d", ret); + return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; +} + +/** + * rxrpc_kernel_query_oob - Query the parameters of an out-of-band message + * @oob: The message to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * + * Extract useful parameters from an out-of-band message. The source peer + * parameters are returned through the argument list and the message type is + * returned. + * + * Return: + * * %RXRPC_OOB_CHALLENGE - Challenge wanting a response. + */ +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + enum rxrpc_oob_type type = oob->mark; + + switch (type) { + case RXRPC_OOB_CHALLENGE: + *_peer = sp->chall.conn->peer; + *_peer_appdata = sp->chall.conn->peer->app_data; + break; + default: + WARN_ON_ONCE(1); + *_peer = NULL; + *_peer_appdata = 0; + break; + } + + return type; +} +EXPORT_SYMBOL(rxrpc_kernel_query_oob); + +/** + * rxrpc_kernel_dequeue_oob - Dequeue and return the front OOB message + * @sock: The socket to query + * @_type: Where to return the message type + * + * Dequeue the front OOB message, if there is one, and return it and + * its type. + * + * Return: The sk_buff representing the OOB message or %NULL if the queue was + * empty. + */ +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *oob; + + oob = skb_dequeue(&rx->recvmsg_oobq); + if (oob) + *_type = oob->mark; + return oob; +} +EXPORT_SYMBOL(rxrpc_kernel_dequeue_oob); + +/** + * rxrpc_kernel_free_oob - Free an out-of-band message + * @oob: The OOB message to free + * + * Free an OOB message along with any resources it holds. + */ +void rxrpc_kernel_free_oob(struct sk_buff *oob) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + + switch (oob->mark) { + case RXRPC_OOB_CHALLENGE: + rxrpc_put_connection(sp->chall.conn, rxrpc_conn_put_oob); + break; + } + + rxrpc_free_skb(oob, rxrpc_skb_put_purge_oob); +} +EXPORT_SYMBOL(rxrpc_kernel_free_oob); + +/** + * rxrpc_kernel_query_challenge - Query the parameters of a challenge + * @challenge: The challenge to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * @_service_id: Where to return the connection service ID + * @_security_index: Where to return the connection security index + * + * Extract useful parameters from a CHALLENGE message. 
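A kernel service that manages its own responses might drain the OOB queue from a work item scheduled by its ->notify_oob() callback. The sketch below only strings together the helpers added here and leaves the respond-or-reject decision as a comment; my_process_oob() and the surrounding flow are hypothetical:

#include <net/af_rxrpc.h>

static void my_process_oob(struct socket *rxsock)
{
	enum rxrpc_oob_type type;
	struct rxrpc_peer *peer;
	struct sk_buff *oob;
	unsigned long peer_appdata;
	u16 service_id;
	u8 sec_index;

	while ((oob = rxrpc_kernel_dequeue_oob(rxsock, &type))) {
		switch (type) {
		case RXRPC_OOB_CHALLENGE:
			rxrpc_kernel_query_challenge(oob, &peer, &peer_appdata,
						     &service_id, &sec_index);
			/* Look up the key for service_id/sec_index and respond,
			 * or reject the challenge with
			 * rxrpc_kernel_reject_challenge().
			 */
			break;
		default:
			break;
		}
		rxrpc_kernel_free_oob(oob);
	}
}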
+ */ +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + *_peer = sp->chall.conn->peer; + *_peer_appdata = sp->chall.conn->peer->app_data; + *_service_id = sp->hdr.serviceId; + *_security_index = sp->hdr.securityIndex; +} +EXPORT_SYMBOL(rxrpc_kernel_query_challenge); + +/** + * rxrpc_kernel_reject_challenge - Allow a kernel service to reject a challenge + * @challenge: The challenge to be rejected + * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: Indication as to why. + * + * Allow a kernel service to reject a challenge by aborting the connection if + * it's still in an abortable state. The error is returned so this function + * can be used with a return statement. + * + * Return: The %error parameter. + */ +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + _enter("{%x},%d,%d,%u", sp->hdr.serial, abort_code, error, why); + + rxrpc_abort_conn(sp->chall.conn, NULL, abort_code, error, why); + return error; +} +EXPORT_SYMBOL(rxrpc_kernel_reject_challenge); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 95905b85a8d71..8b5903b6e481a 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -18,7 +18,7 @@ extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) { struct sockaddr *sa = msg->msg_name; struct sock *sk = socket->sk; @@ -814,6 +814,9 @@ void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) __be32 code; int ret, ioc; + if (sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) + return; /* Never abort an abort. 
*/ + rxrpc_see_skb(skb, rxrpc_skb_see_reject); iov[0].iov_base = &whdr; @@ -826,7 +829,13 @@ void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) msg.msg_controllen = 0; msg.msg_flags = 0; - memset(&whdr, 0, sizeof(whdr)); + whdr = (struct rxrpc_wire_header) { + .epoch = htonl(sp->hdr.epoch), + .cid = htonl(sp->hdr.cid), + .callNumber = htonl(sp->hdr.callNumber), + .serviceId = htons(sp->hdr.serviceId), + .flags = ~sp->hdr.flags & RXRPC_CLIENT_INITIATED, + }; switch (skb->mark) { case RXRPC_SKB_MARK_REJECT_BUSY: @@ -834,6 +843,9 @@ void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) size = sizeof(whdr); ioc = 1; break; + case RXRPC_SKB_MARK_REJECT_CONN_ABORT: + whdr.callNumber = 0; + fallthrough; case RXRPC_SKB_MARK_REJECT_ABORT: whdr.type = RXRPC_PACKET_TYPE_ABORT; code = htonl(skb->priority); @@ -847,14 +859,6 @@ void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { msg.msg_namelen = srx.transport_len; - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.serviceId = htons(sp->hdr.serviceId); - whdr.flags = sp->hdr.flags; - whdr.flags ^= RXRPC_CLIENT_INITIATED; - whdr.flags &= RXRPC_CLIENT_INITIATED; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, ioc, size); ret = do_udp_sendmsg(local->socket, &msg, size); if (ret < 0) @@ -916,3 +920,64 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) peer->last_tx_at = ktime_get_seconds(); _leave(""); } + +/* + * Send a RESPONSE message. + */ +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(response); + struct scatterlist sg[16]; + struct bio_vec *bvec = conn->local->bvec; + struct msghdr msg; + size_t len = sp->resp.len; + __be32 wserial; + u32 serial = 0; + int ret, nr_sg; + + _enter("C=%x,%x", conn->debug_id, sp->resp.challenge_serial); + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, 0, len); + if (ret < 0) + goto fail; + nr_sg = ret; + ret = -EIO; + if (WARN_ON_ONCE(nr_sg > ARRAY_SIZE(conn->local->bvec))) + goto fail; + + for (int i = 0; i < nr_sg; i++) + bvec_set_page(&bvec[i], sg_page(&sg[i]), sg[i].length, sg[i].offset); + + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, nr_sg, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + serial = rxrpc_get_next_serials(conn, 1); + wserial = htonl(serial); + + trace_rxrpc_tx_response(conn, serial, sp); + + ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial), + &wserial, sizeof(wserial)); + if (ret < 0) + goto fail; + + rxrpc_local_dont_fragment(conn->local, false); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret < 0) + goto fail; + + conn->peer->last_tx_at = ktime_get_seconds(); + return; + +fail: + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_response); + kleave(" = %d", ret); +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 71b6e07bf1616..366431b0736cd 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -149,8 +149,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, * assess the MTU size for the network interface through which this peer is * reached */ -static void rxrpc_assess_MTU_size(struct rxrpc_local *local, - struct rxrpc_peer *peer) +void 
rxrpc_assess_MTU_size(struct rxrpc_local *local, struct rxrpc_peer *peer) { struct net *net = local->net; struct dst_entry *dst; @@ -277,8 +276,6 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, peer->hdrsize += sizeof(struct rxrpc_wire_header); peer->max_data = peer->if_mtu - peer->hdrsize; - - rxrpc_assess_MTU_size(local, peer); } /* @@ -297,6 +294,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, if (peer) { memcpy(&peer->srx, srx, sizeof(*srx)); rxrpc_init_peer(local, peer, hash_key); + rxrpc_assess_MTU_size(local, peer); } _leave(" = %p", peer); @@ -475,6 +473,8 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) * @call: The call to query * * Get a record for the remote peer in a call. + * + * Return: The call's peer record. */ struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) { @@ -486,7 +486,9 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @peer: The peer to query * - * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples. + * Get the call's peer smoothed RTT. + * + * Return: The RTT in uS or %UINT_MAX if we have no samples. */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { @@ -499,7 +501,10 @@ EXPORT_SYMBOL(rxrpc_kernel_get_srtt); * @peer: The peer to query * * Get a pointer to the address from a peer record. The caller is responsible - * for making sure that the address is not deallocated. + * for making sure that the address is not deallocated. A fake address will be + * substituted if %peer is NULL. + * + * Return: The rxrpc address record or a fake record. */ const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer) { @@ -512,7 +517,10 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_srx); * @peer: The peer to query * * Get a pointer to the transport address from a peer record. The caller is - * responsible for making sure that the address is not deallocated. + * responsible for making sure that the address is not deallocated. A fake + * address will be substituted if %peer is NULL. + * + * Return: The transport address record or a fake record. */ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) { @@ -527,7 +535,9 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_addr); * @app_data: The data to set * * Set the app-specific data on a peer. AF_RXRPC makes no effort to retain - * anything the data might refer to. The previous app_data is returned. + * anything the data might refer to. + * + * Return: The previous app_data. */ unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data) { @@ -540,6 +550,8 @@ EXPORT_SYMBOL(rxrpc_kernel_set_peer_data); * @peer: The peer to query * * Retrieve the app-specific data from a peer. + * + * Return: The peer's app data. */ unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer) { diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 42f70e4636f82..f8bfec12bc7e0 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -181,4 +181,24 @@ struct rxkad_response { __be32 ticket_len; /* Kerberos ticket length */ } __packed; +/* + * GSSAPI security type-4 and type-6 data header. + */ +struct rxgk_header { + __be32 epoch; + __be32 cid; + __be32 call_number; + __be32 seq; + __be32 sec_index; + __be32 data_len; +} __packed; + +/* + * GSSAPI security type-4 and type-6 response packet header.
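+ *
+ * On the wire this header is followed by the XDR-encoded token and then by a
+ * length-prefixed, encrypted authenticator.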
+ */ +struct rxgk_response { + __be64 start_time; + __be32 token_len; +} __packed; + #endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 32cd5f1d541db..7fa7e77f6bb99 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -29,6 +29,10 @@ void rxrpc_notify_socket(struct rxrpc_call *call) if (!list_empty(&call->recvmsg_link)) return; + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + rxrpc_see_call(call, rxrpc_call_see_notify_released); + return; + } rcu_read_lock(); @@ -155,6 +159,82 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) } /* + * Transcribe a call's user ID to a control message. + */ +static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg, + int flags) +{ + if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + return 0; + + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + unsigned long idl = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), &idl); + } +} + +/* + * Deal with a CHALLENGE packet. + */ +static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg, + struct sk_buff *challenge, unsigned int flags) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + struct rxrpc_connection *conn = sp->chall.conn; + + return conn->security->challenge_to_recvmsg(conn, challenge, msg); +} + +/* + * Process OOB packets. Called with the socket locked. + */ +static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg, + unsigned int flags) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + bool need_response = false; + int ret; + + skb = skb_peek(&rx->recvmsg_oobq); + if (!skb) + return -EAGAIN; + rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64), + &skb->skb_mstamp_ns); + if (ret < 0) + return ret; + + switch ((enum rxrpc_oob_type)skb->mark) { + case RXRPC_OOB_CHALLENGE: + need_response = true; + ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags); + break; + default: + WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n", + skb->mark); + ret = -EIO; + break; + } + + if (!(flags & MSG_PEEK)) + skb_unlink(skb, &rx->recvmsg_oobq); + if (need_response) + rxrpc_add_pending_oob(rx, skb); + else + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA * (returns 1). 
If more packets are required, it returns -EAGAIN and if the @@ -165,6 +245,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, size_t len, int flags, size_t *_offset) { struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct sk_buff *skb; rxrpc_seq_t seq = 0; size_t remain; @@ -207,7 +288,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, sp->offset, sp->len, ret2); if (ret2 < 0) { - kdebug("verify = %d", ret2); ret = ret2; goto out; } @@ -255,6 +335,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); + + if (!rx->app_ops && + !skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq, + rx_pkt_offset, rx_pkt_len, ret); + break; + } } out: @@ -262,6 +349,7 @@ out: call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } + done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); @@ -301,6 +389,7 @@ try_again: /* Return immediately if a client socket has no outstanding calls */ if (RB_EMPTY_ROOT(&rx->calls) && list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq) && rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); return -EAGAIN; @@ -322,7 +411,8 @@ try_again: if (ret) goto wait_error; - if (list_empty(&rx->recvmsg_q)) { + if (list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq)) { if (signal_pending(current)) goto wait_interrupted; trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); @@ -332,6 +422,15 @@ try_again: goto try_again; } + /* Deal with OOB messages before we consider getting normal data. */ + if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + ret = rxrpc_recvmsg_oob(sock, msg, flags); + release_sock(&rx->sk); + if (ret == -EAGAIN) + goto try_again; + goto error_no_call; + } + /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. 
* We also want to weed out calls that got requeued whilst we were @@ -342,7 +441,8 @@ try_again: call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && - skb_queue_empty(&call->recvmsg_queue)) { + skb_queue_empty(&call->recvmsg_queue) && + skb_queue_empty(&rx->recvmsg_oobq)) { list_del_init(&call->recvmsg_link); spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); @@ -351,6 +451,16 @@ try_again: goto try_again; } + rxrpc_see_call(call, rxrpc_call_see_recvmsg); + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + rxrpc_see_call(call, rxrpc_call_see_already_released); + list_del_init(&call->recvmsg_link); + spin_unlock_irq(&rx->recvmsg_lock); + release_sock(&rx->sk); + trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); + rxrpc_put_call(call, rxrpc_call_put_recvmsg); + goto try_again; + } if (!(flags & MSG_PEEK)) list_del_init(&call->recvmsg_link); else @@ -374,25 +484,18 @@ try_again: release_sock(&rx->sk); - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) - BUG(); - - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - if (flags & MSG_CMSG_COMPAT) { - unsigned int id32 = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned int), &id32); - } else { - unsigned long idl = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned long), &idl); - } - if (ret < 0) - goto error_unlock_call; + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { + rxrpc_see_call(call, rxrpc_call_see_already_released); + mutex_unlock(&call->user_mutex); + if (!(flags & MSG_PEEK)) + rxrpc_put_call(call, rxrpc_call_put_recvmsg); + goto try_again; } + ret = rxrpc_recvmsg_user_id(call, msg, flags); + if (ret < 0) + goto error_unlock_call; + if (msg->msg_name && call->peer) { size_t len = sizeof(call->dest_srx); @@ -477,14 +580,14 @@ wait_error: * @_service: Where to store the actual service ID (may be upgraded) * * Allow a kernel service to receive data and pick up information about the - * state of a call. Returns 0 if got what was asked for and there's more - * available, 1 if we got what was asked for and we're at the end of the data - * and -EAGAIN if we need more data. + * state of a call. Note that *@_abort should also be initialised to %0. * - * Note that we may return -EAGAIN to drain empty packets at the end of the - * data, even if we've already copied over the requested data. + * Note that we may return %-EAGAIN to drain empty packets at the end + * of the data, even if we've already copied over the requested data. * - * *_abort should also be initialised to 0. + * Return: %0 if got what was asked for and there's more available, %1 + * if we got what was asked for and we're at the end of the data and + * %-EAGAIN if we need more data. */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct iov_iter *iter, size_t *_len, diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c new file mode 100644 index 0000000000000..1e19c605bcc82 --- /dev/null +++ b/net/rxrpc/rxgk.c @@ -0,0 +1,1371 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/key-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Parse the information from a server key + */ +static int rxgk_preparse_server_key(struct key_preparsed_payload *prep) +{ + const struct krb5_enctype *krb5; + struct krb5_buffer *server_key = (void *)&prep->payload.data[2]; + unsigned int service, sec_class, kvno, enctype; + int n = 0; + + _enter("%zu", prep->datalen); + + if (sscanf(prep->orig_description, "%u:%u:%u:%u%n", + &service, &sec_class, &kvno, &enctype, &n) != 4) + return -EINVAL; + + if (prep->orig_description[n]) + return -EINVAL; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + prep->payload.data[0] = (struct krb5_enctype *)krb5; + + if (prep->datalen != krb5->key_len) + return -EKEYREJECTED; + + server_key->len = prep->datalen; + server_key->data = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!server_key->data) + return -ENOMEM; + + _leave(" = 0"); + return 0; +} + +static void rxgk_free_server_key(union key_payload *payload) +{ + struct krb5_buffer *server_key = (void *)&payload->data[2]; + + kfree_sensitive(server_key->data); +} + +static void rxgk_free_preparse_server_key(struct key_preparsed_payload *prep) +{ + rxgk_free_server_key(&prep->payload); +} + +static void rxgk_destroy_server_key(struct key *key) +{ + rxgk_free_server_key(&key->payload); +} + +static void rxgk_describe_server_key(const struct key *key, struct seq_file *m) +{ + const struct krb5_enctype *krb5 = key->payload.data[0]; + + if (krb5) + seq_printf(m, ": %s", krb5->name); +} + +/* + * Handle rekeying the connection when we see our limits overrun or when the + * far side decided to rekey. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_rekey(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk, *dead = NULL; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + bool crank = false; + + _enter("%d", specific_key_number ? *specific_key_number : -1); + + mutex_lock(&conn->security_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto crank_window; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto generate_key; + if (!specific_key_number && + test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto crank_window; + +grab: + refcount_inc(&gk->usage); + mutex_unlock(&conn->security_lock); + rxgk_put(dead); + return gk; + +crank_window: + trace_rxrpc_rxgk_rekey(conn, current_key, + specific_key_number ? 
*specific_key_number : -1); + if (current_key == UINT_MAX) + goto bad_key; + if (current_key + 1 == UINT_MAX) + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + + key_number = current_key + 1; + if (WARN_ON(conn->rxgk.keys[key_number & mask])) + goto bad_key; + crank = true; + +generate_key: + gk = conn->rxgk.keys[current_key & mask]; + gk = rxgk_generate_transport_key(conn, gk->key, key_number, GFP_NOFS); + if (IS_ERR(gk)) { + mutex_unlock(&conn->security_lock); + return gk; + } + + write_lock(&conn->security_use_lock); + if (crank) { + current_key++; + conn->rxgk.key_number = current_key; + dead = conn->rxgk.keys[(current_key - 2) & mask]; + conn->rxgk.keys[(current_key - 2) & mask] = NULL; + } + conn->rxgk.keys[current_key & mask] = gk; + write_unlock(&conn->security_use_lock); + goto grab; + +bad_key: + mutex_unlock(&conn->security_lock); + return ERR_PTR(-ESTALE); +} + +/* + * Get the specified keying context. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + + _enter("{%u},%d", + conn->rxgk.key_number, specific_key_number ? *specific_key_number : -1); + + read_lock(&conn->security_use_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + /* Only the bottom 16 bits of the key number are exposed in the + * header, so we try and keep the upper 16 bits in step. The + * whole 32 bits are used to generate the TK. + */ + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto rekey; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto slow_path; + if (!specific_key_number && + key_number < UINT_MAX) { + if (time_after(jiffies, gk->expiry) || + gk->bytes_remaining < 0) { + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); + goto slow_path; + } + + if (test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto slow_path; + } + + refcount_inc(&gk->usage); + read_unlock(&conn->security_use_lock); + return gk; + +rekey: + _debug("rekey"); + if (current_key == UINT_MAX) + goto bad_key; + gk = conn->rxgk.keys[current_key & mask]; + if (gk) + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); +slow_path: + read_unlock(&conn->security_use_lock); + return rxgk_rekey(conn, specific_key_number); +bad_key: + read_unlock(&conn->security_use_lock); + return ERR_PTR(-ESTALE); +} + +/* + * initialise connection security + */ +static int rxgk_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d,%u},{%x}", + conn->debug_id, conn->rxgk.key_number, key_serial(conn->key)); + + conn->security_ix = token->security_index; + conn->security_level = token->rxgk->level; + + if (rxrpc_conn_is_client(conn)) { + conn->rxgk.start_time = ktime_get(); + do_div(conn->rxgk.start_time, 100); + } + + gk = rxgk_generate_transport_key(conn, token->rxgk, conn->rxgk.key_number, + GFP_NOFS); + if (IS_ERR(gk)) + return PTR_ERR(gk); + conn->rxgk.enctype = gk->krb5->etype; + conn->rxgk.keys[gk->key_number & 3] = gk; + + switch (conn->security_level) { + case RXRPC_SECURITY_PLAIN: + case RXRPC_SECURITY_AUTH: + case RXRPC_SECURITY_ENCRYPT: 
+ break; + default: + ret = -EKEYREJECTED; + goto error; + } + + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Clean up the crypto on a call. + */ +static void rxgk_free_call_crypto(struct rxrpc_call *call) +{ +} + +/* + * Work out how much data we can put in a packet. + */ +static struct rxrpc_txbuf *rxgk_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) +{ + enum krb5_crypto_mode mode; + struct rxgk_context *gk; + struct rxrpc_txbuf *txb; + size_t shdr, alloc, limit, part, offset, gap; + + switch (call->conn->security_level) { + default: + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); + case RXRPC_SECURITY_AUTH: + shdr = 0; + mode = KRB5_CHECKSUM_MODE; + break; + case RXRPC_SECURITY_ENCRYPT: + shdr = sizeof(struct rxgk_header); + mode = KRB5_ENCRYPT_MODE; + break; + } + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return NULL; + + /* Work out the maximum amount of data that will fit. */ + alloc = RXRPC_JUMBO_DATALEN; + limit = crypto_krb5_how_much_data(gk->krb5, mode, &alloc, &offset); + + if (remain < limit - shdr) { + part = remain; + alloc = crypto_krb5_how_much_buffer(gk->krb5, mode, + shdr + part, &offset); + gap = 0; + } else { + part = limit - shdr; + gap = RXRPC_JUMBO_DATALEN - alloc; + alloc = RXRPC_JUMBO_DATALEN; + } + + rxgk_put(gk); + + txb = rxrpc_alloc_data_txbuf(call, alloc, 16, gfp); + if (!txb) + return NULL; + + txb->crypto_header = offset; + txb->sec_header = shdr; + txb->offset += offset + shdr; + txb->space = part; + + /* Clear excess space in the packet */ + if (gap) + memset(txb->data + alloc - gap, 0, gap); + return txb; +} + +/* + * Integrity mode (sign a packet - level 1 security) + */ +static int rxgk_secure_packet_integrity(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + struct krb5_buffer metadata; + int ret = -ENOMEM; + + _enter(""); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto error_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + metadata.len = sizeof(*hdr); + metadata.data = hdr; + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_get_mic(gk->krb5, gk->tx_Kc, &metadata, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + kfree(hdr); +error_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * wholly encrypt a packet (level 2 security) + */ +static int rxgk_secure_packet_encrypted(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + int ret; + + _enter("%x", txb->len); + + /* Insert the header into the buffer. 
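The header is placed immediately before the app data within the plaintext so that it gets encrypted along with the payload.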
*/ + hdr = txb->data + txb->crypto_header; + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_encrypt(gk->krb5, gk->tx_enc, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len, + false); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * checksum an RxRPC packet header + */ +static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d{%x}},{#%u},%u,", + call->debug_id, key_serial(call->conn->key), txb->seq, txb->len); + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk) == -ESTALE ? -EKEYREJECTED : PTR_ERR(gk); + + ret = key_validate(call->conn->key); + if (ret < 0) { + rxgk_put(gk); + return ret; + } + + call->security_enctype = gk->krb5->etype; + txb->cksum = htons(gk->key_number); + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + txb->pkt_len = txb->len; + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_secure_packet_integrity(call, gk, txb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_secure_packet_encrypted(call, gk, txb); + default: + rxgk_put(gk); + return -EPERM; + } +} + +/* + * Integrity mode (check the signature on a packet - level 1 security) + */ +static int rxgk_verify_packet_integrity(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header *hdr; + struct krb5_buffer metadata; + unsigned int offset = sp->offset, len = sp->len; + size_t data_offset = 0, data_len = len; + u32 ac; + int ret = -ENOMEM; + + _enter(""); + + crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto put_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(sp->hdr.seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(data_len); + + metadata.len = sizeof(*hdr); + metadata.data = hdr; + ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, + skb, &offset, &len, &ac); + kfree(hdr); + if (ret == -EPROTO) { + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); + } else { + sp->offset = offset; + sp->len = len; + } + +put_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Decrypt an encrypted packet (level 2 security). 
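+ *
+ * The data is decrypted in place and the embedded rxgk_header is then checked
+ * against the connection and call parameters before its data_len is trusted.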
+ */ +static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header hdr; + unsigned int offset = sp->offset, len = sp->len; + int ret; + u32 ac; + + _enter(""); + + ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); + if (ret == -EPROTO) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); + if (ret < 0) + goto error; + + if (len < sizeof(hdr)) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + + /* Extract the header from the skb */ + ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr)); + if (ret < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_encdata); + goto error; + } + offset += sizeof(hdr); + len -= sizeof(hdr); + + if (ntohl(hdr.epoch) != call->conn->proto.epoch || + ntohl(hdr.cid) != call->cid || + ntohl(hdr.call_number) != call->call_id || + ntohl(hdr.seq) != sp->hdr.seq || + ntohl(hdr.sec_index) != call->security_ix || + ntohl(hdr.data_len) > len) { + ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON, + rxgk_abort_2_short_data); + goto error; + } + + sp->offset = offset; + sp->len = ntohl(hdr.data_len); + ret = 0; +error: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). + */ +static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_context *gk; + u16 key_number = sp->hdr.cksum; + + _enter("{%d{%x}},{#%u}", + call->debug_id, key_serial(call->conn->key), sp->hdr.seq); + + gk = rxgk_get_key(call->conn, &key_number); + if (IS_ERR(gk)) { + switch (PTR_ERR(gk)) { + case -ESTALE: + return rxrpc_abort_eproto(call, skb, RXGK_BADKEYNO, + rxgk_abort_bad_key_number); + default: + return PTR_ERR(gk); + } + } + + call->security_enctype = gk->krb5->etype; + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_verify_packet_integrity(call, gk, skb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_verify_packet_encrypted(call, gk, skb); + default: + rxgk_put(gk); + return -ENOANO; + } +} + +/* + * Allocate memory to hold a challenge or a response packet. We're not running + * in the io_thread, so we can't use ->tx_alloc. + */ +static struct page *rxgk_alloc_packet(size_t total_len) +{ + gfp_t gfp = GFP_NOFS; + int order; + + order = get_order(total_len); + if (order > 0) + gfp |= __GFP_COMP; + return alloc_pages(gfp, order); +} + +/* + * Issue a challenge. 
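+ *
+ * A CHALLENGE is a bare wire header followed by a freshly generated random
+ * nonce; the peer must echo the nonce back in the encrypted authenticator of
+ * its RESPONSE.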
+ */ +static int rxgk_issue_challenge(struct rxrpc_connection *conn) +{ + struct rxrpc_wire_header *whdr; + struct bio_vec bvec[1]; + struct msghdr msg; + struct page *page; + size_t len = sizeof(*whdr) + sizeof(conn->rxgk.nonce); + u32 serial; + int ret; + + _enter("{%d}", conn->debug_id); + + get_random_bytes(&conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + /* We can't use conn->tx_alloc without a lock */ + page = rxgk_alloc_packet(sizeof(*whdr) + sizeof(conn->rxgk.nonce)); + if (!page) + return -ENOMEM; + + bvec_set_page(&bvec[0], page, len, 0); + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, 1, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + whdr = page_address(page); + whdr->epoch = htonl(conn->proto.epoch); + whdr->cid = htonl(conn->proto.cid); + whdr->callNumber = 0; + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_CHALLENGE; + whdr->flags = conn->out_clientflag; + whdr->userStatus = 0; + whdr->securityIndex = conn->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(conn->service_id); + + memcpy(whdr + 1, conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + serial = rxrpc_get_next_serials(conn, 1); + whdr->serial = htonl(serial); + + trace_rxrpc_tx_challenge(conn, serial, 0, *(u32 *)&conn->rxgk.nonce); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret > 0) + conn->peer->last_tx_at = ktime_get_seconds(); + __free_page(page); + + if (ret < 0) { + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_rxgk_challenge); + return -EAGAIN; + } + + trace_rxrpc_tx_packet(conn->debug_id, whdr, + rxrpc_tx_point_rxgk_challenge); + _leave(" = 0"); + return 0; +} + +/* + * Validate a challenge packet. + */ +static bool rxgk_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + u8 nonce[20]; + + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxgk_abort_chall_no_key); + return false; + } + + if (key_validate(conn->key) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + return false; + } + + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + nonce, sizeof(nonce)) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_chall_short); + return false; + } + + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, 0, *(u32 *)nonce, 0); + return true; +} + +/** + * rxgk_kernel_query_challenge - Query RxGK-specific challenge parameters + * @challenge: The challenge packet to query + * + * Return: The Kerberos 5 encoding type for the challenged connection. + */ +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + return sp->chall.conn->rxgk.enctype; +} +EXPORT_SYMBOL(rxgk_kernel_query_challenge); + +/* + * Fill out the control message to pass to userspace to inform about the + * challenge. + */ +static int rxgk_challenge_to_recvmsg(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg) +{ + struct rxgk_challenge chall; + + chall.base.service_id = conn->service_id; + chall.base.security_index = conn->security_ix; + chall.enctype = conn->rxgk.enctype; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_CHALLENGED, sizeof(chall), &chall); +} + +/* + * Insert the requisite amount of XDR padding for the length given. 
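+ *
+ * XDR aligns objects to 4 bytes, so, for example, a 9-byte object is followed
+ * by 3 zero bytes of padding and a 12-byte object by none.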
+ */ +static int rxgk_pad_out(struct sk_buff *response, size_t len, size_t offset) +{ + __be32 zero = 0; + size_t pad = xdr_round_up(len) - len; + int ret; + + if (!pad) + return 0; + + ret = skb_store_bits(response, offset, &zero, pad); + if (ret < 0) + return ret; + return pad; +} + +/* + * Insert the header into the response. + */ +static noinline ssize_t rxgk_insert_response_header(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset) +{ + struct rxrpc_skb_priv *rsp = rxrpc_skb(response); + + struct { + struct rxrpc_wire_header whdr; + __be32 start_time_msw; + __be32 start_time_lsw; + __be32 ticket_len; + } h; + int ret; + + rsp->resp.kvno = gk->key_number; + rsp->resp.version = gk->krb5->etype; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = htons(gk->key_number); + h.whdr.serviceId = htons(conn->service_id); + h.start_time_msw = htonl(upper_32_bits(conn->rxgk.start_time)); + h.start_time_lsw = htonl(lower_32_bits(conn->rxgk.start_time)); + h.ticket_len = htonl(gk->key->ticket.len); + + ret = skb_store_bits(response, offset, &h, sizeof(h)); + return ret < 0 ? ret : sizeof(h); +} + +/* + * Construct the authenticator to go in the response packet + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static ssize_t rxgk_construct_authenticator(struct rxrpc_connection *conn, + struct sk_buff *challenge, + const struct krb5_buffer *appdata, + struct sk_buff *response, + size_t offset) +{ + struct { + u8 nonce[20]; + __be32 appdata_len; + } a; + struct { + __be32 level; + __be32 epoch; + __be32 cid; + __be32 call_numbers_count; + __be32 call_numbers[4]; + } b; + int ret; + + ret = skb_copy_bits(challenge, sizeof(struct rxrpc_wire_header), + a.nonce, sizeof(a.nonce)); + if (ret < 0) + return -EPROTO; + + a.appdata_len = htonl(appdata->len); + + ret = skb_store_bits(response, offset, &a, sizeof(a)); + if (ret < 0) + return ret; + offset += sizeof(a); + + if (appdata->len) { + ret = skb_store_bits(response, offset, appdata->data, appdata->len); + if (ret < 0) + return ret; + offset += appdata->len; + + ret = rxgk_pad_out(response, appdata->len, offset); + if (ret < 0) + return ret; + offset += ret; + } + + b.level = htonl(conn->security_level); + b.epoch = htonl(conn->proto.epoch); + b.cid = htonl(conn->proto.cid); + b.call_numbers_count = htonl(4); + b.call_numbers[0] = htonl(conn->channels[0].call_counter); + b.call_numbers[1] = htonl(conn->channels[1].call_counter); + b.call_numbers[2] = htonl(conn->channels[2].call_counter); + b.call_numbers[3] = htonl(conn->channels[3].call_counter); + + ret = skb_store_bits(response, offset, &b, sizeof(b)); + if (ret < 0) + return ret; + return sizeof(a) + xdr_round_up(appdata->len) + sizeof(b); +} + +static ssize_t rxgk_encrypt_authenticator(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset, + size_t alloc_len, + size_t auth_offset, + size_t auth_len) +{ + struct scatterlist sg[16]; + int nr_sg; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(response, sg, offset, alloc_len); + if (unlikely(nr_sg < 0)) + return nr_sg; + return 
crypto_krb5_encrypt(gk->krb5, gk->resp_enc, sg, nr_sg, alloc_len, + auth_offset, auth_len, false); +} + +/* + * Construct the response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator<RXGK_MAXAUTHENTICATOR> + * }; + */ +static int rxgk_construct_response(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp, *rsp; + struct rxgk_context *gk; + struct sk_buff *response; + size_t len, auth_len, authx_len, offset, auth_offset, authx_offset; + __be32 tmp; + int ret; + + gk = rxgk_get_key(conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk); + + auth_len = 20 + (4 + appdata->len) + 12 + (1 + 4) * 4; + authx_len = crypto_krb5_how_much_buffer(gk->krb5, KRB5_ENCRYPT_MODE, + auth_len, &auth_offset); + len = sizeof(struct rxrpc_wire_header) + + 8 + (4 + xdr_round_up(gk->key->ticket.len)) + (4 + authx_len); + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxgk); + response->len = len; + response->data_len = len; + + ret = rxgk_insert_response_header(conn, gk, response, 0); + if (ret < 0) + goto error; + offset = ret; + + ret = skb_store_bits(response, offset, gk->key->ticket.data, gk->key->ticket.len); + if (ret < 0) + goto error; + offset += gk->key->ticket.len; + ret = rxgk_pad_out(response, gk->key->ticket.len, offset); + if (ret < 0) + goto error; + + authx_offset = offset + ret + 4; /* Leave a gap for the length. */ + + ret = rxgk_construct_authenticator(conn, challenge, appdata, response, + authx_offset + auth_offset); + if (ret < 0) + goto error; + auth_len = ret; + + ret = rxgk_encrypt_authenticator(conn, gk, response, + authx_offset, authx_len, + auth_offset, auth_len); + if (ret < 0) + goto error; + authx_len = ret; + + tmp = htonl(authx_len); + ret = skb_store_bits(response, authx_offset - 4, &tmp, 4); + if (ret < 0) + goto error; + + ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len); + if (ret < 0) + goto error; + len = authx_offset + authx_len + ret; + + if (len != response->len) { + response->len = len; + response->data_len = len; + } + + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Respond to a challenge packet. + */ +static int rxgk_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + if (key_validate(conn->key) < 0) + return rxrpc_abort_conn(conn, NULL, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + + return rxgk_construct_response(conn, challenge, appdata); +} + +static int rxgk_respond_to_challenge_no_appdata(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + struct krb5_buffer appdata = {}; + + return rxgk_respond_to_challenge(conn, challenge, &appdata); +} + +/** + * rxgk_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * @appdata: The application data to include in the RESPONSE authenticator + * + * Allow a kernel application to respond to a CHALLENGE with application data + * to be included in the RxGK RESPONSE Authenticator. 
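+ * The appdata is placed in the authenticator's appdata<> field and so is
+ * covered by the authenticator encryption.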
+ * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxgk_respond_to_challenge(csp->chall.conn, challenge, appdata); +} +EXPORT_SYMBOL(rxgk_kernel_respond_to_challenge); + +/* + * Parse sendmsg() control message and respond to challenge. We need to see if + * there's an appdata to fish out. + */ +static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + struct krb5_buffer appdata = {}; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (cmsg->cmsg_level != SOL_RXRPC || + cmsg->cmsg_type != RXRPC_RESP_RXGK_APPDATA) + continue; + if (appdata.data) + return -EINVAL; + appdata.data = CMSG_DATA(cmsg); + appdata.len = cmsg->cmsg_len - sizeof(struct cmsghdr); + } + + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +} + +/* + * Verify the authenticator. + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + __be32 *p, __be32 *end) +{ + u32 app_len, call_count, level, epoch, cid, i; + + _enter(""); + + if (memcmp(p, conn->rxgk.nonce, 20) != 0) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_nonce); + p += 20 / sizeof(__be32); + + app_len = ntohl(*p++); + if (app_len > (end - p) * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + p += xdr_round_up(app_len) / sizeof(__be32); + if (end - p < 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + level = ntohl(*p++); + epoch = ntohl(*p++); + cid = ntohl(*p++); + call_count = ntohl(*p++); + + if (level != conn->security_level || + epoch != conn->proto.epoch || + cid != conn->proto.cid || + call_count > 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_param); + + if (end - p < call_count) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_call_list); + + for (i = 0; i < call_count; i++) { + u32 call_id = ntohl(*p++); + + if (call_id > INT_MAX) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_callid); + + if (call_id < conn->channels[i].call_counter) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_ctr); + + if (call_id > conn->channels[i].call_counter) { + if (conn->channels[i].call) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_state); + + conn->channels[i].call_counter = call_id; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * Extract the authenticator and verify it. 
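+ *
+ * The decrypted authenticator is copied out of the skb into a temporary
+ * buffer so that it can be walked as a sequence of XDR words.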
+ */ +static int rxgk_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + unsigned int auth_offset, unsigned int auth_len) +{ + void *auth; + __be32 *p; + int ret; + + auth = kmalloc(auth_len, GFP_NOFS); + if (!auth) + return -ENOMEM; + + ret = skb_copy_bits(skb, auth_offset, auth, auth_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_auth); + goto error; + } + + p = auth; + ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, p + auth_len); +error: + kfree(auth); + return ret; +} + +/* + * Verify a response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator<RXGK_MAXAUTHENTICATOR> + * }; + */ +static int rxgk_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + const struct krb5_enctype *krb5; + struct rxrpc_key_token *token; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_response rhdr; + struct rxgk_context *gk; + struct key *key = NULL; + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); + unsigned int token_offset, token_len; + unsigned int auth_offset, auth_len; + __be32 xauth_len; + int ret, ec; + + _enter("{%d}", conn->debug_id); + + /* Parse the RXGK_Response object */ + if (sizeof(rhdr) + sizeof(__be32) > len) + goto short_packet; + + if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0) + goto short_packet; + offset += sizeof(rhdr); + len -= sizeof(rhdr); + + token_offset = offset; + token_len = ntohl(rhdr.token_len); + if (xdr_round_up(token_len) + sizeof(__be32) > len) + goto short_packet; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); + + offset += xdr_round_up(token_len); + len -= xdr_round_up(token_len); + + if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0) + goto short_packet; + offset += sizeof(xauth_len); + len -= sizeof(xauth_len); + + auth_offset = offset; + auth_len = ntohl(xauth_len); + if (auth_len < len) + goto short_packet; + if (auth_len & 3) + goto inconsistent; + if (auth_len < 20 + 9 * 4) + goto auth_too_short; + + /* We need to extract and decrypt the token and instantiate a session + * key for it. This bit, however, is application-specific. If + * possible, we use a default parser, but we might end up bumping this + * to the app to deal with - which might mean a round trip to + * userspace. + */ + ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key); + if (ret < 0) + goto out; + + /* We now have a key instantiated from the decrypted ticket. We can + * pass this to the application so that they can parse the ticket + * content and we can use the session key it contains to derive the + * keys we need. + * + * Note that we have to switch enctype at this point as the enctype of + * the ticket doesn't necessarily match that of the transport. + */ + token = key->payload.data[0]; + conn->security_level = token->rxgk->level; + conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time); + + gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS); + if (IS_ERR(gk)) { + ret = PTR_ERR(gk); + goto cant_get_token; + } + + krb5 = gk->krb5; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len); + + /* Decrypt, parse and verify the authenticator. 
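The authenticator was sealed with the response encryption key derived from the K0 carried in the token.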
*/ + ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb, + &auth_offset, &auth_len, &ec); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, + rxgk_abort_resp_auth_dec); + goto out; + } + + ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); + if (ret < 0) + goto out; + + conn->key = key; + key = NULL; + ret = 0; +out: + key_put(key); + _leave(" = %d", ret); + return ret; + +inconsistent: + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_xdr_align); + goto out; +auth_too_short: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_auth); + goto out; +short_packet: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_packet); + goto out; + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_internal_error); + goto out; + case -ENOPKG: + ret = rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_nopkg); + goto out; + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + goto out; +} + +/* + * clear the connection security + */ +static void rxgk_clear(struct rxrpc_connection *conn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(conn->rxgk.keys); i++) + rxgk_put(conn->rxgk.keys[i]); +} + +/* + * Initialise the RxGK security service. + */ +static int rxgk_init(void) +{ + return 0; +} + +/* + * Clean up the RxGK security service. + */ +static void rxgk_exit(void) +{ +} + +/* + * RxRPC YFS GSSAPI-based security + */ +const struct rxrpc_security rxgk_yfs = { + .name = "yfs-rxgk", + .security_index = RXRPC_SECURITY_YFS_RXGK, + .no_key_abort = RXGK_NOTAUTH, + .init = rxgk_init, + .exit = rxgk_exit, + .preparse_server_key = rxgk_preparse_server_key, + .free_preparse_server_key = rxgk_free_preparse_server_key, + .destroy_server_key = rxgk_destroy_server_key, + .describe_server_key = rxgk_describe_server_key, + .init_connection_security = rxgk_init_connection_security, + .alloc_txbuf = rxgk_alloc_txbuf, + .secure_packet = rxgk_secure_packet, + .verify_packet = rxgk_verify_packet, + .free_call_crypto = rxgk_free_call_crypto, + .issue_challenge = rxgk_issue_challenge, + .validate_challenge = rxgk_validate_challenge, + .challenge_to_recvmsg = rxgk_challenge_to_recvmsg, + .sendmsg_respond_to_challenge = rxgk_sendmsg_respond_to_challenge, + .respond_to_challenge = rxgk_respond_to_challenge_no_appdata, + .verify_response = rxgk_verify_response, + .clear = rxgk_clear, + .default_decode_ticket = rxgk_yfs_decode_ticket, +}; diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c new file mode 100644 index 0000000000000..b94b77a1c3178 --- /dev/null +++ b/net/rxrpc/rxgk_app.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Application-specific bits for GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/key-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Decode a default-style YFS ticket in a response and turn it into an + * rxrpc-type key. 
+ * + * struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + * + * struct RXGK_AuthName { + * afs_int32 kind; + * opaque data<AUTHDATAMAX>; + * opaque display<AUTHPRINTABLEMAX>; + * }; + * + * struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key) +{ + struct rxrpc_key_token *token; + const struct cred *cred = current_cred(); // TODO - use socket creds + struct key *key; + size_t pre_ticket_len, payload_len; + unsigned int klen, enctype; + void *payload, *ticket; + __be32 *t, *p, *q, tmp[2]; + int ret; + + _enter(""); + + /* Get the session key length */ + ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); + if (ret < 0) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_klen); + enctype = ntohl(tmp[0]); + klen = ntohl(tmp[1]); + + if (klen > ticket_len - 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_key); + + pre_ticket_len = ((5 + 14) * sizeof(__be32) + + xdr_round_up(klen) + + sizeof(__be32)); + payload_len = pre_ticket_len + xdr_round_up(ticket_len); + + payload = kzalloc(payload_len, GFP_NOFS); + if (!payload) + return -ENOMEM; + + /* We need to fill out the XDR form for a key payload that we can pass + * to add_key(). Start by copying in the ticket so that we can parse + * it. + */ + ticket = payload + pre_ticket_len; + ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + goto error; + } + + /* Fill out the form header. */ + p = payload; + p[0] = htonl(0); /* Flags */ + p[1] = htonl(1); /* len(cellname) */ + p[2] = htonl(0x20000000); /* Cellname " " */ + p[3] = htonl(1); /* #tokens */ + p[4] = htonl(15 * sizeof(__be32) + xdr_round_up(klen) + + xdr_round_up(ticket_len)); /* Token len */ + + /* Now fill in the body. Most of this we can just scrape directly from + * the ticket. + */ + t = ticket + sizeof(__be32) * 2 + xdr_round_up(klen); + q = payload + 5 * sizeof(__be32); + q[0] = htonl(RXRPC_SECURITY_YFS_RXGK); + q[1] = t[1]; /* begintime - msw */ + q[2] = t[2]; /* - lsw */ + q[3] = t[5]; /* endtime - msw */ + q[4] = t[6]; /* - lsw */ + q[5] = 0; /* level - msw */ + q[6] = t[0]; /* - lsw */ + q[7] = 0; /* lifetime - msw */ + q[8] = t[3]; /* - lsw */ + q[9] = 0; /* bytelife - msw */ + q[10] = t[4]; /* - lsw */ + q[11] = 0; /* enctype - msw */ + q[12] = htonl(enctype); /* - lsw */ + q[13] = htonl(klen); /* Key length */ + + q += 14; + + memcpy(q, ticket + sizeof(__be32) * 2, klen); + q += xdr_round_up(klen) / 4; + q[0] = htonl(ticket_len); + q++; + if (WARN_ON((unsigned long)q != (unsigned long)ticket)) { + ret = -EIO; + goto error; + } + + /* Ticket read in with skb_copy_bits above */ + q += xdr_round_up(ticket_len) / 4; + if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) { + ret = -EIO; + goto error; + } + + /* Now turn that into a key. 
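The XDR payload assembled above is handed to the rxrpc key type exactly as if it had arrived through add_key().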
*/ + key = key_alloc(&key_type_rxrpc, "x", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, // TODO: Use socket owner + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + ret = PTR_ERR(key); + goto error; + } + + _debug("key %d", key_serial(key)); + + ret = key_instantiate_and_link(key, payload, payload_len, NULL, NULL); + if (ret < 0) + goto error_key; + + token = key->payload.data[0]; + token->no_leak_key = true; + *_key = key; + key = NULL; + ret = 0; + goto error; + +error_key: + key_put(key); +error: + kfree_sensitive(payload); + _leave(" = %d", ret); + return ret; +} + +/* + * Extract the token and set up a session key from the details. + * + * struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + * + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1] + */ +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key) +{ + const struct krb5_enctype *krb5; + const struct krb5_buffer *server_secret; + struct crypto_aead *token_enc = NULL; + struct key *server_key; + unsigned int ticket_offset, ticket_len; + u32 kvno, enctype; + int ret, ec; + + struct { + __be32 kvno; + __be32 enctype; + __be32 token_len; + } container; + + /* Decode the RXGK_TokenContainer object. This tells us which server + * key we should be using. We can then fetch the key, get the secret + * and set up the crypto to extract the token. + */ + if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + kvno = ntohl(container.kvno); + enctype = ntohl(container.enctype); + ticket_len = ntohl(container.token_len); + ticket_offset = token_offset + sizeof(container); + + if (xdr_round_up(ticket_len) > token_len - 3 * 4) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + _debug("KVNO %u", kvno); + _debug("ENC %u", enctype); + _debug("TLEN %u", ticket_len); + + server_key = rxrpc_look_up_server_security(conn, skb, kvno, enctype); + if (IS_ERR(server_key)) + goto cant_get_server_key; + + down_read(&server_key->sem); + server_secret = (const void *)&server_key->payload.data[2]; + ret = rxgk_set_up_token_cipher(server_secret, &token_enc, enctype, &krb5, GFP_NOFS); + up_read(&server_key->sem); + key_put(server_key); + if (ret < 0) + goto cant_get_token; + + /* We can now decrypt and parse the token/ticket. This allows us to + * gain access to K0, from which we can derive the transport key and + * thence decode the authenticator. 
+ */ + ret = rxgk_decrypt_skb(krb5, token_enc, skb, + &ticket_offset, &ticket_len, &ec); + crypto_free_aead(token_enc); + token_enc = NULL; + if (ret < 0) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + + ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, + ticket_len, _key); + if (ret < 0) + goto cant_get_token; + + _leave(" = 0"); + return ret; + +cant_get_server_key: + ret = PTR_ERR(server_key); + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -ENOKEY: + case -EKEYREJECTED: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EPERM: + return rxrpc_abort_conn(conn, skb, RXGK_BADKEYNO, -EKEYREJECTED, + rxgk_abort_resp_tok_nokey); + default: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_keyerr); + } + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_internal_error); + case -ENOPKG: + return rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_tok_nopkg); + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + return ret; +} diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h new file mode 100644 index 0000000000000..7370a56559853 --- /dev/null +++ b/net/rxrpc/rxgk_common.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Common bits for GSSAPI-based RxRPC security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <crypto/krb5.h> +#include <crypto/skcipher.h> +#include <crypto/hash.h> + +/* + * Per-key number context. This is replaced when the connection is rekeyed. + */ +struct rxgk_context { + refcount_t usage; + unsigned int key_number; /* Rekeying number (goes in the rx header) */ + unsigned long flags; +#define RXGK_TK_NEEDS_REKEY 0 /* Set if this needs rekeying */ + unsigned long expiry; /* Expiration time of this key */ + long long bytes_remaining; /* Remaining Tx lifetime of this key */ + const struct krb5_enctype *krb5; /* RxGK encryption type */ + const struct rxgk_key *key; + + /* We need up to 7 keys derived from the transport key, but we don't + * actually need the transport key. Each key is derived by + * DK(TK,constant). 
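+ * The constants are the usage numbers defined in rxgk_kdf.c, with separate
+ * values for client and server packet encryption, packet MICs and RESPONSE
+ * encryption.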
+ */ + struct crypto_aead *tx_enc; /* Transmission key */ + struct crypto_aead *rx_enc; /* Reception key */ + struct crypto_shash *tx_Kc; /* Transmission checksum key */ + struct crypto_shash *rx_Kc; /* Reception checksum key */ + struct crypto_aead *resp_enc; /* Response packet enc key */ +}; + +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_object_len(x) (4 + xdr_round_up(x)) + +/* + * rxgk_app.c + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key); + +/* + * rxgk_kdf.c + */ +void rxgk_put(struct rxgk_context *gk); +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp); +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_key, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp); + +/* + * Apply decryption and checksumming functions to part of an skbuff. The + * offset and length are updated to reflect the actual content of the encrypted + * region. + */ +static inline +int rxgk_decrypt_skb(const struct krb5_enctype *krb5, + struct crypto_aead *aead, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + int *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} + +/* + * Check the MIC on a region of an skbuff. The offset and length are updated + * to reflect the actual content of the secure region. + */ +static inline +int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, + struct crypto_shash *shash, + const struct krb5_buffer *metadata, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + u32 *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} diff --git a/net/rxrpc/rxgk_kdf.c b/net/rxrpc/rxgk_kdf.c new file mode 100644 index 0000000000000..b4db5aa30e5b9 --- /dev/null +++ b/net/rxrpc/rxgk_kdf.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxGK transport key derivation. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/key-type.h> +#include <linux/slab.h> +#include <keys/rxrpc-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +#define round16(x) (((x) + 15) & ~15) + +/* + * Constants used to derive the keys and hmacs actually used for doing stuff. 
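+ *
+ * Each value is the constant passed to DK(TK, constant) when deriving the key
+ * for the corresponding usage.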
+ */ +#define RXGK_CLIENT_ENC_PACKET 1026U // 0x402 +#define RXGK_CLIENT_MIC_PACKET 1027U // 0x403 +#define RXGK_SERVER_ENC_PACKET 1028U // 0x404 +#define RXGK_SERVER_MIC_PACKET 1029U // 0x405 +#define RXGK_CLIENT_ENC_RESPONSE 1030U // 0x406 +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c + +static void rxgk_free(struct rxgk_context *gk) +{ + if (gk->tx_Kc) + crypto_free_shash(gk->tx_Kc); + if (gk->rx_Kc) + crypto_free_shash(gk->rx_Kc); + if (gk->tx_enc) + crypto_free_aead(gk->tx_enc); + if (gk->rx_enc) + crypto_free_aead(gk->rx_enc); + if (gk->resp_enc) + crypto_free_aead(gk->resp_enc); + kfree(gk); +} + +void rxgk_put(struct rxgk_context *gk) +{ + if (gk && refcount_dec_and_test(&gk->usage)) + rxgk_free(gk); +} + +/* + * Transport key derivation function. + * + * TK = random-to-key(PRF+(K0, L, + * epoch || cid || start_time || key_number)) + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-11 sec 8.3] + */ +static int rxgk_derive_transport_key(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + struct krb5_buffer *TK, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct krb5_buffer conn_info; + unsigned int L = krb5->key_bytes; + __be32 *info; + u8 *buffer; + int ret; + + _enter(""); + + conn_info.len = sizeof(__be32) * 5; + + buffer = kzalloc(round16(conn_info.len), gfp); + if (!buffer) + return -ENOMEM; + + conn_info.data = buffer; + + info = (__be32 *)conn_info.data; + info[0] = htonl(conn->proto.epoch); + info[1] = htonl(conn->proto.cid); + info[2] = htonl(conn->rxgk.start_time >> 32); + info[3] = htonl(conn->rxgk.start_time >> 0); + info[4] = htonl(gk->key_number); + + ret = crypto_krb5_calc_PRFplus(krb5, &rxgk->key, L, &conn_info, TK, gfp); + kfree_sensitive(buffer); + _leave(" = %d", ret); + return ret; +} + +/* + * Set up the ciphers for the usage keys. 
+ */ +static int rxgk_set_up_ciphers(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct crypto_shash *shash; + struct crypto_aead *aead; + struct krb5_buffer TK; + bool service = rxrpc_conn_is_service(conn); + int ret; + u8 *buffer; + + buffer = kzalloc(krb5->key_bytes, gfp); + if (!buffer) + return -ENOMEM; + + TK.len = krb5->key_bytes; + TK.data = buffer; + + ret = rxgk_derive_transport_key(conn, gk, rxgk, &TK, gfp); + if (ret < 0) + goto out; + + aead = crypto_krb5_prepare_encryption(krb5, &TK, RXGK_CLIENT_ENC_RESPONSE, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->resp_enc = aead; + + if (crypto_aead_blocksize(gk->resp_enc) != krb5->block_len || + crypto_aead_authsize(gk->resp_enc) != krb5->cksum_len) { + pr_notice("algo inconsistent with krb5 table %u!=%u or %u!=%u\n", + crypto_aead_blocksize(gk->resp_enc), krb5->block_len, + crypto_aead_authsize(gk->resp_enc), krb5->cksum_len); + ret = -EINVAL; + goto out; + } + + if (service) { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } else { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } + + ret = 0; +out: + kfree_sensitive(buffer); + return ret; +aead_error: + ret = PTR_ERR(aead); + goto out; +hash_error: + ret = PTR_ERR(shash); + goto out; +} + +/* + * Derive a transport key for a connection and then derive a bunch of usage + * keys from it and set up ciphers using them. + */ +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp) +{ + struct rxgk_context *gk; + unsigned long lifetime; + int ret = -ENOPKG; + + _enter(""); + + gk = kzalloc(sizeof(*gk), GFP_KERNEL); + if (!gk) + return ERR_PTR(-ENOMEM); + refcount_set(&gk->usage, 1); + gk->key = key; + gk->key_number = key_number; + + gk->krb5 = crypto_krb5_find_enctype(key->enctype); + if (!gk->krb5) + goto err_tk; + + ret = rxgk_set_up_ciphers(conn, gk, key, gfp); + if (ret) + goto err_tk; + + /* Set the remaining number of bytes encrypted with this key that may + * be transmitted before rekeying. Note that the spec has been + * interpreted differently on this point... 
+ */ + switch (key->bytelife) { + case 0: + case 63: + gk->bytes_remaining = LLONG_MAX; + break; + case 1 ... 62: + gk->bytes_remaining = 1LL << key->bytelife; + break; + default: + gk->bytes_remaining = key->bytelife; + break; + } + + /* Set the time after which rekeying must occur */ + if (key->lifetime) { + lifetime = min_t(u64, key->lifetime, INT_MAX / HZ); + lifetime *= HZ; + } else { + lifetime = MAX_JIFFY_OFFSET; + } + gk->expiry = jiffies + lifetime; + return gk; + +err_tk: + rxgk_put(gk); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * Use the server secret key to set up the ciphers that will be used to extract + * the token from a response packet. + */ +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_aead, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp) +{ + const struct krb5_enctype *krb5; + struct crypto_aead *aead; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + aead = crypto_krb5_prepare_encryption(krb5, server_key, RXGK_SERVER_ENC_TOKEN, gfp); + if (IS_ERR(aead)) + return PTR_ERR(aead); + + *_krb5 = krb5; + *token_aead = aead; + return 0; +} diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6cb37b0eb77f4..3657c0661cdc7 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -177,8 +177,10 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem if (!txb) return NULL; - txb->offset += shdr; - txb->space = part; + txb->crypto_header = 0; + txb->sec_header = shdr; + txb->offset += shdr; + txb->space = part; return txb; } @@ -683,6 +685,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); + trace_rxrpc_tx_challenge(conn, serial, 0, conn->rxkad.nonce); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, @@ -698,62 +702,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) } /* - * send a Kerberos security response - */ -static int rxkad_send_response(struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr, - struct rxkad_response *resp, - const struct rxkad_key *s2) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[3]; - size_t len; - u32 serial; - int ret; - - _enter(""); - - msg.msg_name = &conn->peer->srx.transport; - msg.msg_namelen = conn->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&whdr, 0, sizeof(whdr)); - whdr.epoch = htonl(hdr->epoch); - whdr.cid = htonl(hdr->cid); - whdr.type = RXRPC_PACKET_TYPE_RESPONSE; - whdr.flags = conn->out_clientflag; - whdr.securityIndex = hdr->securityIndex; - whdr.serviceId = htons(hdr->serviceId); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = resp; - iov[1].iov_len = sizeof(*resp); - iov[2].iov_base = (void *)s2->ticket; - iov[2].iov_len = s2->ticket_len; - - len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - - serial = rxrpc_get_next_serial(conn); - whdr.serial = htonl(serial); - - rxrpc_local_dont_fragment(conn->local, false); - ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); - if (ret < 0) { - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_point_rxkad_response); - return -EAGAIN; - } - - conn->peer->last_tx_at = ktime_get_seconds(); - _leave(" = 0"); - return 0; -} - -/* * calculate the response checksum */ static void 
rxkad_calc_response_checksum(struct rxkad_response *response) @@ -772,12 +720,21 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) * encrypt the response packet */ static int rxkad_encrypt_response(struct rxrpc_connection *conn, - struct rxkad_response *resp, + struct sk_buff *response, const struct rxkad_key *s2) { struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg[1]; + size_t encsize = sizeof(((struct rxkad_response *)0)->encrypted); + int ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, + sizeof(struct rxrpc_wire_header) + + offsetof(struct rxkad_response, encrypted), encsize); + if (ret < 0) + return ret; req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS); if (!req) @@ -786,89 +743,206 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - sg_init_table(sg, 1); - sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_sync_tfm(req, conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_encrypt(req); + skcipher_request_set_crypt(req, sg, sg, encsize, iv.x); + ret = crypto_skcipher_encrypt(req); skcipher_request_free(req); - return 0; + return ret; } /* - * respond to a challenge packet + * Validate a challenge packet. */ -static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool rxkad_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { - const struct rxrpc_key_token *token; struct rxkad_challenge challenge; - struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - u32 version, nonce, min_level; - int ret = -EPROTO; + u32 version, min_level; + int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); - if (!conn->key) - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxkad_abort_chall_no_key); + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxkad_abort_chall_no_key); + return false; + } ret = key_validate(conn->key); - if (ret < 0) - return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, - rxkad_abort_chall_key_expired); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); + return false; + } if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &challenge, sizeof(challenge)) < 0) - return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, - rxkad_abort_chall_short); + &challenge, sizeof(challenge)) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_chall_short); + return false; + } version = ntohl(challenge.version); - nonce = ntohl(challenge.nonce); + sp->chall.rxkad_nonce = ntohl(challenge.nonce); min_level = ntohl(challenge.min_level); - trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, + sp->chall.rxkad_nonce, min_level); + + if (version != RXKAD_VERSION) { + rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_chall_version); + return false; + } - if (version != RXKAD_VERSION) - return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, - rxkad_abort_chall_version); + if (conn->security_level < min_level) { + rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, + rxkad_abort_chall_level); + return false; + } + return true; 
+} + +/* + * Insert the header into the response. + */ +static noinline +int rxkad_insert_response_header(struct rxrpc_connection *conn, + const struct rxrpc_key_token *token, + struct sk_buff *challenge, + struct sk_buff *response, + size_t *offset) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + struct { + struct rxrpc_wire_header whdr; + struct rxkad_response resp; + } h; + int ret; - if (conn->security_level < min_level) - return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, - rxkad_abort_chall_level); + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = 0; + h.whdr.serviceId = htons(conn->service_id); + h.resp.version = htonl(RXKAD_VERSION); + h.resp.__pad = 0; + h.resp.encrypted.epoch = htonl(conn->proto.epoch); + h.resp.encrypted.cid = htonl(conn->proto.cid); + h.resp.encrypted.checksum = 0; + h.resp.encrypted.securityIndex = htonl(conn->security_ix); + h.resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + h.resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + h.resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + h.resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); + h.resp.encrypted.inc_nonce = htonl(csp->chall.rxkad_nonce + 1); + h.resp.encrypted.level = htonl(conn->security_level); + h.resp.kvno = htonl(token->kad->kvno); + h.resp.ticket_len = htonl(token->kad->ticket_len); + + rxkad_calc_response_checksum(&h.resp); + + ret = skb_store_bits(response, *offset, &h, sizeof(h)); + *offset += sizeof(h); + return ret; +} + +/* + * respond to a challenge packet + */ +static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + const struct rxrpc_key_token *token; + struct rxrpc_skb_priv *csp, *rsp; + struct sk_buff *response; + size_t len, offset = 0; + int ret = -EPROTO; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + ret = key_validate(conn->key); + if (ret < 0) + return rxrpc_abort_conn(conn, challenge, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); token = conn->key->payload.data[0]; /* build the response packet */ - resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); - if (!resp) - return -ENOMEM; + len = sizeof(struct rxrpc_wire_header) + + sizeof(struct rxkad_response) + + token->kad->ticket_len; + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxkad); + response->len = len; + response->data_len = len; + + offset = 0; + ret = rxkad_insert_response_header(conn, token, challenge, response, + &offset); + if (ret < 0) + goto error; + + ret = rxkad_encrypt_response(conn, response, token->kad); + if (ret < 0) + goto error; + + ret = skb_store_bits(response, offset, token->kad->ticket, + token->kad->ticket_len); + if (ret < 0) + goto error; - resp->version = htonl(RXKAD_VERSION); - resp->encrypted.epoch = htonl(conn->proto.epoch); - resp->encrypted.cid = htonl(conn->proto.cid); - resp->encrypted.securityIndex = htonl(conn->security_ix); - resp->encrypted.inc_nonce = htonl(nonce + 1); - resp->encrypted.level = htonl(conn->security_level); - resp->kvno = htonl(token->kad->kvno); - resp->ticket_len = htonl(token->kad->ticket_len); - resp->encrypted.call_id[0] = 
htonl(conn->channels[0].call_counter); - resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter); - resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter); - resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter); - - /* calculate the response checksum and then do the encryption */ - rxkad_calc_response_checksum(resp); - ret = rxkad_encrypt_response(conn, resp, token->kad); - if (ret == 0) - ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); - kfree(resp); + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); return ret; } /* + * RxKAD does automatic response only as there's nothing to manage that isn't + * already in the key. + */ +static int rxkad_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + +/** + * rxkad_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * + * Allow a kernel application to respond to a CHALLENGE. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxkad_respond_to_challenge(csp->chall.conn, challenge); +} +EXPORT_SYMBOL(rxkad_kernel_respond_to_challenge); + +/* * decrypt the kerberos IV ticket in the response */ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, @@ -1276,6 +1350,8 @@ const struct rxrpc_security rxkad = { .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, .issue_challenge = rxkad_issue_challenge, + .validate_challenge = rxkad_validate_challenge, + .sendmsg_respond_to_challenge = rxkad_sendmsg_respond_to_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, .clear = rxkad_clear, diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index e848a4777b8c7..0377301156b09 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "rxperf: " fmt #include <linux/module.h> #include <linux/slab.h> +#include <crypto/krb5.h> #include <net/sock.h> #include <net/af_rxrpc.h> #define RXRPC_TRACE_ONLY_DEFINE_ENUMS @@ -136,6 +137,12 @@ static void rxperf_notify_end_reply_tx(struct sock *sock, RXPERF_CALL_SV_AWAIT_ACK); } +static const struct rxrpc_kernel_ops rxperf_rxrpc_callback_ops = { + .notify_new_call = rxperf_rx_new_call, + .discard_new_call = rxperf_rx_discard_new_call, + .user_attach_call = rxperf_rx_attach, +}; + /* * Charge the incoming call preallocation. */ @@ -161,7 +168,6 @@ static void rxperf_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(rxperf_socket, rxperf_notify_rx, - rxperf_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -209,8 +215,7 @@ static int rxperf_open_socket(void) if (ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call, - rxperf_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &rxperf_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -546,9 +551,9 @@ static int rxperf_process_call(struct rxperf_call *call) } /* - * Add a key to the security keyring. + * Add an rxkad key to the security keyring. 
*/ -static int rxperf_add_key(struct key *keyring) +static int rxperf_add_rxkad_key(struct key *keyring) { key_ref_t kref; int ret; @@ -574,6 +579,47 @@ static int rxperf_add_key(struct key *keyring) return ret; } +#ifdef CONFIG_RXGK +/* + * Add a yfs-rxgk key to the security keyring. + */ +static int rxperf_add_yfs_rxgk_key(struct key *keyring, u32 enctype) +{ + const struct krb5_enctype *krb5 = crypto_krb5_find_enctype(enctype); + key_ref_t kref; + char name[64]; + int ret; + u8 key[32]; + + if (!krb5 || krb5->key_len > sizeof(key)) + return 0; + + /* The key is just { 0, 1, 2, 3, 4, ... } */ + for (int i = 0; i < krb5->key_len; i++) + key[i] = i; + + sprintf(name, "%u:6:1:%u", RX_PERF_SERVICE, enctype); + + kref = key_create_or_update(make_key_ref(keyring, true), + "rxrpc_s", name, + key, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + + if (IS_ERR(kref)) { + pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + ret = key_link(keyring, key_ref_to_ptr(kref)); + if (ret < 0) + pr_err("Can't link rxperf server key: %d\n", ret); + key_ref_put(kref); + return ret; +} +#endif + /* * Initialise the rxperf server. */ @@ -603,9 +649,29 @@ static int __init rxperf_init(void) goto error_keyring; } rxperf_sec_keyring = keyring; - ret = rxperf_add_key(keyring); + ret = rxperf_add_rxkad_key(keyring); + if (ret < 0) + goto error_key; +#ifdef CONFIG_RXGK + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA256_128); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA128_CTS_CMAC); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA256_CTS_CMAC); if (ret < 0) goto error_key; +#endif ret = rxperf_open_socket(); if (ret < 0) diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 9784adc8f2759..2bfbf2b2bb37a 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -20,6 +20,9 @@ static const struct rxrpc_security *rxrpc_security_types[] = { #ifdef CONFIG_RXKAD [RXRPC_SECURITY_RXKAD] = &rxkad, #endif +#ifdef CONFIG_RXGK + [RXRPC_SECURITY_YFS_RXGK] = &rxgk_yfs, +#endif }; int __init rxrpc_init_security(void) @@ -137,15 +140,15 @@ const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *rx, sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { - rxrpc_direct_abort(skb, rxrpc_abort_unsupported_security, - RX_INVALID_OPERATION, -EKEYREJECTED); + rxrpc_direct_conn_abort(skb, rxrpc_abort_unsupported_security, + RX_INVALID_OPERATION, -EKEYREJECTED); return NULL; } if (sp->hdr.securityIndex != RXRPC_SECURITY_NONE && !rx->securities) { - rxrpc_direct_abort(skb, rxrpc_abort_no_service_key, - sec->no_key_abort, -EKEYREJECTED); + rxrpc_direct_conn_abort(skb, rxrpc_abort_no_service_key, + sec->no_key_abort, -EKEYREJECTED); return NULL; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 84dc6c94f23b1..ebbb78b842de8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -607,7 +607,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) static struct rxrpc_call * 
rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; @@ -657,7 +657,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) - __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; bool dropped_lock = false; @@ -759,14 +758,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; - } else if (p.command == RXRPC_CMD_SEND_ABORT) { + goto out_put_unlock; + } + + switch (p.command) { + case RXRPC_CMD_SEND_ABORT: rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; - } else if (p.command != RXRPC_CMD_SEND_DATA) { - ret = -EINVAL; - } else { + break; + case RXRPC_CMD_SEND_DATA: ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); + break; + default: + ret = -EINVAL; + break; } out_put_unlock: @@ -794,6 +800,8 @@ error_release_sock: * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, @@ -829,8 +837,9 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @error: Local error value * @why: Indication as to why. * - * Allow a kernel service to abort a call, if it's still in an abortable state - * and return true if the call was aborted, false if it was already complete. + * Allow a kernel service to abort a call if it's still in an abortable state. + * + * Return: %true if the call was aborted, %false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index e51940589ee54..36b05fd842a7b 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -152,6 +152,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { @@ -169,3 +171,43 @@ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); + +/** + * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service + * @sk: The socket to set the keyring on + * @set: True to set, false to clear the flag + * + * Set the flag on an rxrpc socket to say that the caller wants to manage the + * RESPONSE packet and the user-defined data it may contain. Setting this + * means that recvmsg() will return messages with RXRPC_CHALLENGED in the + * control message buffer containing information about the challenge. 
+ * + * The user should respond to the challenge by passing RXRPC_RESPOND or + * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call. + * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be + * included to indicate the parts the user wants to supply. + * + * The server will be passed the response data with a RXRPC_RESPONDED control + * message when it gets the first data from each call. + * + * Note that this is only honoured by security classes that need auxiliary data + * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond + * without consulting userspace. + * + * Return: The previous setting. + */ +int rxrpc_sock_set_manage_response(struct sock *sk, bool set) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + lock_sock(sk); + ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + if (set) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(rxrpc_sock_set_manage_response); diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index c550991d48faa..29767038691ad 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -60,14 +60,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return txb; } -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) -{ - int r; - - __refcount_inc(&txb->ref, &r); - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what); -} - void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { int r = refcount_read(&txb->ref); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a800127effcd7..6ddff028b81a4 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -403,6 +403,30 @@ config NET_SCH_ETS If unsure, say N. +config NET_SCH_BPF + bool "BPF-based Qdisc" + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF + help + This option allows BPF-based queueing disciplines. With BPF struct_ops, + users can implement supported operators in Qdisc_ops using BPF programs. + The queue holding skb can be built with BPF maps or graphs. + + Say Y here if you want to use BPF-based Qdisc. + + If unsure, say N. + +config NET_SCH_DUALPI2 + tristate "Dual Queue PI Square (DUALPI2) scheduler" + help + Say Y here if you want to use the Dual Queue Proportional Integral + Controller Improved with a Square scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc9332 + + To compile this driver as a module, choose M here: the module + will be called sch_dualpi2. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help @@ -784,7 +808,7 @@ config NET_ACT_SKBEDIT config NET_ACT_CSUM tristate "Checksum Updating" depends on NET_CLS_ACT && INET - select CRC32 + select NET_CRC32C help Say Y here to update some common checksum after some direct packet alterations.
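[Editor's illustration, not part of the patch set.] The NET_SCH_BPF entry above is backed by the struct_ops machinery added below in net/sched/bpf_qdisc.c: the operators of a Qdisc_ops instance are supplied by BPF programs, and the registered scheduler is then created by name like any other qdisc. As a rough sketch only, a minimal "drop everything" scheduler might look like the following. The bpf_qdisc_skb_drop() kfunc and the struct bpf_sk_buff_ptr context type are taken from the bpf_qdisc.c hunks later in this section; the libbpf conventions (vmlinux.h, the SEC() names, the .struct_ops_link map section), the blackhole_* identifiers and the set of operators supplied are assumptions made for the example, and NET_XMIT_DROP is re-declared locally because macros are not carried in BTF.

    // SPDX-License-Identifier: GPL-2.0
    /* Illustrative sketch only -- not part of this diff. */
    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    /* Mirror of the value in include/linux/netdevice.h (macros have no BTF). */
    #define NET_XMIT_DROP 0x01

    /* kfunc provided by net/sched/bpf_qdisc.c below */
    void bpf_qdisc_skb_drop(struct sk_buff *skb,
                            struct bpf_sk_buff_ptr *to_free) __ksym;

    char _license[] SEC("license") = "GPL";

    SEC("struct_ops/blackhole_enqueue")
    int BPF_PROG(blackhole_enqueue, struct sk_buff *skb, struct Qdisc *sch,
                 struct bpf_sk_buff_ptr *to_free)
    {
            /* Never queue anything: put the skb on the deferred free list. */
            bpf_qdisc_skb_drop(skb, to_free);
            return NET_XMIT_DROP;
    }

    SEC("struct_ops/blackhole_dequeue")
    struct sk_buff *BPF_PROG(blackhole_dequeue, struct Qdisc *sch)
    {
            return NULL;    /* nothing is ever queued */
    }

    SEC("struct_ops/blackhole_init")
    int BPF_PROG(blackhole_init, struct Qdisc *sch, struct nlattr *opt,
                 struct netlink_ext_ack *extack)
    {
            return 0;
    }

    SEC("struct_ops/blackhole_reset")
    void BPF_PROG(blackhole_reset, struct Qdisc *sch)
    {
    }

    SEC("struct_ops/blackhole_destroy")
    void BPF_PROG(blackhole_destroy, struct Qdisc *sch)
    {
    }

    SEC(".struct_ops_link")
    struct Qdisc_ops blackhole = {
            .enqueue = (void *)blackhole_enqueue,
            .dequeue = (void *)blackhole_dequeue,
            .init    = (void *)blackhole_init,
            .reset   = (void *)blackhole_reset,
            .destroy = (void *)blackhole_destroy,
            .id      = "bpf_blackhole",
    };

Once the struct_ops map has been attached, the scheduler is registered under its .id and can be instantiated through the normal qdisc netlink interface by that name.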
diff --git a/net/sched/Makefile b/net/sched/Makefile index 82c3f78ca486e..5078ea84e6ad7 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -62,6 +62,8 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o +obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o +obj-$(CONFIG_NET_SCH_DUALPI2) += sch_dualpi2.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 839790043256b..9e468e4634671 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -933,18 +933,25 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { struct idr *idr = &idrinfo->action_idr; + bool mutex_taken = false; struct tc_action *p; - int ret; unsigned long id = 1; unsigned long tmp; + int ret; idr_for_each_entry_ul(idr, p, tmp, id) { + if (tc_act_in_hw(p) && !mutex_taken) { + rtnl_lock(); + mutex_taken = true; + } ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return; } + if (mutex_taken) + rtnl_unlock(); idr_destroy(&idrinfo->action_idr); } EXPORT_SYMBOL(tcf_idrinfo_destroy); @@ -1461,17 +1468,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; - struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 2]; struct tc_action *act; size_t sz = 0; int err; int i; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL, extack); if (err < 0) return err; + /* The nested attributes are parsed as types, but they are really an + * array of actions. So we parse one more than we can handle, and return + * an error if the last one is set (as that indicates that the request + * contained more than the maximum number of actions). 
+ */ + if (tb[TCA_ACT_MAX_PRIO + 1]) { + NL_SET_ERR_MSG_FMT(extack, + "Only %d actions supported per filter", + TCA_ACT_MAX_PRIO); + return -EINVAL; + } + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 0fce631e7c911..3e89927d71164 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -88,7 +88,7 @@ count: /* using overlimits stats to count how many packets marked */ tcf_action_inc_overlimit_qstats(&ca->common); out: - return READ_ONCE(ca->tcf_action); + return parms->action; } static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { @@ -167,6 +167,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (err < 0) goto release_idr; + nparms->action = parm->action; + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); @@ -190,20 +192,20 @@ out_free: static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_connmark_info *ci = to_connmark(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_connmark_info *ci = to_connmark(a); + const struct tcf_connmark_parms *parms; struct tc_connmark opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; - struct tcf_connmark_parms *parms; struct tcf_t t; - spin_lock_bh(&ci->tcf_lock); - parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock)); + rcu_read_lock(); + parms = rcu_dereference(ci->parms); - opt.action = ci->tcf_action; + opt.action = parms->action; opt.zone = parms->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -212,12 +214,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5cc8e407e7911..0939e6b2ba4d1 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -99,6 +99,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, goto put_chain; } params_new->update_flags = parm->update_flags; + params_new->action = parm->action; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -580,7 +581,7 @@ TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb, tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); - action = READ_ONCE(p->tcf_action); + action = params->action; if (unlikely(action == TC_ACT_SHOT)) goto drop; @@ -631,9 +632,9 @@ drop: static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_csum *p = to_tcf_csum(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_csum *p = to_tcf_csum(a); - struct tcf_csum_params *params; + const struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, @@ -641,10 +642,9 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, }; struct tcf_t t; - spin_lock_bh(&p->tcf_lock); - params = rcu_dereference_protected(p->params, - lockdep_is_held(&p->tcf_lock)); - 
opt.action = p->tcf_action; + rcu_read_lock(); + params = rcu_dereference(p->params); + opt.action = params->action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -653,12 +653,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index c02f39efc6efe..6749a4a9a9cd0 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -977,7 +977,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, p = rcu_dereference_bh(c->params); - retval = READ_ONCE(c->tcf_action); + retval = p->action; commit = p->ct_action & TCA_CT_ACT_COMMIT; clear = p->ct_action & TCA_CT_ACT_CLEAR; tmpl = p->tmpl; @@ -1409,6 +1409,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (err) goto cleanup; + params->action = parm->action; spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params = rcu_replace_pointer(c->params, params, @@ -1442,8 +1443,8 @@ static void tcf_ct_cleanup(struct tc_action *a) } static int tcf_ct_dump_key_val(struct sk_buff *skb, - void *val, int val_type, - void *mask, int mask_type, + const void *val, int val_type, + const void *mask, int mask_type, int len) { int err; @@ -1464,9 +1465,9 @@ static int tcf_ct_dump_key_val(struct sk_buff *skb, return 0; } -static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) +static int tcf_ct_dump_nat(struct sk_buff *skb, const struct tcf_ct_params *p) { - struct nf_nat_range2 *range = &p->range; + const struct nf_nat_range2 *range = &p->range; if (!(p->ct_action & TCA_CT_ACT_NAT)) return 0; @@ -1504,7 +1505,8 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) return 0; } -static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper) +static int tcf_ct_dump_helper(struct sk_buff *skb, + const struct nf_conntrack_helper *helper) { if (!helper) return 0; @@ -1521,9 +1523,8 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ct *c = to_ct(a); - struct tcf_ct_params *p; - + const struct tcf_ct *c = to_ct(a); + const struct tcf_ct_params *p; struct tc_ct opt = { .index = c->tcf_index, .refcnt = refcount_read(&c->tcf_refcnt) - ref, @@ -1531,10 +1532,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&c->tcf_lock); - p = rcu_dereference_protected(c->params, - lockdep_is_held(&c->tcf_lock)); - opt.action = c->tcf_action; + rcu_read_lock(); + p = rcu_dereference(c->params); + opt.action = p->action; if (tcf_ct_dump_key_val(skb, &p->ct_action, TCA_CT_ACTION, @@ -1579,11 +1579,11 @@ skip_dump: tcf_tm_dump(&t, &c->tcf_tm); if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD)) goto nla_put_failure; - spin_unlock_bh(&c->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&c->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 5b1241ddc7585..71efe04d00b5c 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -44,9 +44,9 @@ static 
void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp); - ca->stats_dscp_set++; + atomic64_inc(&ca->stats_dscp_set); } else { - ca->stats_dscp_error++; + atomic64_inc(&ca->stats_dscp_error); } } break; @@ -57,9 +57,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp); - ca->stats_dscp_set++; + atomic64_inc(&ca->stats_dscp_set); } else { - ca->stats_dscp_error++; + atomic64_inc(&ca->stats_dscp_error); } } break; @@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb) { - ca->stats_cpmark_set++; + atomic64_inc(&ca->stats_cpmark_set); skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } @@ -88,13 +88,11 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, struct tcf_ctinfo_params *cp; struct nf_conn *ct; int proto, wlen; - int action; cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); - action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); switch (skb_protocol(skb, true)) { @@ -141,7 +139,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, if (thash) nf_ct_put(ct); out: - return action; + return cp->action; } static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { @@ -258,6 +256,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, cp_new->mode |= CTINFO_MODE_CPMARK; } + cp_new->action = actparm->action; + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); cp_new = rcu_replace_pointer(ci->params, cp_new, @@ -282,25 +282,24 @@ release_idr: static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - struct tcf_ctinfo *ci = to_ctinfo(a); + const struct tcf_ctinfo *ci = to_ctinfo(a); + unsigned char *b = skb_tail_pointer(skb); + const struct tcf_ctinfo_params *cp; struct tc_ctinfo opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; - unsigned char *b = skb_tail_pointer(skb); - struct tcf_ctinfo_params *cp; struct tcf_t t; - spin_lock_bh(&ci->tcf_lock); - cp = rcu_dereference_protected(ci->params, - lockdep_is_held(&ci->tcf_lock)); + rcu_read_lock(); + cp = rcu_dereference(ci->params); tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) goto nla_put_failure; - opt.action = ci->tcf_action; + opt.action = cp->action; if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) goto nla_put_failure; @@ -323,22 +322,25 @@ static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, } if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, - ci->stats_dscp_set, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_dscp_set), + TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, - ci->stats_dscp_error, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_dscp_error), + TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, - ci->stats_cpmark_set, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_cpmark_set), + TCA_CTINFO_PAD)) goto nla_put_failure; - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 
5b38143659249..5f01f567c934d 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_nest_level); + +#ifndef CONFIG_PREEMPT_RT +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); +} + +static void tcf_mirred_nest_level_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); +} + +#else +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return current->net_xmit.sched_mirred_nest++; +} + +static void tcf_mirred_nest_level_dec(void) +{ + current->net_xmit.sched_mirred_nest--; +} +#endif static bool tcf_mirred_is_act_redirect(int action) { @@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int m_eaction; u32 blockid; - nest_level = __this_cpu_inc_return(mirred_nest_level); + nest_level = tcf_mirred_nest_level_inc_return(); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); @@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, retval); dec_nest_level: - __this_cpu_dec(mirred_nest_level); + tcf_mirred_nest_level_dec(); return retval; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 9f86f4e666d33..6654011dcd2ba 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -57,7 +57,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; - int ret, mac_len; + int mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -72,8 +72,6 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, mac_len = skb_network_offset(skb); } - ret = READ_ONCE(m->tcf_action); - p = rcu_dereference_bh(m->mpls_p); switch (p->tcfm_action) { @@ -122,7 +120,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); - return ret; + return p->action; drop: qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats)); @@ -296,6 +294,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, ACT_MPLS_BOS_NOT_SET); p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO], htons(ETH_P_MPLS_UC)); + p->action = parm->action; spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -330,8 +329,8 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_mpls *m = to_mpls(a); - struct tcf_mpls_params *p; + const struct tcf_mpls *m = to_mpls(a); + const struct tcf_mpls_params *p; struct tc_mpls opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, @@ -339,10 +338,10 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&m->tcf_lock); - opt.action = m->tcf_action; - p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(m->mpls_p); opt.m_action = p->tcfm_action; + opt.action = p->action; if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -370,12 +369,12 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD)) goto nla_put_failure; - spin_unlock_bh(&m->tcf_lock); + 
rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&m->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -EMSGSIZE; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index d541f553805fa..26241d80ebe03 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -91,6 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, nparm->new_addr = parm->new_addr; nparm->mask = parm->mask; nparm->flags = parm->flags; + nparm->action = parm->action; p = to_tcf_nat(*a); @@ -130,17 +131,16 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb, tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); - action = READ_ONCE(p->tcf_action); - parms = rcu_dereference_bh(p->parms); + action = parms->action; + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + old_addr = parms->old_addr; new_addr = parms->new_addr; mask = parms->mask; egress = parms->flags & TCA_NAT_FLAG_EGRESS; - if (unlikely(action == TC_ACT_SHOT)) - goto drop; - noff = skb_network_offset(skb); if (!pskb_may_pull(skb, sizeof(*iph) + noff)) goto drop; @@ -268,21 +268,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_nat *p = to_tcf_nat(a); + const struct tcf_nat *p = to_tcf_nat(a); + const struct tcf_nat_parms *parms; struct tc_nat opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; - struct tcf_nat_parms *parms; struct tcf_t t; - spin_lock_bh(&p->tcf_lock); - - opt.action = p->tcf_action; + rcu_read_lock(); - parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock)); + parms = rcu_dereference(p->parms); + opt.action = parms->action; opt.old_addr = parms->old_addr; opt.new_addr = parms->new_addr; opt.mask = parms->mask; @@ -294,12 +293,12 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index fc0a35a7b62ac..4b65901397a88 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -279,7 +279,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } p = to_pedit(*a); - + nparms->action = parm->action; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); @@ -483,7 +483,7 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, bad: tcf_action_inc_overlimit_qstats(&p->common); done: - return p->tcf_action; + return parms->action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, @@ -500,19 +500,19 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_pedit *p = to_pedit(a); - struct tcf_pedit_parms *parms; + const struct tcf_pedit *p = to_pedit(a); + const struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; - spin_lock_bh(&p->tcf_lock); - parms = rcu_dereference_protected(p->parms, 1); + rcu_read_lock(); + parms = rcu_dereference(p->parms); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { - 
spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return -ENOBUFS; } opt->nkeys = parms->tcfp_nkeys; @@ -521,7 +521,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->flags = parms->tcfp_flags; - opt->action = p->tcf_action; + opt->action = parms->action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; @@ -540,13 +540,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); kfree(opt); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); kfree(opt); return -1; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a214ed6811420..0e1c611833790 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -198,6 +198,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, psched_ppscfg_precompute(&new->ppsrate, pps); } + new->action = parm->action; spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); @@ -254,8 +255,8 @@ TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb, tcf_lastuse_update(&police->tcf_tm); bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); - ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); + ret = p->action; if (p->tcfp_ewma_rate) { struct gnet_stats_rate_est64 sample; @@ -338,9 +339,9 @@ static void tcf_police_stats_update(struct tc_action *a, static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_police *police = to_police(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_police *police = to_police(a); - struct tcf_police_params *p; + const struct tcf_police_params *p; struct tc_police opt = { .index = police->tcf_index, .refcnt = refcount_read(&police->tcf_refcnt) - ref, @@ -348,10 +349,9 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&police->tcf_lock); - opt.action = police->tcf_action; - p = rcu_dereference_protected(police->params, - lockdep_is_held(&police->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(police->params); + opt.action = p->action; opt.mtu = p->tcfp_mtu; opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); if (p->rate_present) { @@ -392,12 +392,12 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; - spin_unlock_bh(&police->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&police->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1f1d9ce3e968a..8c1d1554f6575 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -43,13 +43,11 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; - int action; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); - action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) skb->priority = params->priority; @@ -85,7 +83,7 @@ TC_INDIRECT_SCOPE 
int tcf_skbedit_act(struct sk_buff *skb, } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; - return action; + return params->action; err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); @@ -262,6 +260,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; + params_new->action = parm->action; spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(d->params, params_new, @@ -284,9 +283,9 @@ release_idr: static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_skbedit *d = to_skbedit(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbedit *d = to_skbedit(a); - struct tcf_skbedit_params *params; + const struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, @@ -295,10 +294,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, u64 pure_flags = 0; struct tcf_t t; - spin_lock_bh(&d->tcf_lock); - params = rcu_dereference_protected(d->params, - lockdep_is_held(&d->tcf_lock)); - opt.action = d->tcf_action; + rcu_read_lock(); + params = rcu_dereference(d->params); + opt.action = params->action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -333,12 +331,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c new file mode 100644 index 0000000000000..adcb618a2bfca --- /dev/null +++ b/net/sched/bpf_qdisc.c @@ -0,0 +1,472 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/types.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + +#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void))) +#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void))) + +static struct bpf_struct_ops bpf_Qdisc_ops; + +struct bpf_sched_data { + struct qdisc_watchdog watchdog; +}; + +struct bpf_sk_buff_ptr { + struct sk_buff *skb; +}; + +static int bpf_qdisc_init(struct btf *btf) +{ + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr) + +static bool bpf_qdisc_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + struct btf *btf = prog->aux->attach_btf; + u32 arg; + + arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off); + if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) { + if (arg == 2 && type == BPF_READ) { + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + info->btf = btf; + info->btf_id = bpf_sk_buff_ptr_ids[0]; + return true; + } + } + + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct Qdisc, limit): + *end = offsetofend(struct 
Qdisc, limit); + break; + case offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen): + *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen); + break; + case offsetof(struct Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1: + *end = offsetofend(struct Qdisc, qstats); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct sk_buff, tstamp): + *end = offsetofend(struct sk_buff, tstamp); + break; + case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ... + offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, + data[QDISC_CB_PRIV_LEN - 1]): + *end = offsetof(struct sk_buff, cb) + + offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + const struct btf_type *t, *skbt, *qdisct; + size_t end; + int err; + + skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]); + qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == skbt) { + err = bpf_qdisc_sk_buff_access(log, reg, off, &end); + } else if (t == qdisct) { + err = bpf_qdisc_qdisc_access(log, reg, off, &end); + } else { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + if (err) { + bpf_log(log, "no write support to %s at off %d\n", + btf_name_by_offset(reg->btf, t->name_off), off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of %s ended at %zu\n", + off, size, btf_name_by_offset(reg->btf, t->name_off), end); + return -EACCES; + } + + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_init_prologue_ids, func, bpf_qdisc_init_prologue) + +static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init)) + return 0; + + /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx". + * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_init_prologue(r1, r2); + * if r0 == 0 goto pc+1; + * BPF_EXIT; + * r1 = r6; // r1 will be "u64 *ctx". 
+ */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1); + *insn++ = BPF_EXIT_INSN(); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_reset_destroy_epilogue_ids, func, bpf_qdisc_reset_destroy_epilogue) + +static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) && + prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy)) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_reset_destroy_epilogue(r1); + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +__bpf_kfunc_start_defs(); + +/* bpf_skb_get_hash - Get the flow hash of an skb. + * @skb: The skb to get the flow hash from. + */ +__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb) +{ + return skb_get_hash(skb); +} + +/* bpf_kfree_skb - Release an skb's reference and drop it immediately. + * @skb: The skb whose reference to be released and dropped. + */ +__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list. + * @skb: The skb whose reference to be released and dropped. + * @to_free_list: The list of skbs to be dropped. + */ +__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb, + struct bpf_sk_buff_ptr *to_free_list) +{ + __qdisc_drop(skb, (struct sk_buff **)to_free_list); +} + +/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer. + * @sch: The qdisc to be scheduled. + * @expire: The expiry time of the timer. + * @delta_ns: The slack range of the timer. + */ +__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns); +} + +/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */ +__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch, + struct netlink_ext_ack *extack) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *p; + + qdisc_watchdog_init(&q->watchdog, sch); + + if (sch->parent != TC_H_ROOT) { + /* If qdisc_lookup() returns NULL, it means .init is called by + * qdisc_create_dflt() in mq/mqprio_init and the parent qdisc + * has not been added to qdisc_hash yet. 
+ */ + p = qdisc_lookup(dev, TC_H_MAJ(sch->parent)); + if (p && !(p->flags & TCQ_F_MQROOT)) { + NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq"); + return -EINVAL; + } + } + + return 0; +} + +/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset + * and .destroy + */ +__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); +} + +/* bpf_qdisc_bstats_update - Update Qdisc basic statistics + * @sch: The qdisc from which an skb is dequeued. + * @skb: The skb to be dequeued. + */ +__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) +{ + bstats_update(&sch->bstats, skb); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(qdisc_kfunc_ids) +BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(qdisc_kfunc_ids) + +BTF_SET_START(qdisc_common_kfunc_set) +BTF_ID(func, bpf_skb_get_hash) +BTF_ID(func, bpf_kfree_skb) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_SET_END(qdisc_common_kfunc_set) + +BTF_SET_START(qdisc_enqueue_kfunc_set) +BTF_ID(func, bpf_qdisc_skb_drop) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_SET_END(qdisc_enqueue_kfunc_set) + +BTF_SET_START(qdisc_dequeue_kfunc_set) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_ID(func, bpf_qdisc_bstats_update) +BTF_SET_END(qdisc_dequeue_kfunc_set) + +enum qdisc_ops_kf_flags { + QDISC_OPS_KF_COMMON = 0, + QDISC_OPS_KF_ENQUEUE = 1 << 0, + QDISC_OPS_KF_DEQUEUE = 1 << 1, +}; + +static const u32 qdisc_ops_context_flags[] = { + [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE, + [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE, + [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON, +}; + +static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + u32 moff, flags; + + if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id)) + return 0; + + if (prog->aux->st_ops != &bpf_Qdisc_ops) + return -EACCES; + + moff = prog->aux->attach_st_ops_member_off; + flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)]; + + if ((flags & QDISC_OPS_KF_ENQUEUE) && + btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id)) + return 0; + + if ((flags & QDISC_OPS_KF_DEQUEUE) && + btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id)) + return 0; + + if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id)) + return 0; + + return -EACCES; +} + +static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = { + .owner = THIS_MODULE, + .set = &qdisc_kfunc_ids, + .filter = bpf_qdisc_kfunc_filter, +}; + +static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_qdisc_is_valid_access, + .btf_struct_access = bpf_qdisc_btf_struct_access, + .gen_prologue = bpf_qdisc_gen_prologue, + .gen_epilogue = bpf_qdisc_gen_epilogue, +}; + +static int bpf_qdisc_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct Qdisc_ops *uqdisc_ops; + struct Qdisc_ops 
*qdisc_ops; + u32 moff; + + uqdisc_ops = (const struct Qdisc_ops *)udata; + qdisc_ops = (struct Qdisc_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct Qdisc_ops, priv_size): + if (uqdisc_ops->priv_size) + return -EINVAL; + qdisc_ops->priv_size = sizeof(struct bpf_sched_data); + return 1; + case offsetof(struct Qdisc_ops, peek): + qdisc_ops->peek = qdisc_peek_dequeued; + return 0; + case offsetof(struct Qdisc_ops, id): + if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id, + sizeof(qdisc_ops->id)) <= 0) + return -EINVAL; + return 1; + } + + return 0; +} + +static int bpf_qdisc_reg(void *kdata, struct bpf_link *link) +{ + return register_qdisc(kdata); +} + +static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link) +{ + return unregister_qdisc(kdata); +} + +static int bpf_qdisc_validate(void *kdata) +{ + struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata; + + if (!ops->enqueue || !ops->dequeue || !ops->init || + !ops->reset || !ops->destroy) + return -EINVAL; + + return 0; +} + +static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch, + struct sk_buff **to_free) +{ + return 0; +} + +static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void Qdisc_ops__reset(struct Qdisc *sch) +{ +} + +static void Qdisc_ops__destroy(struct Qdisc *sch) +{ +} + +static struct Qdisc_ops __bpf_ops_qdisc_ops = { + .enqueue = Qdisc_ops__enqueue, + .dequeue = Qdisc_ops__dequeue, + .init = Qdisc_ops__init, + .reset = Qdisc_ops__reset, + .destroy = Qdisc_ops__destroy, +}; + +static struct bpf_struct_ops bpf_Qdisc_ops = { + .verifier_ops = &bpf_qdisc_verifier_ops, + .reg = bpf_qdisc_reg, + .unreg = bpf_qdisc_unreg, + .validate = bpf_qdisc_validate, + .init_member = bpf_qdisc_init_member, + .init = bpf_qdisc_init, + .name = "Qdisc_ops", + .cfi_stubs = &__bpf_ops_qdisc_ops, + .owner = THIS_MODULE, +}; + +BTF_ID_LIST_SINGLE(bpf_sk_buff_dtor_ids, func, bpf_kfree_skb) + +static int __init bpf_qdisc_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = { + { + .btf_id = bpf_sk_buff_ids[0], + .kfunc_btf_id = bpf_sk_buff_dtor_ids[0] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors, + ARRAY_SIZE(skb_kfunc_dtors), + THIS_MODULE); + ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); + + return ret; +} +late_initcall(bpf_qdisc_kfunc_init); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5c2580a07530e..5693b41b093f3 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -345,7 +345,7 @@ TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb, static void flow_perturbation(struct timer_list *t) { - struct flow_filter *f = from_timer(f, t, perturb_timer); + struct flow_filter *f = timer_container_of(f, t, perturb_timer); get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 420c66203b177..6b3d0af72c39c 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -108,7 +108,7 @@ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) struct text_match *tm = EM_TEXT_PRIV(m); struct tcf_em_text conf; - strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1); + strscpy(conf.algo, tm->config->ops->name); conf.from_offset = tm->from_offset; 
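/* A brief aside on the flow_perturbation() hunk above and the many similar
 * from_timer() -> timer_container_of() conversions throughout this patch
 * (garp, mrp, fq_pie, pie, red, dev_watchdog, ...): both helpers recover the
 * structure that embeds a timer_list from the timer pointer handed to the
 * callback; only the name changed. A minimal userspace sketch of that pattern,
 * using hypothetical fake_* types rather than the kernel's:
 */
#include <stddef.h>
#include <stdio.h>

/* Same pointer arithmetic the kernel's container_of() performs. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_timer { int pending; };

/* Outer object embedding the timer, analogous to struct flow_filter. */
struct fake_filter {
	unsigned int hashrnd;
	struct fake_timer perturb_timer;
};

/* Conceptually what timer_container_of(f, t, perturb_timer) expands to. */
static void fake_perturbation(struct fake_timer *t)
{
	struct fake_filter *f = container_of(t, struct fake_filter, perturb_timer);

	printf("recovered filter, hashrnd=%u\n", f->hashrnd);
}

int main(void)
{
	struct fake_filter flt = { .hashrnd = 42 };

	/* Timer callbacks only receive the embedded timer's address. */
	fake_perturbation(&flt.perturb_timer);
	return 0;
}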
conf.to_offset = tm->to_offset; conf.from_layer = tm->from_layer; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f74a097f54ae7..d7c767b861a46 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -25,6 +25,7 @@ #include <linux/hrtimer.h> #include <linux/slab.h> #include <linux/hashtable.h> +#include <linux/bpf.h> #include <net/netdev_lock.h> #include <net/net_namespace.h> @@ -207,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name) for (q = qdisc_base; q; q = q->next) { if (!strcmp(name, q->id)) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -237,7 +238,7 @@ int qdisc_set_default(const char *name) if (ops) { /* Set new default */ - module_put(default_qdisc_ops->owner); + bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner); default_qdisc_ops = ops; } write_unlock(&qdisc_mod_lock); @@ -335,17 +336,22 @@ out: return q; } -static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid, + struct netlink_ext_ack *extack) { unsigned long cl; const struct Qdisc_class_ops *cops = p->ops->cl_ops; - if (cops == NULL) - return NULL; + if (cops == NULL) { + NL_SET_ERR_MSG(extack, "Parent qdisc is not classful"); + return ERR_PTR(-EOPNOTSUPP); + } cl = cops->find(p, classid); - if (cl == 0) - return NULL; + if (cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class not found"); + return ERR_PTR(-ENOENT); + } return cops->leaf(p, cl); } @@ -359,7 +365,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -595,16 +601,6 @@ out: qdisc_skb_cb(skb)->pkt_len = pkt_len; } -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) -{ - if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { - pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", - txt, qdisc->ops->id, qdisc->handle >> 16); - qdisc->flags |= TCQ_F_WARN_NONWC; - } -} -EXPORT_SYMBOL(qdisc_warn_nonwc); - static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, @@ -779,15 +775,12 @@ static u32 qdisc_alloc_handle(struct net_device *dev) void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { - bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; bool notify; int drops; - if (n == 0 && len == 0) - return; drops = max_t(int, n, 0); rcu_read_lock(); while ((parentid = sch->parent)) { @@ -796,17 +789,8 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) if (sch->flags & TCQ_F_NOPARENT) break; - /* Notify parent qdisc only if child qdisc becomes empty. - * - * If child was empty even before update then backlog - * counter is screwed and we skip notification because - * parent class is already passive. - * - * If the original child was offloaded then it is allowed - * to be seem as empty, so the parent is notified anyway. - */ - notify = !sch->q.qlen && !WARN_ON_ONCE(!n && - !qdisc_is_offloaded); + /* Notify parent qdisc only if child qdisc becomes empty. 
*/ + notify = !sch->q.qlen; /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { @@ -815,6 +799,9 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) } cops = sch->ops->cl_ops; if (notify && cops->qlen_notify) { + /* Note that qlen_notify must be idempotent as it may get called + * multiple times. + */ cl = cops->find(sch, parentid); cops->qlen_notify(sch, cl); } @@ -1370,7 +1357,7 @@ err_out3: netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: - module_put(ops->owner); + bpf_module_put(ops, ops->owner); err_out: *errp = err; return NULL; @@ -1498,7 +1485,7 @@ static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); } else if (dev_ingress_queue(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } @@ -1509,6 +1496,8 @@ static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); return -ENOENT; } + if (IS_ERR(q)) + return PTR_ERR(q); if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Invalid handle"); @@ -1610,7 +1599,9 @@ static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); + if (IS_ERR(q)) + return PTR_ERR(q); } else if (dev_ingress_queue_create(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } @@ -1782,7 +1773,7 @@ static void request_qdisc_module(struct nlattr *kind) ops = qdisc_lookup_ops(kind); if (ops) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return; } diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 48dd8c88903fe..dbcfb948c8670 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1407,7 +1407,10 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) return cake_calc_overhead(q, len, off); /* borrowed from qdisc_pkt_len_init() */ - hdr_len = skb_transport_offset(skb); + if (!skb->encapsulation) + hdr_len = skb_transport_offset(skb); + else + hdr_len = skb_inner_transport_offset(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c new file mode 100644 index 0000000000000..845375ebd4eaa --- /dev/null +++ b/net/sched/sch_dualpi2.c @@ -0,0 +1,1175 @@ +// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +/* Copyright (C) 2024 Nokia + * + * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com> + * Author: Olga Albisser <olga@albisser.org> + * Author: Henrik Steen <henrist@henrist.net> + * Author: Olivier Tilmans <olivier.tilmans@nokia.com> + * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> + * + * DualPI Improved with a Square (dualpi2): + * - Supports congestion controls that comply with the Prague requirements + * in RFC9331 (e.g. TCP-Prague) + * - Supports coupled dual-queue with PI2 as defined in RFC9332 + * - Supports ECN L4S-identifier (IP.ECN==0b*1) + * + * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks, + * they do not meet the 'Prague L4S Requirements' listed in RFC 9331 + * Section 4, so they can only be used with DualPI2 in a datacenter + * context. 
+ * + * References: + * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332 + * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and + * scalable TCP." in proc. ACM CoNEXT'16, 2016. + */ + +#include <linux/errno.h> +#include <linux/hrtimer.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/limits.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/types.h> + +#include <net/gso.h> +#include <net/inet_ecn.h> +#include <net/pkt_cls.h> +#include <net/pkt_sched.h> + +/* 32b enable to support flows with windows up to ~8.6 * 1e9 packets + * i.e., twice the maximal snd_cwnd. + * MAX_PROB must be consistent with the RNG in dualpi2_roll(). + */ +#define MAX_PROB U32_MAX + +/* alpha/beta values exchanged over netlink are in units of 256ns */ +#define ALPHA_BETA_SHIFT 8 + +/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later + * computations. Consequently (see and dualpi2_scale_alpha_beta()), their + * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1 + * (~4MHz) as those are given in 1/256th. This enable to tune alpha/beta to + * control flows whose maximal RTTs can be in usec up to few secs. + */ +#define ALPHA_BETA_MAX ((1U << 31) - 1) + +/* Internal alpha/beta are in units of 64ns. + * This enables to use all alpha/beta values in the allowed range without loss + * of precision due to rounding when scaling them internally, e.g., + * scale_alpha_beta(1) will not round down to 0. + */ +#define ALPHA_BETA_GRANULARITY 6 + +#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY) + +/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */ +#define MAX_WC 100 + +struct dualpi2_sched_data { + struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */ + struct Qdisc *sch; /* The Classic queue (C-queue) */ + + /* Registered tc filters */ + struct tcf_proto __rcu *tcf_filters; + struct tcf_block *tcf_block; + + /* PI2 parameters */ + u64 pi2_target; /* Target delay in nanoseconds */ + u32 pi2_tupdate; /* Timer frequency in nanoseconds */ + u32 pi2_prob; /* Base PI probability */ + u32 pi2_alpha; /* Gain factor for the integral rate response */ + u32 pi2_beta; /* Gain factor for the proportional response */ + struct hrtimer pi2_timer; /* prob update timer */ + + /* Step AQM (L-queue only) parameters */ + u32 step_thresh; /* Step threshold */ + bool step_in_packets; /* Step thresh in packets (1) or time (0) */ + + /* C-queue starvation protection */ + s32 c_protection_credit; /* Credit (sign indicates which queue) */ + s32 c_protection_init; /* Reset value of the credit */ + u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */ + u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */ + + /* General dualQ parameters */ + u32 memory_limit; /* Memory limit of both queues */ + u8 coupling_factor;/* Coupling factor (k) between both queues */ + u8 ecn_mask; /* Mask to match packets into L-queue */ + u32 min_qlen_step; /* Minimum queue length to apply step thresh */ + bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */ + bool drop_overload; /* Drop (1) on overload, or overflow (0) */ + bool split_gso; /* Split aggregated skb (1) or leave as is (0) */ + + /* Statistics */ + u64 c_head_ts; /* Enqueue timestamp of the C-queue head */ + u64 l_head_ts; /* Enqueue timestamp of the L-queue head */ + u64 last_qdelay; /* Q delay val at the last probability update */ + u32 packets_in_c; /* Enqueue packet counter of the C-queue */ + u32 packets_in_l; /* Enqueue packet 
counter of the L-queue */ + u32 maxq; /* Maximum queue size of the C-queue */ + u32 ecn_mark; /* ECN mark pkt counter due to PI probability */ + u32 step_marks; /* ECN mark pkt counter due to step AQM */ + u32 memory_used; /* Memory used of both queues */ + u32 max_memory_used;/* Maximum used memory */ + + /* Deferred drop statistics */ + u32 deferred_drops_cnt; /* Packets dropped */ + u32 deferred_drops_len; /* Bytes dropped */ +}; + +struct dualpi2_skb_cb { + u64 ts; /* Timestamp at enqueue */ + u8 apply_step:1, /* Can we apply the step threshold */ + classified:2, /* Packet classification results */ + ect:2; /* Packet ECT codepoint */ +}; + +enum dualpi2_classification_results { + DUALPI2_C_CLASSIC = 0, /* C-queue */ + DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */ + DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */ + __DUALPI2_C_MAX /* Keep last*/ +}; + +static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb)); + return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference) +{ + return reference - dualpi2_skb_cb(skb)->ts; +} + +static u64 head_enqueue_time(struct Qdisc *q) +{ + struct sk_buff *skb = qdisc_peek_head(q); + + return skb ? dualpi2_skb_cb(skb)->ts : 0; +} + +static u32 dualpi2_scale_alpha_beta(u32 param) +{ + u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING); + + do_div(tmp, NSEC_PER_SEC); + return tmp; +} + +static u32 dualpi2_unscale_alpha_beta(u32 param) +{ + u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING); + + do_div(tmp, MAX_PROB); + return tmp; +} + +static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) +{ + return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); +} + +static bool skb_is_l4s(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S; +} + +static bool skb_in_l_queue(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC; +} + +static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q) +{ + return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step; +} + +static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb) +{ + if (INET_ECN_set_ce(skb)) { + q->ecn_mark++; + return true; + } + return false; +} + +static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q) +{ + q->c_protection_credit = q->c_protection_init; +} + +/* This computes the initial credit value and WRR weight for the L queue (wl) + * from the weight of the C queue (wc). + * If wl > wc, the scheduler will start with the L queue when reset. + */ +static void dualpi2_calculate_c_protection(struct Qdisc *sch, + struct dualpi2_sched_data *q, u32 wc) +{ + q->c_protection_wc = wc; + q->c_protection_wl = MAX_WC - wc; + q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) * + ((int)q->c_protection_wc - (int)q->c_protection_wl); + dualpi2_reset_c_protection(q); +} + +static bool dualpi2_roll(u32 prob) +{ + return get_random_u32() <= prob; +} + +/* Packets in the C-queue are subject to a marking probability pC, which is the + * square of the internal PI probability (i.e., have an overall lower mark/drop + * probability). If the qdisc is overloaded, ignore ECT values and only drop. + * + * Note that this marking scheme is also applied to L4S packets during overload. 
+ * Return true if packet dropping is required in C queue + */ +static bool dualpi2_classic_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, u32 prob, + bool overload) +{ + if (dualpi2_roll(prob) && dualpi2_roll(prob)) { + if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; + dualpi2_mark(q, skb); + } + return false; +} + +/* Packets in the L-queue are subject to a marking probability pL given by the + * internal PI probability scaled by the coupling factor. + * + * On overload (i.e., @local_l_prob is >= 100%): + * - if the qdisc is configured to trade losses to preserve latency (i.e., + * @q->drop_overload), apply classic drops first before marking. + * - otherwise, preserve the "no loss" property of ECN at the cost of queueing + * delay, eventually resulting in taildrop behavior once sch->limit is + * reached. + * Return true if packet dropping is required in L queue + */ +static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, + u64 local_l_prob, u32 prob, + bool overload) +{ + if (overload) { + /* Apply classic drop */ + if (!q->drop_overload || + !(dualpi2_roll(prob) && dualpi2_roll(prob))) + goto mark; + return true; + } + + /* We can safely cut the upper 32b as overload==false */ + if (dualpi2_roll(local_l_prob)) { + /* Non-ECT packets could have classified as L4S by filters. */ + if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; +mark: + dualpi2_mark(q, skb); + } + return false; +} + +/* Decide whether a given packet must be dropped (or marked if ECT), according + * to the PI2 probability. + * + * Never mark/drop if we have a standing queue of less than 2 MTUs. + */ +static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + u64 local_l_prob; + bool overload; + u32 prob; + + if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch))) + return false; + + prob = READ_ONCE(q->pi2_prob); + local_l_prob = (u64)prob * q->coupling_factor; + overload = local_l_prob > MAX_PROB; + + switch (dualpi2_skb_cb(skb)->classified) { + case DUALPI2_C_CLASSIC: + return dualpi2_classic_marking(q, skb, prob, overload); + case DUALPI2_C_L4S: + return dualpi2_scalable_marking(q, skb, local_l_prob, prob, + overload); + default: /* DUALPI2_C_LLLL */ + return false; + } +} + +static void dualpi2_read_ect(struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + int wlen = skb_network_offset(skb); + + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK; + break; + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK; + break; + default: + goto not_ecn; + } + return; + +not_ecn: + /* Non pullable/writable packets can only be dropped hence are + * classified as not ECT. 
+ */ + cb->ect = INET_ECN_NOT_ECT; +} + +static int dualpi2_skb_classify(struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + struct tcf_result res; + struct tcf_proto *fl; + int result; + + dualpi2_read_ect(skb); + if (cb->ect & q->ecn_mask) { + cb->classified = DUALPI2_C_L4S; + return NET_XMIT_SUCCESS; + } + + if (TC_H_MAJ(skb->priority) == q->sch->handle && + TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) { + cb->classified = TC_H_MIN(skb->priority); + return NET_XMIT_SUCCESS; + } + + fl = rcu_dereference_bh(q->tcf_filters); + if (!fl) { + cb->classified = DUALPI2_C_CLASSIC; + return NET_XMIT_SUCCESS; + } + + result = tcf_classify(skb, NULL, fl, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } +#endif + cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ? + TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC; + } + return NET_XMIT_SUCCESS; +} + +static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct dualpi2_skb_cb *cb; + + if (unlikely(qdisc_qlen(sch) >= sch->limit) || + unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) { + qdisc_qstats_overlimit(sch); + if (skb_in_l_queue(skb)) + qdisc_qstats_overlimit(q->l_queue); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); + } + + if (q->drop_early && must_drop(sch, q, skb)) { + qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_CONGESTED); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } + + cb = dualpi2_skb_cb(skb); + cb->ts = ktime_get_ns(); + q->memory_used += skb->truesize; + if (q->memory_used > q->max_memory_used) + q->max_memory_used = q->memory_used; + + if (qdisc_qlen(sch) > q->maxq) + q->maxq = qdisc_qlen(sch); + + if (skb_in_l_queue(skb)) { + /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */ + dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q); + + /* Keep the overall qdisc stats consistent */ + ++sch->q.qlen; + qdisc_qstats_backlog_inc(sch, skb); + ++q->packets_in_l; + if (!q->l_head_ts) + q->l_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, q->l_queue); + } + ++q->packets_in_c; + if (!q->c_head_ts) + q->c_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, sch); +} + +/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue + * each of those individually. This yields the following benefits, at the + * expense of CPU usage: + * - Finer-grained AQM actions as the sub-packets of a burst no longer share the + * same fate (e.g., the random mark/drop probability is applied individually) + * - Improved precision of the starvation protection/WRR scheduler at dequeue, + * as the size of the dequeued packets will be smaller. 
+ */ +static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + err = dualpi2_skb_classify(q, skb); + if (err != NET_XMIT_SUCCESS) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + if (q->split_gso && skb_is_gso(skb)) { + netdev_features_t features; + struct sk_buff *nskb, *next; + int cnt, byte_len, orig_len; + int err; + + features = netif_skb_features(skb); + nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(nskb)) + return qdisc_drop(skb, sch, to_free); + + cnt = 1; + byte_len = 0; + orig_len = qdisc_pkt_len(skb); + skb_list_walk_safe(nskb, nskb, next) { + skb_mark_not_on_list(nskb); + + /* Iterate through GSO fragments of an skb: + * (1) Set pkt_len from the single GSO fragments + * (2) Copy classified and ect values of an skb + * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb + */ + qdisc_skb_cb(nskb)->pkt_len = nskb->len; + dualpi2_skb_cb(nskb)->classified = + dualpi2_skb_cb(skb)->classified; + dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; + err = dualpi2_enqueue_skb(nskb, sch, to_free); + + if (err == NET_XMIT_SUCCESS) { + /* Compute the backlog adjustment that needs + * to be propagated in the qdisc tree to reflect + * all new skbs successfully enqueued. + */ + ++cnt; + byte_len += nskb->len; + } + } + if (cnt > 1) { + /* The caller will add the original skb stats to its + * backlog, compensate this if any nskb is enqueued. + */ + --cnt; + byte_len -= orig_len; + } + qdisc_tree_reduce_backlog(sch, -cnt, -byte_len); + consume_skb(skb); + return err; + } + return dualpi2_enqueue_skb(skb, sch, to_free); +} + +/* Select the queue from which the next packet can be dequeued, ensuring that + * neither queue can starve the other with a WRR scheduler. + * + * The sign of the WRR credit determines the next queue, while the size of + * the dequeued packet determines the magnitude of the WRR credit change. If + * either queue is empty, the WRR credit is kept unchanged. + * + * As the dequeued packet can be dropped later, the caller has to perform the + * qdisc_bstats_update() calls. 
+ */ +static struct sk_buff *dequeue_packet(struct Qdisc *sch, + struct dualpi2_sched_data *q, + int *credit_change, + u64 now) +{ + struct sk_buff *skb = NULL; + int c_len; + + *credit_change = 0; + c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue); + if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) { + skb = __qdisc_dequeue_head(&q->l_queue->q); + WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue)); + if (c_len) + *credit_change = q->c_protection_wc; + qdisc_qstats_backlog_dec(q->l_queue, skb); + + /* Keep the global queue size consistent */ + --sch->q.qlen; + q->memory_used -= skb->truesize; + } else if (c_len) { + skb = __qdisc_dequeue_head(&sch->q); + WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch)); + if (qdisc_qlen(q->l_queue)) + *credit_change = ~((s32)q->c_protection_wl) + 1; + q->memory_used -= skb->truesize; + } else { + dualpi2_reset_c_protection(q); + return NULL; + } + *credit_change *= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); + return skb; +} + +static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb, + u64 now) +{ + u64 qdelay = 0; + + if (q->step_in_packets) + qdelay = qdisc_qlen(q->l_queue); + else + qdelay = dualpi2_sojourn_time(skb, now); + + if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) { + if (!dualpi2_skb_cb(skb)->ect) { + /* Drop this non-ECT packet */ + return 1; + } + + if (dualpi2_mark(q, skb)) + ++q->step_marks; + } + qdisc_bstats_update(q->l_queue, skb); + return 0; +} + +static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb, + struct Qdisc *sch, enum skb_drop_reason reason) +{ + ++q->deferred_drops_cnt; + q->deferred_drops_len += qdisc_pkt_len(skb); + kfree_skb_reason(skb, reason); + qdisc_qstats_drop(sch); +} + +static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + int credit_change; + u64 now; + + now = ktime_get_ns(); + + while ((skb = dequeue_packet(sch, q, &credit_change, now))) { + if (!q->drop_early && must_drop(sch, q, skb)) { + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_QDISC_CONGESTED); + continue; + } + + if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) { + qdisc_qstats_drop(q->l_queue); + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_DUALPI2_STEP_DROP); + continue; + } + + q->c_protection_credit += credit_change; + qdisc_bstats_update(sch, skb); + break; + } + + if (q->deferred_drops_cnt) { + qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt, + q->deferred_drops_len); + q->deferred_drops_cnt = 0; + q->deferred_drops_len = 0; + } + return skb; +} + +static s64 __scale_delta(u64 diff) +{ + do_div(diff, 1 << ALPHA_BETA_GRANULARITY); + return diff; +} + +static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c, + u64 *qdelay_l) +{ + u64 now, qc, ql; + + now = ktime_get_ns(); + qc = READ_ONCE(q->c_head_ts); + ql = READ_ONCE(q->l_head_ts); + + *qdelay_c = qc ? now - qc : 0; + *qdelay_l = ql ? now - ql : 0; +} + +static u32 calculate_probability(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + u32 new_prob; + u64 qdelay_c; + u64 qdelay_l; + u64 qdelay; + s64 delta; + + get_queue_delays(q, &qdelay_c, &qdelay_l); + qdelay = max(qdelay_l, qdelay_c); + + /* Alpha and beta take at most 32b, i.e, the delay difference would + * overflow for queuing delay differences > ~4.2sec. 
+ */ + delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha; + delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta; + q->last_qdelay = qdelay; + + /* Bound new_prob between 0 and MAX_PROB */ + if (delta > 0) { + new_prob = __scale_delta(delta) + q->pi2_prob; + if (new_prob < q->pi2_prob) + new_prob = MAX_PROB; + } else { + new_prob = q->pi2_prob - __scale_delta(~delta + 1); + if (new_prob > q->pi2_prob) + new_prob = 0; + } + + /* If we do not drop on overload, ensure we cap the L4S probability to + * 100% to keep window fairness when overflowing. + */ + if (!q->drop_overload) + return min_t(u32, new_prob, MAX_PROB / q->coupling_factor); + return new_prob; +} + +static u32 get_memory_limit(struct Qdisc *sch, u32 limit) +{ + /* Apply rule of thumb, i.e., doubling the packet length, + * to further include per packet overhead in memory_limit. + */ + u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch))); + + if (upper_32_bits(memlim)) + return U32_MAX; + else + return lower_32_bits(memlim); +} + +static u32 convert_us_to_nsec(u32 us) +{ + u64 ns = mul_u32_u32(us, NSEC_PER_USEC); + + if (upper_32_bits(ns)) + return U32_MAX; + + return lower_32_bits(ns); +} + +static u32 convert_ns_to_usec(u64 ns) +{ + do_div(ns, NSEC_PER_USEC); + if (upper_32_bits(ns)) + return U32_MAX; + + return lower_32_bits(ns); +} + +static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) +{ + struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer); + struct Qdisc *sch = q->sch; + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ + + rcu_read_lock(); + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); + spin_lock(root_lock); + + WRITE_ONCE(q->pi2_prob, calculate_probability(sch)); + hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q)); + + spin_unlock(root_lock); + rcu_read_unlock(); + return HRTIMER_RESTART; +} + +static struct netlink_range_validation dualpi2_alpha_beta_range = { + .min = 1, + .max = ALPHA_BETA_MAX, +}; + +static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = { + [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_TARGET] = { .type = NLA_U32 }, + [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1), + [TCA_DUALPI2_ALPHA] = + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), + [TCA_DUALPI2_BETA] = + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), + [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 }, + [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 }, + [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 }, + [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1), + [TCA_DUALPI2_DROP_OVERLOAD] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX), + [TCA_DUALPI2_DROP_EARLY] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX), + [TCA_DUALPI2_C_PROTECTION] = + NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC), + [TCA_DUALPI2_ECN_MASK] = + NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT, + TCA_DUALPI2_ECN_MASK_MAX), + [TCA_DUALPI2_SPLIT_GSO] = + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX), +}; + +static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_DUALPI2_MAX + 1]; + struct dualpi2_sched_data *q; + int old_backlog; + int old_qlen; + int err; + + if (!opt || !nla_len(opt)) { + NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required"); + return -EINVAL; + } + err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy, + extack); + if (err < 0) + return err; + 
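/* A minimal sketch (not part of the patch) of the coupling law used by
 * must_drop() above: the internal PI2 probability p applies squared to the
 * Classic queue (two independent rolls in dualpi2_classic_marking()) and,
 * scaled by the coupling factor k (2 by default) and capped at 100%, to the
 * L4S queue (dualpi2_scalable_marking()), as described in RFC 9332.
 * Plain userspace C with illustrative example values:
 */
#include <stdio.h>

int main(void)
{
	double p = 0.05;	/* example internal PI2 probability */
	unsigned int k = 2;	/* default q->coupling_factor */
	double p_classic, p_l4s;

	/* C-queue: mark/drop only if both rolls succeed, i.e. with p * p. */
	p_classic = p * p;

	/* L-queue: probability scaled by k; saturates at 100% on overload. */
	p_l4s = (p * k > 1.0) ? 1.0 : p * k;

	printf("p=%.3f -> classic %.4f, l4s %.3f\n", p, p_classic, p_l4s);
	return 0;
}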
if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) { + NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes"); + return -EINVAL; + } + + q = qdisc_priv(sch); + sch_tree_lock(sch); + + if (tb[TCA_DUALPI2_LIMIT]) { + u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]); + + WRITE_ONCE(sch->limit, limit); + WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit)); + } + + if (tb[TCA_DUALPI2_MEMORY_LIMIT]) + WRITE_ONCE(q->memory_limit, + nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT])); + + if (tb[TCA_DUALPI2_TARGET]) { + u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]); + + WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC); + } + + if (tb[TCA_DUALPI2_TUPDATE]) { + u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]); + + WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate)); + } + + if (tb[TCA_DUALPI2_ALPHA]) { + u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]); + + WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha)); + } + + if (tb[TCA_DUALPI2_BETA]) { + u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]); + + WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta)); + } + + if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) { + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]); + + WRITE_ONCE(q->step_in_packets, true); + WRITE_ONCE(q->step_thresh, step_th); + } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) { + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]); + + WRITE_ONCE(q->step_in_packets, false); + WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th)); + } + + if (tb[TCA_DUALPI2_MIN_QLEN_STEP]) + WRITE_ONCE(q->min_qlen_step, + nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP])); + + if (tb[TCA_DUALPI2_COUPLING]) { + u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]); + + WRITE_ONCE(q->coupling_factor, coupling); + } + + if (tb[TCA_DUALPI2_DROP_OVERLOAD]) { + u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]); + + WRITE_ONCE(q->drop_overload, (bool)drop_overload); + } + + if (tb[TCA_DUALPI2_DROP_EARLY]) { + u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]); + + WRITE_ONCE(q->drop_early, (bool)drop_early); + } + + if (tb[TCA_DUALPI2_C_PROTECTION]) { + u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]); + + dualpi2_calculate_c_protection(sch, q, wc); + } + + if (tb[TCA_DUALPI2_ECN_MASK]) { + u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]); + + WRITE_ONCE(q->ecn_mask, ecn_mask); + } + + if (tb[TCA_DUALPI2_SPLIT_GSO]) { + u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]); + + WRITE_ONCE(q->split_gso, (bool)split_gso); + } + + old_qlen = qdisc_qlen(sch); + old_backlog = sch->qstats.backlog; + while (qdisc_qlen(sch) > sch->limit || + q->memory_used > q->memory_limit) { + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); + + q->memory_used -= skb->truesize; + qdisc_qstats_backlog_dec(sch, skb); + rtnl_qdisc_drop(skb, sch); + } + qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch), + old_backlog - sch->qstats.backlog); + + sch_tree_unlock(sch); + return 0; +} + +/* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. 
*/ +static void dualpi2_reset_default(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + q->sch->limit = 10000; /* Max 125ms at 1Gbps */ + q->memory_limit = get_memory_limit(sch, q->sch->limit); + + q->pi2_target = 15 * NSEC_PER_MSEC; + q->pi2_tupdate = 16 * NSEC_PER_MSEC; + q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */ + q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */ + + q->step_thresh = 1 * NSEC_PER_MSEC; + q->step_in_packets = false; + + dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */ + + q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */ + q->min_qlen_step = 0; /* Always apply step mark in L-queue */ + q->coupling_factor = 2; /* window fairness for equal RTTs */ + q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */ + q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */ + q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */ +} + +static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, 1), extack); + if (!q->l_queue) + return -ENOMEM; + + err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack); + if (err) + return err; + + q->sch = sch; + dualpi2_reset_default(sch); + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + + if (opt && nla_len(opt)) { + err = dualpi2_change(sch, opt, extack); + + if (err) + return err; + } + + hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), + HRTIMER_MODE_ABS_PINNED); + return 0; +} + +static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + bool step_in_pkts; + u32 step_th; + + step_in_pkts = READ_ONCE(q->step_in_packets); + step_th = READ_ONCE(q->step_thresh); + + opts = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!opts) + goto nla_put_failure; + + if (step_in_pkts && + (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_DUALPI2_TARGET, + convert_ns_to_usec(READ_ONCE(q->pi2_target))) || + nla_put_u32(skb, TCA_DUALPI2_TUPDATE, + convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || + nla_put_u32(skb, TCA_DUALPI2_ALPHA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || + nla_put_u32(skb, TCA_DUALPI2_BETA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || + nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) || + nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, + READ_ONCE(q->min_qlen_step)) || + nla_put_u8(skb, TCA_DUALPI2_COUPLING, + READ_ONCE(q->coupling_factor)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, + READ_ONCE(q->drop_overload)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, + READ_ONCE(q->drop_early)) || + nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, + READ_ONCE(q->c_protection_wc)) || + nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || + nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) + goto nla_put_failure; + + if (!step_in_pkts && + (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) || + nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT, + READ_ONCE(q->memory_limit)) || + nla_put_u32(skb, TCA_DUALPI2_TARGET, + convert_ns_to_usec(READ_ONCE(q->pi2_target))) || + nla_put_u32(skb, TCA_DUALPI2_TUPDATE, + 
convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) || + nla_put_u32(skb, TCA_DUALPI2_ALPHA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) || + nla_put_u32(skb, TCA_DUALPI2_BETA, + dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) || + nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US, + convert_ns_to_usec(step_th)) || + nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP, + READ_ONCE(q->min_qlen_step)) || + nla_put_u8(skb, TCA_DUALPI2_COUPLING, + READ_ONCE(q->coupling_factor)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD, + READ_ONCE(q->drop_overload)) || + nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY, + READ_ONCE(q->drop_early)) || + nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION, + READ_ONCE(q->c_protection_wc)) || + nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) || + nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso)))) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + nla_nest_cancel(skb, opts); + return -1; +} + +static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct tc_dualpi2_xstats st = { + .prob = READ_ONCE(q->pi2_prob), + .packets_in_c = q->packets_in_c, + .packets_in_l = q->packets_in_l, + .maxq = q->maxq, + .ecn_mark = q->ecn_mark, + .credit = q->c_protection_credit, + .step_marks = q->step_marks, + .memory_used = q->memory_used, + .max_memory_used = q->max_memory_used, + .memory_limit = q->memory_limit, + }; + u64 qc, ql; + + get_queue_delays(q, &qc, &ql); + st.delay_l = convert_ns_to_usec(ql); + st.delay_c = convert_ns_to_usec(qc); + return gnet_stats_copy_app(d, &st, sizeof(st)); +} + +/* Reset both L-queue and C-queue, internal packet counters, PI probability, + * C-queue protection credit, and timestamps, while preserving current + * configuration of DUALPI2. 
+ */ +static void dualpi2_reset(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + qdisc_reset_queue(sch); + qdisc_reset_queue(q->l_queue); + q->c_head_ts = 0; + q->l_head_ts = 0; + q->pi2_prob = 0; + q->packets_in_c = 0; + q->packets_in_l = 0; + q->maxq = 0; + q->ecn_mark = 0; + q->step_marks = 0; + q->memory_used = 0; + q->max_memory_used = 0; + dualpi2_reset_c_protection(q); +} + +static void dualpi2_destroy(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + q->pi2_tupdate = 0; + hrtimer_cancel(&q->pi2_timer); + if (q->l_queue) + qdisc_put(q->l_queue); + tcf_block_put(q->tcf_block); +} + +static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static void dualpi2_unbind(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return q->tcf_block; +} + +static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + unsigned int i; + + if (arg->stop) + return; + + /* We statically define only 2 queues */ + for (i = 0; i < 2; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +/* Minimal class support to handle tc filters */ +static const struct Qdisc_class_ops dualpi2_class_ops = { + .leaf = dualpi2_leaf, + .find = dualpi2_find, + .tcf_block = dualpi2_tcf_block, + .bind_tcf = dualpi2_bind, + .unbind_tcf = dualpi2_unbind, + .walk = dualpi2_walk, +}; + +static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { + .id = "dualpi2", + .cl_ops = &dualpi2_class_ops, + .priv_size = sizeof(struct dualpi2_sched_data), + .enqueue = dualpi2_qdisc_enqueue, + .dequeue = dualpi2_qdisc_dequeue, + .peek = qdisc_peek_dequeued, + .init = dualpi2_init, + .destroy = dualpi2_destroy, + .reset = dualpi2_reset, + .change = dualpi2_change, + .dump = dualpi2_dump, + .dump_stats = dualpi2_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init dualpi2_module_init(void) +{ + return register_qdisc(&dualpi2_qdisc_ops); +} + +static void __exit dualpi2_module_exit(void) +{ + unregister_qdisc(&dualpi2_qdisc_ops); +} + +module_init(dualpi2_module_init); +module_exit(dualpi2_module_exit); + +MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler"); +MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>"); +MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>"); +MODULE_AUTHOR("Olga Albisser <olga@albisser.org>"); +MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>"); +MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("1.0"); diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 2c069f0181c62..037f764822b96 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -661,7 +661,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, for (i = q->nbands; i < oldbands; i++) { if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) list_del_init(&q->classes[i].alist); - qdisc_tree_flush_backlog(q->classes[i].qdisc); + qdisc_purge_queue(q->classes[i].qdisc); } 
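/* The ets hunk just above (and the prio/red hunks later in this patch) swaps
 * qdisc_tree_flush_backlog() for qdisc_purge_queue(). Both report the child's
 * qlen/backlog up the tree via qdisc_tree_reduce_backlog(); purging, as the
 * name suggests, additionally empties the child queue so its own counters
 * match what was just subtracted from the parents. A toy model of that
 * difference, using hypothetical toy_* types rather than kernel ones:
 */
#include <stdio.h>

struct toy_child  { int qlen; int backlog; };
struct toy_parent { int qlen; int backlog; };

/* Stand-in for qdisc_tree_reduce_backlog(): subtract from the parent chain. */
static void toy_tree_reduce(struct toy_parent *p, int n, int len)
{
	p->qlen -= n;
	p->backlog -= len;
}

/* flush-style: report the child's load upwards but leave its packets queued */
static void toy_flush_backlog(struct toy_parent *p, struct toy_child *c)
{
	toy_tree_reduce(p, c->qlen, c->backlog);
}

/* purge-style: report upwards and drop the packets, keeping both views consistent */
static void toy_purge_queue(struct toy_parent *p, struct toy_child *c)
{
	toy_tree_reduce(p, c->qlen, c->backlog);
	c->qlen = 0;
	c->backlog = 0;
}

int main(void)
{
	struct toy_parent p1 = { 10, 15000 }, p2 = { 10, 15000 };
	struct toy_child  c1 = { 4, 6000 },   c2 = { 4, 6000 };

	toy_flush_backlog(&p1, &c1);	/* parent 6/9000, child still 4/6000 */
	toy_purge_queue(&p2, &c2);	/* parent 6/9000, child 0/0 */
	printf("flush: parent %d/%d child %d/%d\n", p1.qlen, p1.backlog, c1.qlen, c1.backlog);
	printf("purge: parent %d/%d child %d/%d\n", p2.qlen, p2.backlog, c2.qlen, c2.backlog);
	return 0;
}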
WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index df7fac95ab151..b0e34daf1f751 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -384,7 +384,7 @@ flow_error: static void fq_pie_timer(struct timer_list *t) { - struct fq_pie_sched_data *q = from_timer(q, t, adapt_timer); + struct fq_pie_sched_data *q = timer_container_of(q, t, adapt_timer); unsigned long next, tupdate; struct Qdisc *sch = q->sch; spinlock_t *root_lock; /* to lock qdisc for probability calculations */ diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c index ce63414185fd6..d1d87dce7f3f7 100644 --- a/net/sched/sch_frag.c +++ b/net/sched/sch_frag.c @@ -16,14 +16,18 @@ struct sch_frag_data { unsigned int l2_len; u8 l2_data[VLAN_ETH_HLEN]; int (*xmit)(struct sk_buff *skb); + local_lock_t bh_lock; }; -static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage); +static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage); + lockdep_assert_held(&data->bh_lock); if (skb_cow_head(skb, data->l2_len) < 0) { kfree_skb(skb); return -ENOMEM; @@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, struct rtable sch_frag_rt = { 0 }; unsigned long orig_dst; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); @@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, IPCB(skb)->frag_max_size = mru; ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) { unsigned long orig_dst; struct rt6_info sch_frag_rt; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); memset(&sch_frag_rt, 0, sizeof(sch_frag_rt)); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, @@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else { net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n", diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 514b1b6ac6819..1e008a228ebdf 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -24,6 +24,7 @@ #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> +#include <linux/bpf.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -495,7 +496,7 @@ EXPORT_SYMBOL(netif_tx_unlock); static void dev_watchdog(struct timer_list *t) { - struct net_device *dev = from_timer(dev, t, watchdog_timer); + struct net_device *dev = timer_container_of(dev, t, watchdog_timer); bool release = true; spin_lock(&dev->tx_global_lock); @@ -739,6 +740,8 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, err = skb_array_produce(q, skb); if (unlikely(err)) { + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); + if (qdisc_is_percpu_stats(qdisc)) return qdisc_drop_cpu(skb, qdisc, to_free); else @@ -1001,14 +1004,14 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, { struct 
Qdisc *sch; - if (!try_module_get(ops->owner)) { + if (!bpf_try_module_get(ops, ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return NULL; } sch->parent = parentid; @@ -1078,7 +1081,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); - module_put(ops->owner); + bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 7986145a527cb..d8fd35da32a7c 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -175,6 +175,11 @@ struct hfsc_sched { #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ +static bool cl_in_el_or_vttree(struct hfsc_class *cl) +{ + return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) || + ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node)); +} /* * eligible tree holds backlogged classes being sorted by their eligible times. @@ -830,22 +835,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) } } -static unsigned int -qdisc_peek_len(struct Qdisc *sch) -{ - struct sk_buff *skb; - unsigned int len; - - skb = sch->ops->peek(sch); - if (unlikely(skb == NULL)) { - qdisc_warn_nonwc("qdisc_peek_len", sch); - return 0; - } - len = qdisc_pkt_len(skb); - - return len; -} - static void hfsc_adjust_levels(struct hfsc_class *cl) { @@ -1040,6 +1029,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + RB_CLEAR_NODE(&cl->el_node); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1572,7 +1563,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) sch->qstats.backlog += len; sch->q.qlen++; - if (first && !cl->cl_nactive) { + if (first && !cl_in_el_or_vttree(cl)) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 14bf71f570570..c968ea7637746 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -821,7 +821,9 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) u32 *pid; } stk[TC_HTB_MAXDEPTH], *sp = stk; - BUG_ON(!hprio->row.rb_node); + if (unlikely(!hprio->row.rb_node)) + return NULL; + sp->root = hprio->row.rb_node; sp->pptr = &hprio->ptr; sp->pid = &hprio->last_ptr_id; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 51d4013b61219..f3e5ef9a95925 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -152,7 +152,7 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, static const struct nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = { [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, - TC_QOPT_MAX_QUEUE), + TC_QOPT_MAX_QUEUE - 1), [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index fdd79d3ccd8ce..eafc316ae319e 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -973,6 +973,41 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } +static const struct Qdisc_class_ops netem_class_ops; + +static int check_netem_in_tree(struct Qdisc *sch, bool duplicates, + struct netlink_ext_ack *extack) +{ + struct Qdisc *root, *q; + 
unsigned int i; + + root = qdisc_root_sleeping(sch); + + if (sch != root && root->ops->cl_ops == &netem_class_ops) { + if (duplicates || + ((struct netem_sched_data *)qdisc_priv(root))->duplicate) + goto err; + } + + if (!qdisc_dev(root)) + return 0; + + hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) { + if (sch != q && q->ops->cl_ops == &netem_class_ops) { + if (duplicates || + ((struct netem_sched_data *)qdisc_priv(q))->duplicate) + goto err; + } + } + + return 0; + +err: + NL_SET_ERR_MSG(extack, + "netem: cannot mix duplicating netems with other netems in tree"); + return -EINVAL; +} + /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) @@ -1031,6 +1066,11 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; + + ret = check_netem_in_tree(sch, qopt->duplicate, extack); + if (ret) + goto unlock; + q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index ff49a6c97033a..ad46ee3ed5a96 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -424,7 +424,7 @@ EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { - struct pie_sched_data *q = from_timer(q, t, adapt_timer); + struct pie_sched_data *q = timer_container_of(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index cc30f7a32f1a7..9e2b9a490db23 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -211,7 +211,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) - qdisc_tree_flush_backlog(q->queues[i]); + qdisc_purge_queue(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index bf1282cb22eba..2255355e51d35 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -412,7 +412,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, bool existing = false; struct nlattr *tb[TCA_QFQ_MAX + 1]; struct qfq_aggregate *new_agg = NULL; - u32 weight, lmax, inv_w; + u32 weight, lmax, inv_w, old_weight, old_lmax; int err; int delta_w; @@ -443,12 +443,16 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; - if (cl != NULL && - lmax == cl->agg->lmax && - weight == cl->agg->class_weight) - return 0; /* nothing to change */ + if (cl != NULL) { + sch_tree_lock(sch); + old_weight = cl->agg->class_weight; + old_lmax = cl->agg->lmax; + sch_tree_unlock(sch); + if (lmax == old_lmax && weight == old_weight) + return 0; /* nothing to change */ + } - delta_w = weight - (cl ? cl->agg->class_weight : 0); + delta_w = weight - (cl ? 
old_weight : 0); if (q->wsum + delta_w > QFQ_MAX_WSUM) { NL_SET_ERR_MSG_FMT_MOD(extack, @@ -532,9 +536,6 @@ destroy_class: static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { - struct qfq_sched *q = qdisc_priv(sch); - - qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); @@ -555,6 +556,7 @@ static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); + qfq_rm_from_agg(q, cl); sch_tree_unlock(sch); @@ -625,6 +627,7 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, { struct qfq_class *cl = (struct qfq_class *)arg; struct nlattr *nest; + u32 class_weight, lmax; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = cl->common.classid; @@ -633,8 +636,13 @@ static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || - nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) + + sch_tree_lock(sch); + class_weight = cl->agg->class_weight; + lmax = cl->agg->lmax; + sch_tree_unlock(sch); + if (nla_put_u32(skb, TCA_QFQ_WEIGHT, class_weight) || + nla_put_u32(skb, TCA_QFQ_LMAX, lmax)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -651,8 +659,10 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, memset(&xstats, 0, sizeof(xstats)); + sch_tree_lock(sch); xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; + sch_tree_unlock(sch); if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || @@ -989,7 +999,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ list_del_init(&cl->alist); - else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + else if (cl->deficit < qdisc_peek_len(cl->qdisc)) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); } @@ -1491,6 +1501,7 @@ static void qfq_destroy_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { + qfq_rm_from_agg(q, cl); qfq_destroy_class(sch, cl); } } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 1ba3e0bba54f0..479c42d11083f 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -285,7 +285,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, q->userbits = userbits; q->limit = ctl->limit; if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old_child = q->qdisc; q->qdisc = child; } @@ -321,7 +321,7 @@ unlock_out: static inline void red_adaptative_timer(struct timer_list *t) { - struct red_sched_data *q = from_timer(q, t, adapt_timer); + struct red_sched_data *q = timer_container_of(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b912ad99aa15d..96eb2f122973a 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -310,7 +310,10 @@ drop: /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
*/ x = q->tail->next; slot = &q->slots[x]; - q->tail->next = slot->next; + if (slot->next == x) + q->tail = NULL; /* no more active slots */ + else + q->tail->next = slot->next; q->ht[slot->hash] = SFQ_EMPTY_SLOT; goto drop; } @@ -597,7 +600,7 @@ drop: static void sfq_perturbation(struct timer_list *t) { - struct sfq_sched_data *q = from_timer(q, t, perturb_timer); + struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; siphash_key_t nkey; @@ -653,6 +656,14 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); return -EINVAL; } + + if (ctl->perturb_period < 0 || + ctl->perturb_period > INT_MAX / HZ) { + NL_SET_ERR_MSG_MOD(extack, "invalid perturb period"); + return -EINVAL; + } + perturb_period = ctl->perturb_period * HZ; + if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; @@ -669,14 +680,12 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, headdrop = q->headdrop; maxdepth = q->maxdepth; maxflows = q->maxflows; - perturb_period = q->perturb_period; quantum = q->quantum; flags = q->flags; /* update and validate configuration */ if (ctl->quantum) quantum = ctl->quantum; - perturb_period = ctl->perturb_period * HZ; if (ctl->flows) maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 14021b8123290..39b735386996e 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -43,6 +43,11 @@ static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_SUPPORTED_FLAGS \ (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX +/* Minimum value for picos_per_byte to ensure non-zero duration + * for minimum-sized Ethernet frames (ETH_ZLEN = 60). + * 60 * 17 > PSEC_PER_NSEC (1000) + */ +#define TAPRIO_PICOS_PER_BYTE_MIN 17 struct sched_entry { /* Durations between this GCL entry and the GCL entry where the @@ -998,7 +1003,7 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, - TC_QOPT_MAX_QUEUE), + TC_QOPT_MAX_QUEUE - 1), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, @@ -1284,7 +1289,8 @@ static void taprio_start_sched(struct Qdisc *sch, } static void taprio_set_picos_per_byte(struct net_device *dev, - struct taprio_sched *q) + struct taprio_sched *q, + struct netlink_ext_ack *extack) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; @@ -1300,6 +1306,15 @@ static void taprio_set_picos_per_byte(struct net_device *dev, skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; + if (picos_per_byte < TAPRIO_PICOS_PER_BYTE_MIN) { + if (!extack) + pr_warn("Link speed %d is too high. Schedule may be inaccurate.\n", + speed); + NL_SET_ERR_MSG_FMT_MOD(extack, + "Link speed %d is too high. 
Schedule may be inaccurate.", + speed); + picos_per_byte = TAPRIO_PICOS_PER_BYTE_MIN; + } atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", @@ -1324,17 +1339,19 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, if (dev != qdisc_dev(q->root)) continue; - taprio_set_picos_per_byte(dev, q); + taprio_set_picos_per_byte(dev, q, NULL); stab = rtnl_dereference(q->root->stab); - oper = rtnl_dereference(q->oper_sched); + rcu_read_lock(); + oper = rcu_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); - admin = rtnl_dereference(q->admin_sched); + admin = rcu_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); + rcu_read_unlock(); break; } @@ -1696,19 +1713,15 @@ static int taprio_parse_tc_entry(struct Qdisc *sch, if (err < 0) return err; - if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { + if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_TAPRIO_TC_ENTRY_INDEX)) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); - if (tc >= TC_QOPT_MAX_QUEUE) { - NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); - return -ERANGE; - } - if (*seen_tcs & BIT(tc)) { - NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_TC_ENTRY_INDEX], + "Duplicate tc entry"); return -EINVAL; } @@ -1846,7 +1859,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, q->flags = taprio_flags; /* Needed for length_to_duration() during netlink attribute parsing */ - taprio_set_picos_per_byte(dev, q); + taprio_set_picos_per_byte(dev, q, extack); err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index dc26b22d53c73..4c977f049670a 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -452,7 +452,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { - qdisc_tree_flush_backlog(q->qdisc); + qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; } diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index d18a72df3654e..24d5a35ce894a 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -7,10 +7,10 @@ menuconfig IP_SCTP tristate "The SCTP Protocol" depends on INET depends on IPV6 || IPV6=n - select CRC32 select CRYPTO select CRYPTO_HMAC select CRYPTO_SHA1 + select NET_CRC32C select NET_UDP_TUNNEL help Stream Control Transmission Protocol diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 760152e751c7c..5793d71852b83 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -736,24 +736,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return peer; } -/* Delete a transport address from an association. */ -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr) -{ - struct list_head *pos; - struct list_head *temp; - struct sctp_transport *transport; - - list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { - transport = list_entry(pos, struct sctp_transport, transports); - if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { - /* Do book keeping for removing the peer and free it. */ - sctp_assoc_rm_peer(asoc, transport); - break; - } - } -} - /* Lookup a transport by address. 
*/ struct sctp_transport *sctp_assoc_lookup_paddr( const struct sctp_association *asoc, diff --git a/net/sctp/input.c b/net/sctp/input.c index 0c0d2757f6f8d..2dc2666988fbc 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -756,7 +756,7 @@ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) struct sock *sk2 = ep2->base.sk; if (!net_eq(sock_net(sk2), net) || sk2 == sk || - !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) || + !uid_eq(sk_uid(sk2), sk_uid(sk)) || !sk2->sk_reuseport) continue; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index a9ed2ccab1bdb..3336dcfb45150 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -261,9 +261,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_set_inner_ipproto(skb, IPPROTO_SCTP); label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6); - return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, - &fl6->daddr, tclass, ip6_dst_hoplimit(dst), - label, sctp_sk(sk)->udp_port, t->encap_port, false); + udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, + tclass, ip6_dst_hoplimit(dst), label, + sctp_sk(sk)->udp_port, t->encap_port, false, 0); + return 0; } /* Returns the dst cache entry for the given source and destination ip diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 502095173d885..e6f863c031b4c 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -111,7 +111,6 @@ int __init sctp_offload_init(void) if (ret) goto ipv4; - crc32c_csum_stub = &sctp_csum_ops; return ret; ipv4: diff --git a/net/sctp/proc.c b/net/sctp/proc.c index ec00ee75d59a6..74bff317e205c 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -177,7 +177,7 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk, sctp_sk(sk)->type, sk->sk_state, hash, ep->base.bind_addr.port, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk)); sctp_seq_dump_local_addrs(seq, &ep->base); @@ -267,7 +267,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) assoc->assoc_id, assoc->sndbuf_used, atomic_read(&assoc->rmem_alloc), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), epb->bind_addr.port, assoc->peer.port); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8c3b80c4d40b8..a5ccada55f2b2 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -631,7 +631,7 @@ static void sctp_v4_ecn_capable(struct sock *sk) static void sctp_addr_wq_timeout_handler(struct timer_list *t) { - struct net *net = from_timer(net, t, sctp.addr_wq_timer); + struct net *net = timer_container_of(net, t, sctp.addr_wq_timer); struct sctp_sockaddr_entry *addrw, *temp; struct sctp_sock *sp; @@ -1103,7 +1103,8 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_set_inner_ipproto(skb, IPPROTO_SCTP); udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, - sctp_sk(sk)->udp_port, t->encap_port, false, false); + sctp_sk(sk)->udp_port, t->encap_port, false, false, + 0); return 0; } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f80208edd6a5c..3ead591c72fd3 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -115,14 +115,6 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) skb->destructor = sctp_control_release_owner; } -/* What was the inbound interface for this chunk? 
*/ -int sctp_chunk_iif(const struct sctp_chunk *chunk) -{ - struct sk_buff *skb = chunk->skb; - - return SCTP_INPUT_CB(skb)->af->skb_iif(skb); -} - /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 3aa5da5e3bbd2..424f10a6fdba9 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -231,7 +231,7 @@ nomem: void sctp_generate_t3_rtx_event(struct timer_list *t) { struct sctp_transport *transport = - from_timer(transport, t, T3_rtx_timer); + timer_container_of(transport, t, T3_rtx_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -308,7 +308,8 @@ out_unlock: static void sctp_generate_t1_cookie_event(struct timer_list *t) { struct sctp_association *asoc = - from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_COOKIE]); + timer_container_of(asoc, t, + timers[SCTP_EVENT_TIMEOUT_T1_COOKIE]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE); } @@ -316,7 +317,8 @@ static void sctp_generate_t1_cookie_event(struct timer_list *t) static void sctp_generate_t1_init_event(struct timer_list *t) { struct sctp_association *asoc = - from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_INIT]); + timer_container_of(asoc, t, + timers[SCTP_EVENT_TIMEOUT_T1_INIT]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT); } @@ -324,7 +326,8 @@ static void sctp_generate_t1_init_event(struct timer_list *t) static void sctp_generate_t2_shutdown_event(struct timer_list *t) { struct sctp_association *asoc = - from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN]); + timer_container_of(asoc, t, + timers[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN); } @@ -332,7 +335,7 @@ static void sctp_generate_t2_shutdown_event(struct timer_list *t) static void sctp_generate_t4_rto_event(struct timer_list *t) { struct sctp_association *asoc = - from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T4_RTO]); + timer_container_of(asoc, t, timers[SCTP_EVENT_TIMEOUT_T4_RTO]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO); } @@ -340,8 +343,8 @@ static void sctp_generate_t4_rto_event(struct timer_list *t) static void sctp_generate_t5_shutdown_guard_event(struct timer_list *t) { struct sctp_association *asoc = - from_timer(asoc, t, - timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]); + timer_container_of(asoc, t, + timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD); @@ -351,7 +354,8 @@ static void sctp_generate_t5_shutdown_guard_event(struct timer_list *t) static void sctp_generate_autoclose_event(struct timer_list *t) { struct sctp_association *asoc = - from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]); + timer_container_of(asoc, t, + timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE); } @@ -361,7 +365,8 @@ static void sctp_generate_autoclose_event(struct timer_list *t) */ void sctp_generate_heartbeat_event(struct timer_list *t) { - struct sctp_transport *transport = from_timer(transport, t, hb_timer); + struct sctp_transport *transport = timer_container_of(transport, t, + hb_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -407,7 +412,7 @@ out_unlock: void sctp_generate_proto_unreach_event(struct timer_list *t) { struct sctp_transport *transport = - 
from_timer(transport, t, proto_unreach_timer); + timer_container_of(transport, t, proto_unreach_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -442,7 +447,7 @@ out_unlock: void sctp_generate_reconf_event(struct timer_list *t) { struct sctp_transport *transport = - from_timer(transport, t, reconf_timer); + timer_container_of(transport, t, reconf_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -478,7 +483,8 @@ out_unlock: /* Handle the timeout of the probe timer. */ void sctp_generate_probe_event(struct timer_list *t) { - struct sctp_transport *transport = from_timer(transport, t, probe_timer); + struct sctp_transport *transport = timer_container_of(transport, t, + probe_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); @@ -511,7 +517,7 @@ out_unlock: static void sctp_generate_sack_event(struct timer_list *t) { struct sctp_association *asoc = - from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_SACK]); + timer_container_of(asoc, t, timers[SCTP_EVENT_TIMEOUT_SACK]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 53725ee7ba06d..4921416434f9a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5627,7 +5627,8 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv } /* Helper routine to branch off an association to a new socket. */ -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) +static int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, + struct socket **sockp) { struct sctp_association *asoc = sctp_id2assoc(sk, id); struct sctp_sock *sp = sctp_sk(sk); @@ -5675,7 +5676,6 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) return err; } -EXPORT_SYMBOL(sctp_do_peeloff); static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *peeloff, struct file **newfile, unsigned flags) @@ -8321,7 +8321,7 @@ static int sctp_hash(struct sock *sk) static void sctp_unhash(struct sock *sk) { - /* STUB */ + sock_rps_delete_flow(sk); } /* Check if port is acceptable. Possibly find first available port. @@ -8345,8 +8345,8 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) bool reuse = (sk->sk_reuse || sp->reuse); struct sctp_bind_hashbucket *head; /* hash list */ struct net *net = sock_net(sk); - kuid_t uid = sock_i_uid(sk); struct sctp_bind_bucket *pp; + kuid_t uid = sk_uid(sk); unsigned short snum; int ret; @@ -8444,7 +8444,7 @@ pp_found: (reuse && (sk2->sk_reuse || sp2->reuse) && sk2->sk_state != SCTP_SS_LISTENING) || (sk->sk_reuseport && sk2->sk_reuseport && - uid_eq(uid, sock_i_uid(sk2)))) + uid_eq(uid, sk_uid(sk2)))) continue; if ((!sk->sk_bound_dev_if || !bound_dev_if2 || @@ -9100,7 +9100,8 @@ static void __sctp_write_space(struct sctp_association *asoc) wq = rcu_dereference(sk->sk_wq); if (wq) { if (waitqueue_active(&wq->wait)) - wake_up_interruptible(&wq->wait); + wake_up_interruptible_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. 
@@ -9491,8 +9492,8 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_sndbuf = sk->sk_sndbuf; newsk->sk_rcvbuf = sk->sk_rcvbuf; newsk->sk_lingertime = sk->sk_lingertime; - newsk->sk_rcvtimeo = sk->sk_rcvtimeo; - newsk->sk_sndtimeo = sk->sk_sndtimeo; + newsk->sk_rcvtimeo = READ_ONCE(sk->sk_rcvtimeo); + newsk->sk_sndtimeo = READ_ONCE(sk->sk_sndtimeo); newsk->sk_rxhash = sk->sk_rxhash; newinet = inet_sk(newsk); diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 6946c14627931..4d258a6e8033c 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -240,7 +240,7 @@ void sctp_transport_set_owner(struct sctp_transport *transport, void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ - if (!transport->dst || transport->dst->obsolete) { + if (!transport->dst || READ_ONCE(transport->dst->obsolete)) { sctp_transport_dst_release(transport); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3760131f14845..9311c38f7abe5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -30,6 +30,10 @@ #include <linux/splice.h> #include <net/sock.h> +#include <net/inet_common.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#endif #include <net/tcp.h> #include <net/smc.h> #include <asm/ioctls.h> @@ -360,6 +364,16 @@ static void smc_destruct(struct sock *sk) return; if (!sock_flag(sk, SOCK_DEAD)) return; + switch (sk->sk_family) { + case AF_INET: + inet_sock_destruct(sk); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + inet6_sock_destruct(sk); + break; +#endif + } } static struct lock_class_key smc_key; @@ -486,8 +500,8 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, { /* options we don't get control via setsockopt for */ nsk->sk_type = osk->sk_type; - nsk->sk_sndtimeo = osk->sk_sndtimeo; - nsk->sk_rcvtimeo = osk->sk_rcvtimeo; + nsk->sk_sndtimeo = READ_ONCE(osk->sk_sndtimeo); + nsk->sk_rcvtimeo = READ_ONCE(osk->sk_rcvtimeo); nsk->sk_mark = READ_ONCE(osk->sk_mark); nsk->sk_priority = READ_ONCE(osk->sk_priority); nsk->sk_rcvlowat = osk->sk_rcvlowat; @@ -1585,7 +1599,7 @@ static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); - long timeo = smc->sk.sk_sndtimeo; + long timeo = READ_ONCE(smc->sk.sk_sndtimeo); int rc = 0; if (!timeo) @@ -2735,8 +2749,7 @@ int smc_accept(struct socket *sock, struct socket *new_sock, if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) { /* wait till data arrives on the socket */ - timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * - MSEC_PER_SEC); + timeo = secs_to_jiffies(lsmc->sockopt_defer_accept); if (smc_sk(nsk)->use_fallback) { struct sock *clcsk = smc_sk(nsk)->clcsock->sk; diff --git a/net/smc/smc.h b/net/smc/smc.h index 78ae10d06ed2e..2c90849637398 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -283,10 +283,10 @@ struct smc_connection { }; struct smc_sock { /* smc sock container */ - struct sock sk; -#if IS_ENABLED(CONFIG_IPV6) - struct ipv6_pinfo *pinet6; -#endif + union { + struct sock sk; + struct inet_sock icsk_inet; + }; struct socket *clcsock; /* internal tcp socket */ void (*clcsk_state_change)(struct sock *sk); /* original stat_change fct. 
*/ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 521f5df80e10c..5a4db151fe957 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -688,7 +688,7 @@ out: int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout) { - long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; + long rcvtimeo = READ_ONCE(smc->clcsock->sk->sk_rcvtimeo); struct sock *clc_sk = smc->clcsock->sk; struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; @@ -707,7 +707,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, * sizeof(struct smc_clc_msg_hdr) */ krflags = MSG_PEEK | MSG_WAITALL; - clc_sk->sk_rcvtimeo = timeout; + WRITE_ONCE(clc_sk->sk_rcvtimeo, timeout); iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); @@ -795,7 +795,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } out: - clc_sk->sk_rcvtimeo = rcvtimeo; + WRITE_ONCE(clc_sk->sk_rcvtimeo, rcvtimeo); return reason_code; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ac07b963aedec..262746e304dda 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2100,8 +2100,7 @@ int smc_uncompress_bufsize(u8 compressed) /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ -static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - struct rw_semaphore *lock, +static struct smc_buf_desc *smc_buf_get_slot(struct rw_semaphore *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; @@ -2442,7 +2441,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list); + buf_desc = smc_buf_get_slot(lock, buf_list); if (buf_desc) { buf_desc->is_dma_need_sync = 0; SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 6fdb2d96777ad..8ed2f6689b017 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -64,7 +64,7 @@ static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) return 1; - r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_uid = from_kuid_munged(user_ns, sk_uid(sk)); r->diag_inode = sock_i_ino(sk); return 0; } diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 3c5f64ca41153..0eb00bbefd174 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -251,11 +251,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, return 0; } -static int smc_lo_supports_v2(void) -{ - return SMC_LO_V2_CAPABLE; -} - static void smc_lo_get_local_gid(struct smcd_dev *smcd, struct smcd_gid *smcd_gid) { @@ -288,7 +283,6 @@ static const struct smcd_ops lo_ops = { .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, - .supports_v2 = smc_lo_supports_v2, .get_local_gid = smc_lo_get_local_gid, .get_chid = smc_lo_get_chid, .get_dev = smc_lo_get_dev, diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index b391c2ef463f2..76ad29e31d605 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -370,7 +370,7 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, goto out_put; new_pe->type = SMC_PNET_ETH; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); - strncpy(new_pe->eth_name, eth_name, 
IFNAMSIZ); + strscpy(new_pe->eth_name, eth_name); rc = -EEXIST; new_netdev = true; mutex_lock(&pnettable->lock); diff --git a/net/socket.c b/net/socket.c index 9a0e720f08598..682969deaed35 100644 --- a/net/socket.c +++ b/net/socket.c @@ -592,10 +592,12 @@ static int sockfs_setattr(struct mnt_idmap *idmap, if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); - if (sock->sk) - sock->sk->sk_uid = iattr->ia_uid; - else + if (sock->sk) { + /* Paired with READ_ONCE() in sk_uid() */ + WRITE_ONCE(sock->sk->sk_uid, iattr->ia_uid); + } else { err = -ENOENT; + } } return err; @@ -843,6 +845,52 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, sizeof(ts_pktinfo), &ts_pktinfo); } +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk) +{ + const struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + u32 tsflags = READ_ONCE(sk->sk_tsflags); + + if (serr->ee.ee_errno != ENOMSG || + serr->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING) + return false; + + /* software time stamp available and wanted */ + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && skb->tstamp) + return true; + /* hardware time stamps available and wanted */ + return (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + skb_hwtstamps(skb)->hwtstamp; +} + +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts) +{ + u32 tsflags = READ_ONCE(sk->sk_tsflags); + ktime_t hwtstamp; + int if_index = 0; + + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && + ktime_to_timespec64_cond(skb->tstamp, ts)) + return SOF_TIMESTAMPING_TX_SOFTWARE; + + if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) || + skb_is_swtx_tstamp(skb, false)) + return -ENOENT; + + if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) + hwtstamp = get_timestamp(sk, skb, &if_index); + else + hwtstamp = skb_hwtstamps(skb)->hwtstamp; + + if (tsflags & SOF_TIMESTAMPING_BIND_PHC) + hwtstamp = ptp_convert_timestamp(&hwtstamp, + READ_ONCE(sk->sk_bind_phc)); + if (!ktime_to_timespec64_cond(hwtstamp, ts)) + return -ENOENT; + + return SOF_TIMESTAMPING_TX_HARDWARE; +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 95696f42647ec..43b1f558b33db 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -333,7 +333,7 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, struct strparser *strp = (struct strparser *)desc->arg.data; return __strp_recv(desc, orig_skb, orig_offset, orig_len, - strp->sk->sk_rcvbuf, strp->sk->sk_rcvtimeo); + strp->sk->sk_rcvbuf, READ_ONCE(strp->sk->sk_rcvtimeo)); } static int default_read_sock_done(struct strparser *strp, int err) @@ -485,19 +485,6 @@ int strp_init(struct strparser *strp, struct sock *sk, } EXPORT_SYMBOL_GPL(strp_init); -/* Sock process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp) -{ - strp->paused = 0; - - if (strp->need_bytes) { - if (strp_peek_len(strp) < strp->need_bytes) - return; - } - strp_read_sock(strp); -} -EXPORT_SYMBOL_GPL(__strp_unpause); - void strp_unpause(struct strparser *strp) { strp->paused = 0; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 369310909fc98..5c095cb8cb201 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -887,25 +887,16 @@ static void gss_pipe_dentry_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *gss_pipe = pdo->pdo_data; - struct rpc_pipe *pipe = gss_pipe->pipe; - if 
(pipe->dentry != NULL) { - rpc_unlink(pipe->dentry); - pipe->dentry = NULL; - } + rpc_unlink(gss_pipe->pipe); } static int gss_pipe_dentry_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct gss_pipe *p = pdo->pdo_data; - struct dentry *dentry; - dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - p->pipe->dentry = dentry; - return 0; + return rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe); } static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = { @@ -1545,6 +1536,7 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) struct kvec iov; struct xdr_buf verf_buf; int status; + u32 seqno; /* Credential */ @@ -1556,15 +1548,16 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) cred_len = p++; spin_lock(&ctx->gc_seq_lock); - req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + xprt_rqst_add_seqno(req, seqno); spin_unlock(&ctx->gc_seq_lock); - if (req->rq_seqno == MAXSEQ) + if (*req->rq_seqnos == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); - *p++ = cpu_to_be32(req->rq_seqno); + *p++ = cpu_to_be32(*req->rq_seqnos); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); @@ -1678,17 +1671,31 @@ gss_refresh_null(struct rpc_task *task) return 0; } +static u32 +gss_validate_seqno_mic(struct gss_cl_ctx *ctx, u32 seqno, __be32 *seq, __be32 *p, u32 len) +{ + struct kvec iov; + struct xdr_buf verf_buf; + struct xdr_netobj mic; + + *seq = cpu_to_be32(seqno); + iov.iov_base = seq; + iov.iov_len = 4; + xdr_buf_from_iov(&iov, &verf_buf); + mic.data = (u8 *)p; + mic.len = len; + return gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); +} + static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; - struct kvec iov; - struct xdr_buf verf_buf; - struct xdr_netobj mic; u32 len, maj_stat; int status; + int i = 1; /* don't recheck the first item */ p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) @@ -1705,13 +1712,10 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr) seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; - *seq = cpu_to_be32(task->tk_rqstp->rq_seqno); - iov.iov_base = seq; - iov.iov_len = 4; - xdr_buf_from_iov(&iov, &verf_buf); - mic.data = (u8 *)p; - mic.len = len; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[0], seq, p, len); + /* RFC 2203 5.3.3.1 - compute the checksum of each sequence number in the cache */ + while (unlikely(maj_stat == GSS_S_BAD_SIG && i < task->tk_rqstp->rq_seqno_count)) + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[i++], seq, p, len); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) @@ -1750,7 +1754,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; integ_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -1847,7 +1851,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; opaque_len = p++; - *p = 
cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -2001,7 +2005,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, offset = rcv_buf->len - xdr_stream_remaining(xdr); if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; - if (seqno != rqstp->rq_seqno) + if (seqno != *rqstp->rq_seqnos) goto bad_seqno; if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) goto unwrap_failed; @@ -2045,7 +2049,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); goto out; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, seqno); goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); @@ -2077,7 +2081,7 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; /* gss_unwrap decrypted the sequence number */ - if (be32_to_cpup(p++) != rqstp->rq_seqno) + if (be32_to_cpup(p++) != *rqstp->rq_seqnos) goto bad_seqno; /* gss_unwrap redacts the opaque blob from the head iovec. @@ -2093,7 +2097,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); return -EIO; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p)); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, be32_to_cpup(--p)); return -EIO; bad_unwrap: trace_rpcgss_unwrap(task, maj_stat); @@ -2118,14 +2122,14 @@ gss_xmit_need_reencode(struct rpc_task *task) if (!ctx) goto out; - if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + if (gss_seq_is_newer(*req->rq_seqnos, READ_ONCE(ctx->gc_seq))) goto out_ctx; seq_xmit = READ_ONCE(ctx->gc_seq_xmit); - while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + while (gss_seq_is_newer(*req->rq_seqnos, seq_xmit)) { u32 tmp = seq_xmit; - seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, *req->rq_seqnos); if (seq_xmit == tmp) { ret = false; goto out_ctx; @@ -2134,7 +2138,7 @@ gss_xmit_need_reencode(struct rpc_task *task) win = ctx->gc_win; if (win > 0) - ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); + ret = !gss_seq_is_newer(*req->rq_seqnos, seq_xmit - win); out_ctx: gss_put_ctx(ctx); diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 8f2d65c1e831c..16dcf115de1ed 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -875,8 +875,8 @@ out_err: * krb5_etm_decrypt - Decrypt using the RFC 8009 rules * @kctx: Kerberos context * @offset: starting offset of the ciphertext, in bytes - * @len: - * @buf: + * @len: size of ciphertext to unwrap + * @buf: ciphertext to unwrap * @headskip: OUT: the enctype's confounder length, in octets * @tailskip: OUT: the enctype's HMAC length, in octets * diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 73a90ad873fb9..e82212f6b5620 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1628,7 +1628,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - rqstp->rq_auth_stat = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_failed; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) @@ -1638,6 +1638,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp) svcdata->rsci = NULL; gc = &svcdata->clcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcauth_gss_decode_credbody(&rqstp->rq_arg_stream, gc, &rpcstart)) 
goto auth_err; if (gc->gc_v != RPC_GSS_VERSION) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 7ce5e28a6c031..131090f31e6a8 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -135,6 +135,8 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, hlist_add_head_rcu(&new->cache_list, head); detail->entries++; + if (detail->nextcheck > new->expiry_time) + detail->nextcheck = new->expiry_time + 1; cache_get(new); spin_unlock(&detail->hash_lock); @@ -462,24 +464,21 @@ static int cache_clean(void) } } + spin_lock(&current_detail->hash_lock); + /* find a non-empty bucket in the table */ - while (current_detail && - current_index < current_detail->hash_size && + while (current_index < current_detail->hash_size && hlist_empty(&current_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ - - if (current_detail && current_index < current_detail->hash_size) { + if (current_index < current_detail->hash_size) { struct cache_head *ch = NULL; struct cache_detail *d; struct hlist_head *head; struct hlist_node *tmp; - spin_lock(&current_detail->hash_lock); - /* Ok, now to clean this strand */ - head = &current_detail->hash_table[current_index]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) @@ -500,8 +499,10 @@ static int cache_clean(void) spin_unlock(&cache_list_lock); if (ch) sunrpc_end_cache_remove_entry(ch, d); - } else + } else { + spin_unlock(&current_detail->hash_lock); spin_unlock(&cache_list_lock); + } return rv; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6f75862d97820..8ca354ecfd02a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -112,47 +112,46 @@ static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt) } } -static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb, +static int rpc_setup_pipedir_sb(struct super_block *sb, struct rpc_clnt *clnt) { static uint32_t clntid; const char *dir_name = clnt->cl_program->pipe_dir_name; char name[15]; - struct dentry *dir, *dentry; + struct dentry *dir; + int err; dir = rpc_d_lookup_sb(sb, dir_name); if (dir == NULL) { pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name); - return dir; + return -ENOENT; } for (;;) { snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); name[sizeof(name) - 1] = '\0'; - dentry = rpc_create_client_dir(dir, name, clnt); - if (!IS_ERR(dentry)) + err = rpc_create_client_dir(dir, name, clnt); + if (!err) break; - if (dentry == ERR_PTR(-EEXIST)) + if (err == -EEXIST) continue; printk(KERN_INFO "RPC: Couldn't create pipefs entry" - " %s/%s, error %ld\n", - dir_name, name, PTR_ERR(dentry)); + " %s/%s, error %d\n", + dir_name, name, err); break; } dput(dir); - return dentry; + return err; } static int rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt) { - struct dentry *dentry; - clnt->pipefs_sb = pipefs_sb; if (clnt->cl_program->pipe_dir_name != NULL) { - dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + int err = rpc_setup_pipedir_sb(pipefs_sb, clnt); + if (err && err != -ENOENT) + return err; } return 0; } @@ -180,16 +179,9 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { - struct dentry *dentry; - switch (event) { case RPC_PIPEFS_MOUNT: - dentry = rpc_setup_pipedir_sb(sb, clnt); - if (!dentry) - return -ENOENT; - if 
(IS_ERR(dentry)) - return PTR_ERR(dentry); - break; + return rpc_setup_pipedir_sb(sb, clnt); case RPC_PIPEFS_UMOUNT: __rpc_clnt_remove_pipedir(clnt); break; @@ -2771,8 +2763,13 @@ out_verifier: case -EPROTONOSUPPORT: goto out_err; case -EACCES: - /* Re-encode with a fresh cred */ - fallthrough; + /* possible RPCSEC_GSS out-of-sequence event (RFC2203), + * reset recv state and keep waiting, don't retransmit + */ + task->tk_rqstp->rq_reply_bytes_recvd = 0; + task->tk_status = xprt_request_enqueue_receive(task); + task->tk_action = call_transmit_status; + return -EBADMSG; default: goto out_garbage; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index eadc00410ebc5..0bd1df2ebb479 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -168,8 +168,9 @@ rpc_inode_setowner(struct inode *inode, void *private) } static void -rpc_close_pipes(struct inode *inode) +rpc_close_pipes(struct dentry *dentry) { + struct inode *inode = dentry->d_inode; struct rpc_pipe *pipe = RPC_I(inode)->pipe; int need_release; LIST_HEAD(free_list); @@ -484,60 +485,6 @@ rpc_get_inode(struct super_block *sb, umode_t mode) return inode; } -static int __rpc_create_common(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - struct inode *inode; - - d_drop(dentry); - inode = rpc_get_inode(dir->i_sb, mode); - if (!inode) - goto out_err; - inode->i_ino = iunique(dir->i_sb, 100); - if (i_fop) - inode->i_fop = i_fop; - if (private) - rpc_inode_setowner(inode, private); - d_add(dentry, inode); - return 0; -out_err: - printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %pd\n", - __FILE__, __func__, dentry); - dput(dentry); - return -ENOMEM; -} - -static int __rpc_create(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - int err; - - err = __rpc_create_common(dir, dentry, S_IFREG | mode, i_fop, private); - if (err) - return err; - fsnotify_create(dir, dentry); - return 0; -} - -static int __rpc_mkdir(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private) -{ - int err; - - err = __rpc_create_common(dir, dentry, S_IFDIR | mode, i_fop, private); - if (err) - return err; - inc_nlink(dir); - fsnotify_mkdir(dir, dentry); - return 0; -} - static void init_pipe(struct rpc_pipe *pipe) { @@ -574,119 +521,60 @@ struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags) } EXPORT_SYMBOL_GPL(rpc_mkpipe_data); -static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry, - umode_t mode, - const struct file_operations *i_fop, - void *private, - struct rpc_pipe *pipe) +static int rpc_new_file(struct dentry *parent, + const char *name, + umode_t mode, + const struct file_operations *i_fop, + void *private) { - struct rpc_inode *rpci; - int err; + struct dentry *dentry = simple_start_creating(parent, name); + struct inode *dir = parent->d_inode; + struct inode *inode; - err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private); - if (err) - return err; - rpci = RPC_I(d_inode(dentry)); - rpci->private = private; - rpci->pipe = pipe; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + inode = rpc_get_inode(dir->i_sb, S_IFREG | mode); + if (unlikely(!inode)) { + dput(dentry); + inode_unlock(dir); + return -ENOMEM; + } + inode->i_ino = iunique(dir->i_sb, 100); + if (i_fop) + inode->i_fop = i_fop; + rpc_inode_setowner(inode, private); + d_instantiate(dentry, inode); fsnotify_create(dir, 
dentry); + inode_unlock(dir); return 0; } -static int __rpc_rmdir(struct inode *dir, struct dentry *dentry) -{ - int ret; - - dget(dentry); - ret = simple_rmdir(dir, dentry); - d_drop(dentry); - if (!ret) - fsnotify_rmdir(dir, dentry); - dput(dentry); - return ret; -} - -static int __rpc_unlink(struct inode *dir, struct dentry *dentry) -{ - int ret; - - dget(dentry); - ret = simple_unlink(dir, dentry); - d_drop(dentry); - if (!ret) - fsnotify_unlink(dir, dentry); - dput(dentry); - return ret; -} - -static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) +static struct dentry *rpc_new_dir(struct dentry *parent, + const char *name, + umode_t mode) { - struct inode *inode = d_inode(dentry); - - rpc_close_pipes(inode); - return __rpc_unlink(dir, dentry); -} + struct dentry *dentry = simple_start_creating(parent, name); + struct inode *dir = parent->d_inode; + struct inode *inode; -static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, - const char *name) -{ - struct qstr q = QSTR(name); - struct dentry *dentry = d_hash_and_lookup(parent, &q); - if (!dentry) { - dentry = d_alloc(parent, &q); - if (!dentry) - return ERR_PTR(-ENOMEM); - } - if (d_really_is_negative(dentry)) + if (IS_ERR(dentry)) return dentry; - dput(dentry); - return ERR_PTR(-EEXIST); -} -/* - * FIXME: This probably has races. - */ -static void __rpc_depopulate(struct dentry *parent, - const struct rpc_filelist *files, - int start, int eof) -{ - struct inode *dir = d_inode(parent); - struct dentry *dentry; - struct qstr name; - int i; - - for (i = start; i < eof; i++) { - name.name = files[i].name; - name.len = strlen(files[i].name); - dentry = d_hash_and_lookup(parent, &name); - - if (dentry == NULL) - continue; - if (d_really_is_negative(dentry)) - goto next; - switch (d_inode(dentry)->i_mode & S_IFMT) { - default: - BUG(); - case S_IFREG: - __rpc_unlink(dir, dentry); - break; - case S_IFDIR: - __rpc_rmdir(dir, dentry); - } -next: + inode = rpc_get_inode(dir->i_sb, S_IFDIR | mode); + if (unlikely(!inode)) { dput(dentry); + inode_unlock(dir); + return ERR_PTR(-ENOMEM); } -} -static void rpc_depopulate(struct dentry *parent, - const struct rpc_filelist *files, - int start, int eof) -{ - struct inode *dir = d_inode(parent); - - inode_lock_nested(dir, I_MUTEX_CHILD); - __rpc_depopulate(parent, files, start, eof); + inode->i_ino = iunique(dir->i_sb, 100); + inc_nlink(dir); + d_instantiate(dentry, inode); + fsnotify_mkdir(dir, dentry); inode_unlock(dir); + + return dentry; } static int rpc_populate(struct dentry *parent, @@ -694,92 +582,39 @@ static int rpc_populate(struct dentry *parent, int start, int eof, void *private) { - struct inode *dir = d_inode(parent); struct dentry *dentry; int i, err; - inode_lock(dir); for (i = start; i < eof; i++) { - dentry = __rpc_lookup_create_exclusive(parent, files[i].name); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_bad; switch (files[i].mode & S_IFMT) { default: BUG(); case S_IFREG: - err = __rpc_create(dir, dentry, + err = rpc_new_file(parent, + files[i].name, files[i].mode, files[i].i_fop, private); + if (err) + goto out_bad; break; case S_IFDIR: - err = __rpc_mkdir(dir, dentry, - files[i].mode, - NULL, - private); + dentry = rpc_new_dir(parent, + files[i].name, + files[i].mode); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_bad; + } } - if (err != 0) - goto out_bad; } - inode_unlock(dir); return 0; out_bad: - __rpc_depopulate(parent, files, start, eof); - inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory 
%pd\n", __FILE__, __func__, parent); return err; } -static struct dentry *rpc_mkdir_populate(struct dentry *parent, - const char *name, umode_t mode, void *private, - int (*populate)(struct dentry *, void *), void *args_populate) -{ - struct dentry *dentry; - struct inode *dir = d_inode(parent); - int error; - - inode_lock_nested(dir, I_MUTEX_PARENT); - dentry = __rpc_lookup_create_exclusive(parent, name); - if (IS_ERR(dentry)) - goto out; - error = __rpc_mkdir(dir, dentry, mode, NULL, private); - if (error != 0) - goto out_err; - if (populate != NULL) { - error = populate(dentry, args_populate); - if (error) - goto err_rmdir; - } -out: - inode_unlock(dir); - return dentry; -err_rmdir: - __rpc_rmdir(dir, dentry); -out_err: - dentry = ERR_PTR(error); - goto out; -} - -static int rpc_rmdir_depopulate(struct dentry *dentry, - void (*depopulate)(struct dentry *)) -{ - struct dentry *parent; - struct inode *dir; - int error; - - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); - if (depopulate != NULL) - depopulate(dentry); - error = __rpc_rmdir(dir, dentry); - inode_unlock(dir); - dput(parent); - return error; -} - /** * rpc_mkpipe_dentry - make an rpc_pipefs file for kernel<->userspace * communication @@ -799,11 +634,13 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, * The @private argument passed here will be available to all these methods * from the file pointer, via RPC_I(file_inode(file))->private. */ -struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, +int rpc_mkpipe_dentry(struct dentry *parent, const char *name, void *private, struct rpc_pipe *pipe) { - struct dentry *dentry; struct inode *dir = d_inode(parent); + struct dentry *dentry; + struct inode *inode; + struct rpc_inode *rpci; umode_t umode = S_IFIFO | 0600; int err; @@ -812,48 +649,53 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~0222; - inode_lock_nested(dir, I_MUTEX_PARENT); - dentry = __rpc_lookup_create_exclusive(parent, name); - if (IS_ERR(dentry)) - goto out; - err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops, - private, pipe); - if (err) - goto out_err; -out: + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto failed; + } + + inode = rpc_get_inode(dir->i_sb, umode); + if (unlikely(!inode)) { + dput(dentry); + inode_unlock(dir); + err = -ENOMEM; + goto failed; + } + inode->i_ino = iunique(dir->i_sb, 100); + inode->i_fop = &rpc_pipe_fops; + rpci = RPC_I(inode); + rpci->private = private; + rpci->pipe = pipe; + rpc_inode_setowner(inode, private); + d_instantiate(dentry, inode); + pipe->dentry = dentry; + fsnotify_create(dir, dentry); inode_unlock(dir); - return dentry; -out_err: - dentry = ERR_PTR(err); - printk(KERN_WARNING "%s: %s() failed to create pipe %pd/%s (errno = %d)\n", - __FILE__, __func__, parent, name, - err); - goto out; + return 0; + +failed: + pr_warn("%s() failed to create pipe %pd/%s (errno = %d)\n", + __func__, parent, name, err); + return err; } EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry); /** * rpc_unlink - remove a pipe - * @dentry: dentry for the pipe, as returned from rpc_mkpipe + * @pipe: the pipe to be removed * * After this call, lookups will no longer find the pipe, and any * attempts to read or write using preexisting opens of the pipe will * return -EPIPE. 
*/ -int -rpc_unlink(struct dentry *dentry) +void +rpc_unlink(struct rpc_pipe *pipe) { - struct dentry *parent; - struct inode *dir; - int error = 0; - - parent = dget_parent(dentry); - dir = d_inode(parent); - inode_lock_nested(dir, I_MUTEX_PARENT); - error = __rpc_rmpipe(dir, dentry); - inode_unlock(dir); - dput(parent); - return error; + if (pipe->dentry) { + simple_recursive_removal(pipe->dentry, rpc_close_pipes); + pipe->dentry = NULL; + } } EXPORT_SYMBOL_GPL(rpc_unlink); @@ -1010,31 +852,6 @@ rpc_destroy_pipe_dir_objects(struct rpc_pipe_dir_head *pdh) pdo->pdo_ops->destroy(dir, pdo); } -enum { - RPCAUTH_info, - RPCAUTH_EOF -}; - -static const struct rpc_filelist authfiles[] = { - [RPCAUTH_info] = { - .name = "info", - .i_fop = &rpc_info_operations, - .mode = S_IFREG | 0400, - }, -}; - -static int rpc_clntdir_populate(struct dentry *dentry, void *private) -{ - return rpc_populate(dentry, - authfiles, RPCAUTH_info, RPCAUTH_EOF, - private); -} - -static void rpc_clntdir_depopulate(struct dentry *dentry) -{ - rpc_depopulate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF); -} - /** * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs * @dentry: the parent of new directory @@ -1046,19 +863,27 @@ static void rpc_clntdir_depopulate(struct dentry *dentry) * information about the client, together with any "pipes" that may * later be created using rpc_mkpipe(). */ -struct dentry *rpc_create_client_dir(struct dentry *dentry, - const char *name, - struct rpc_clnt *rpc_client) +int rpc_create_client_dir(struct dentry *dentry, + const char *name, + struct rpc_clnt *rpc_client) { struct dentry *ret; + int err; - ret = rpc_mkdir_populate(dentry, name, 0555, NULL, - rpc_clntdir_populate, rpc_client); - if (!IS_ERR(ret)) { - rpc_client->cl_pipedir_objects.pdh_dentry = ret; - rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); + ret = rpc_new_dir(dentry, name, 0555); + if (IS_ERR(ret)) + return PTR_ERR(ret); + err = rpc_new_file(ret, "info", S_IFREG | 0400, + &rpc_info_operations, rpc_client); + if (err) { + pr_warn("%s failed to populate directory %pd\n", + __func__, ret); + simple_recursive_removal(ret, NULL); + return err; } - return ret; + rpc_client->cl_pipedir_objects.pdh_dentry = ret; + rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); + return 0; } /** @@ -1073,7 +898,8 @@ int rpc_remove_client_dir(struct rpc_clnt *rpc_client) return 0; rpc_destroy_pipe_dir_objects(&rpc_client->cl_pipedir_objects); rpc_client->cl_pipedir_objects.pdh_dentry = NULL; - return rpc_rmdir_depopulate(dentry, rpc_clntdir_depopulate); + simple_recursive_removal(dentry, NULL); + return 0; } static const struct rpc_filelist cache_pipefs_files[3] = { @@ -1094,28 +920,25 @@ static const struct rpc_filelist cache_pipefs_files[3] = { }, }; -static int rpc_cachedir_populate(struct dentry *dentry, void *private) -{ - return rpc_populate(dentry, - cache_pipefs_files, 0, 3, - private); -} - -static void rpc_cachedir_depopulate(struct dentry *dentry) -{ - rpc_depopulate(dentry, cache_pipefs_files, 0, 3); -} - struct dentry *rpc_create_cache_dir(struct dentry *parent, const char *name, umode_t umode, struct cache_detail *cd) { - return rpc_mkdir_populate(parent, name, umode, NULL, - rpc_cachedir_populate, cd); + struct dentry *dentry; + + dentry = rpc_new_dir(parent, name, umode); + if (!IS_ERR(dentry)) { + int error = rpc_populate(dentry, cache_pipefs_files, 0, 3, cd); + if (error) { + simple_recursive_removal(dentry, NULL); + return ERR_PTR(error); + } + } + return dentry; } void 
rpc_remove_cache_dir(struct dentry *dentry) { - rpc_rmdir_depopulate(dentry, rpc_cachedir_depopulate); + simple_recursive_removal(dentry, NULL); } /* @@ -1141,7 +964,6 @@ enum { RPCAUTH_nfsd4_cb, RPCAUTH_cache, RPCAUTH_nfsd, - RPCAUTH_gssd, RPCAUTH_RootEOF }; @@ -1178,10 +1000,6 @@ static const struct rpc_filelist files[] = { .name = "nfsd", .mode = S_IFDIR | 0555, }, - [RPCAUTH_gssd] = { - .name = "gssd", - .mode = S_IFDIR | 0555, - }, }; /* @@ -1190,7 +1008,7 @@ static const struct rpc_filelist files[] = { struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name) { - return d_hash_and_lookup(sb->s_root, &QSTR(dir_name)); + return try_lookup_noperm(&QSTR(dir_name), sb->s_root); } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); @@ -1241,13 +1059,6 @@ void rpc_put_sb_net(const struct net *net) } EXPORT_SYMBOL_GPL(rpc_put_sb_net); -static const struct rpc_filelist gssd_dummy_clnt_dir[] = { - [0] = { - .name = "clntXX", - .mode = S_IFDIR | 0555, - }, -}; - static ssize_t dummy_downcall(struct file *filp, const char __user *src, size_t len) { @@ -1276,14 +1087,6 @@ rpc_dummy_info_show(struct seq_file *m, void *v) } DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info); -static const struct rpc_filelist gssd_dummy_info_file[] = { - [0] = { - .name = "info", - .i_fop = &rpc_dummy_info_fops, - .mode = S_IFREG | 0400, - }, -}; - /** * rpc_gssd_dummy_populate - create a dummy gssd pipe * @root: root of the rpc_pipefs filesystem @@ -1292,69 +1095,32 @@ static const struct rpc_filelist gssd_dummy_info_file[] = { * Create a dummy set of directories and a pipe that gssd can hold open to * indicate that it is up and running. */ -static struct dentry * +static int rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) { - int ret = 0; - struct dentry *gssd_dentry; - struct dentry *clnt_dentry = NULL; - struct dentry *pipe_dentry = NULL; - - /* We should never get this far if "gssd" doesn't exist */ - gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name)); - if (!gssd_dentry) - return ERR_PTR(-ENOENT); - - ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL); - if (ret) { - pipe_dentry = ERR_PTR(ret); - goto out; - } + struct dentry *gssd_dentry, *clnt_dentry; + int err; - clnt_dentry = d_hash_and_lookup(gssd_dentry, - &QSTR(gssd_dummy_clnt_dir[0].name)); - if (!clnt_dentry) { - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - pipe_dentry = ERR_PTR(-ENOENT); - goto out; - } + gssd_dentry = rpc_new_dir(root, "gssd", 0555); + if (IS_ERR(gssd_dentry)) + return -ENOENT; - ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL); - if (ret) { - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - pipe_dentry = ERR_PTR(ret); - goto out; - } + clnt_dentry = rpc_new_dir(gssd_dentry, "clntXX", 0555); + if (IS_ERR(clnt_dentry)) + return -ENOENT; - pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); - if (IS_ERR(pipe_dentry)) { - __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1); - __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); - } -out: - dput(clnt_dentry); - dput(gssd_dentry); - return pipe_dentry; -} - -static void -rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) -{ - struct dentry *clnt_dir = pipe_dentry->d_parent; - struct dentry *gssd_dir = clnt_dir->d_parent; - - dget(pipe_dentry); - __rpc_rmpipe(d_inode(clnt_dir), pipe_dentry); - __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); - __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); - dput(pipe_dentry); + err = 
rpc_new_file(clnt_dentry, "info", 0400, + &rpc_dummy_info_fops, NULL); + if (!err) + err = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); + return err; } static int rpc_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; - struct dentry *root, *gssd_dentry; + struct dentry *root; struct net *net = sb->s_fs_info; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1363,7 +1129,7 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = RPCAUTH_GSSMAGIC; sb->s_op = &s_ops; - sb->s_d_op = &simple_dentry_operations; + sb->s_d_flags = DCACHE_DONTCACHE; sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | 0555); @@ -1373,11 +1139,9 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; - gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy); - if (IS_ERR(gssd_dentry)) { - __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); - return PTR_ERR(gssd_dentry); - } + err = rpc_gssd_dummy_populate(root, sn->gssd_dummy); + if (err) + return err; dprintk("RPC: sending pipefs MOUNT notification for net %x%s\n", net->ns.inum, NET_NAME(net)); @@ -1386,18 +1150,6 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc) err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_MOUNT, sb); - if (err) - goto err_depopulate; - mutex_unlock(&sn->pipefs_sb_lock); - return 0; - -err_depopulate: - rpc_gssd_dummy_depopulate(gssd_dentry); - blocking_notifier_call_chain(&rpc_pipefs_notifier_list, - RPC_PIPEFS_UMOUNT, - sb); - sn->pipefs_sb = NULL; - __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); mutex_unlock(&sn->pipefs_sb_lock); return err; } diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 1b2b84feeec69..4e92e2a501684 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -27,97 +27,60 @@ struct xdr_skb_reader { struct sk_buff *skb; unsigned int offset; + bool need_checksum; size_t count; __wsum csum; }; -typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to, - size_t len); - /** * xdr_skb_read_bits - copy some data bits from skb to internal buffer * @desc: sk_buff copy helper * @to: copy destination * @len: number of bytes to copy * - * Possibly called several times to iterate over an sk_buff and copy - * data out of it. + * Possibly called several times to iterate over an sk_buff and copy data out of + * it. */ static size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) { - if (len > desc->count) - len = desc->count; - if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len))) - return 0; - desc->count -= len; - desc->offset += len; - return len; -} + len = min(len, desc->count); + + if (desc->need_checksum) { + __wsum csum; + + csum = skb_copy_and_csum_bits(desc->skb, desc->offset, to, len); + desc->csum = csum_block_add(desc->csum, csum, desc->offset); + } else { + if (unlikely(skb_copy_bits(desc->skb, desc->offset, to, len))) + return 0; + } -/** - * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer - * @desc: sk_buff copy helper - * @to: copy destination - * @len: number of bytes to copy - * - * Same as skb_read_bits, but calculate a checksum at the same time. 
- */ -static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, size_t len) -{ - unsigned int pos; - __wsum csum2; - - if (len > desc->count) - len = desc->count; - pos = desc->offset; - csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len); - desc->csum = csum_block_add(desc->csum, csum2, pos); desc->count -= len; desc->offset += len; return len; } -/** - * xdr_partial_copy_from_skb - copy data out of an skb - * @xdr: target XDR buffer - * @base: starting offset - * @desc: sk_buff copy helper - * @copy_actor: virtual method for copying data - * - */ static ssize_t -xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) +xdr_partial_copy_from_skb(struct xdr_buf *xdr, struct xdr_skb_reader *desc) { - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - ssize_t copied = 0; - size_t ret; - - len = xdr->head[0].iov_len; - if (base < len) { - len -= base; - ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); - copied += ret; - if (ret != len || !desc->count) - goto out; - base = 0; - } else - base -= len; - - if (unlikely(pglen == 0)) - goto copy_tail; - if (unlikely(base >= pglen)) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_SHIFT; - base &= ~PAGE_MASK; - } - do { + struct page **ppage = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + unsigned int poff = xdr->page_base & ~PAGE_MASK; + unsigned int pglen = xdr->page_len; + ssize_t copied = 0; + size_t ret; + + if (xdr->head[0].iov_len == 0) + return 0; + + ret = xdr_skb_read_bits(desc, xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret != xdr->head[0].iov_len || !desc->count) + return ret; + copied += ret; + + while (pglen) { + unsigned int len = min(PAGE_SIZE - poff, pglen); char *kaddr; /* ACL likes to be lazy in allocating pages - ACLs @@ -126,36 +89,29 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN); if (unlikely(*ppage == NULL)) { if (copied == 0) - copied = -ENOMEM; - goto out; + return -ENOMEM; + return copied; } } - len = PAGE_SIZE; kaddr = kmap_atomic(*ppage); - if (base) { - len -= base; - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr + base, len); - base = 0; - } else { - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr, len); - } + ret = xdr_skb_read_bits(desc, kaddr + poff, len); flush_dcache_page(*ppage); kunmap_atomic(kaddr); + copied += ret; if (ret != len || !desc->count) - goto out; + return copied; ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) - copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); -out: + pglen -= len; + poff = 0; + } + + if (xdr->tail[0].iov_len) { + copied += xdr_skb_read_bits(desc, xdr->tail[0].iov_base, + xdr->tail[0].iov_len); + } + return copied; } @@ -169,17 +125,22 @@ out: */ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) { - struct xdr_skb_reader desc; - - desc.skb = skb; - desc.offset = 0; - desc.count = skb->len - desc.offset; + struct xdr_skb_reader desc = { + .skb = skb, + .count = skb->len - desc.offset, + }; - if (skb_csum_unnecessary(skb)) - goto no_checksum; + if (skb_csum_unnecessary(skb)) { + if (xdr_partial_copy_from_skb(xdr, &desc) < 0) + return -1; + if (desc.count) + return -1; + return 0; + } + desc.need_checksum = true; desc.csum = 
csum_partial(skb->data, desc.offset, skb->csum); - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_and_csum_bits) < 0) + if (xdr_partial_copy_from_skb(xdr, &desc) < 0) return -1; if (desc.offset != skb->len) { __wsum csum2; @@ -194,14 +155,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); return 0; -no_checksum: - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; } -EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr); static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e7f9c295d13c0..b1fab3a695443 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -636,24 +636,18 @@ svc_destroy(struct svc_serv **servp) EXPORT_SYMBOL_GPL(svc_destroy); static bool -svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) +svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { - unsigned long pages, ret; - - /* bc_xprt uses fore channel allocated buffers */ - if (svc_is_backchannel(rqstp)) - return true; - - pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply. - * We assume one is at most one page - */ - WARN_ON_ONCE(pages > RPCSVC_MAXPAGES); - if (pages > RPCSVC_MAXPAGES) - pages = RPCSVC_MAXPAGES; - - ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages, - rqstp->rq_pages); - return ret == pages; + rqstp->rq_maxpages = svc_serv_maxpages(serv); + + /* rq_pages' last entry is NULL for historical reasons. */ + rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, + sizeof(struct page *), + GFP_KERNEL, node); + if (!rqstp->rq_pages) + return false; + + return true; } /* @@ -662,17 +656,19 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) static void svc_release_buffer(struct svc_rqst *rqstp) { - unsigned int i; + unsigned long i; - for (i = 0; i < ARRAY_SIZE(rqstp->rq_pages); i++) + for (i = 0; i < rqstp->rq_maxpages; i++) if (rqstp->rq_pages[i]) put_page(rqstp->rq_pages[i]); + kfree(rqstp->rq_pages); } static void svc_rqst_free(struct svc_rqst *rqstp) { folio_batch_release(&rqstp->rq_fbatch); + kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); if (rqstp->rq_scratch_page) put_page(rqstp->rq_scratch_page); @@ -708,7 +704,13 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp->rq_resp) goto out_enomem; - if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) + if (!svc_init_buffer(rqstp, serv, node)) + goto out_enomem; + + rqstp->rq_bvec = kcalloc_node(rqstp->rq_maxpages, + sizeof(struct bio_vec), + GFP_KERNEL, node); + if (!rqstp->rq_bvec) goto out_enomem; rqstp->rq_err = -EAGAIN; /* No error yet */ @@ -749,14 +751,16 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool) WRITE_ONCE(rqstp->rq_qtime, ktime_get()); if (!task_is_running(rqstp->rq_task)) { wake_up_process(rqstp->rq_task); - trace_svc_wake_up(rqstp->rq_task->pid); + trace_svc_pool_thread_wake(pool, rqstp->rq_task->pid); percpu_counter_inc(&pool->sp_threads_woken); + } else { + trace_svc_pool_thread_running(pool, rqstp->rq_task->pid); } rcu_read_unlock(); return; } rcu_read_unlock(); - + trace_svc_pool_thread_noidle(pool, 0); } EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread); @@ -900,7 +904,7 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { struct page **begin = rqstp->rq_pages; - struct page **end = 
&rqstp->rq_pages[RPCSVC_MAXPAGES]; + struct page **end = &rqstp->rq_pages[rqstp->rq_maxpages]; if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { trace_svc_replace_page_err(rqstp); @@ -1330,6 +1334,9 @@ svc_process_common(struct svc_rqst *rqstp) int pr, rc; __be32 *p; + /* Reset the accept_stat for the RPC */ + rqstp->rq_accept_statp = NULL; + /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); clear_bit(RQ_DROPME, &rqstp->rq_flags); @@ -1369,9 +1376,8 @@ svc_process_common(struct svc_rqst *rqstp) case SVC_OK: break; case SVC_GARBAGE: - goto err_garbage_args; - case SVC_SYSERR: - goto err_system_err; + rqstp->rq_auth_stat = rpc_autherr_badcred; + goto err_bad_auth; case SVC_DENIED: goto err_bad_auth; case SVC_CLOSE: @@ -1382,7 +1388,8 @@ svc_process_common(struct svc_rqst *rqstp) goto sendit; default: pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res); - goto err_system_err; + rqstp->rq_auth_stat = rpc_autherr_failed; + goto err_bad_auth; } if (progp == NULL) @@ -1509,20 +1516,6 @@ err_bad_proc: serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; - -err_garbage_args: - svc_printk(rqstp, "failed to decode RPC header\n"); - - if (serv->sv_stats) - serv->sv_stats->rpcbadfmt++; - *rqstp->rq_accept_statp = rpc_garbage_args; - goto sendit; - -err_system_err: - if (serv->sv_stats) - serv->sv_stats->rpcbadfmt++; - *rqstp->rq_accept_statp = rpc_system_err; - goto sendit; } /* @@ -1714,46 +1707,6 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** - * svc_fill_write_vector - Construct data argument for VFS write call - * @rqstp: svc_rqst to operate on - * @payload: xdr_buf containing only the write data payload - * - * Fills in rqstp::rq_vec, and returns the number of elements. - */ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct xdr_buf *payload) -{ - struct page **pages = payload->pages; - struct kvec *first = payload->head; - struct kvec *vec = rqstp->rq_vec; - size_t total = payload->len; - unsigned int i; - - /* Some types of transport can present the write payload - * entirely in rq_arg.pages. In this case, @first is empty. 
- */ - i = 0; - if (first->iov_len) { - vec[i].iov_base = first->iov_base; - vec[i].iov_len = min_t(size_t, total, first->iov_len); - total -= vec[i].iov_len; - ++i; - } - - while (total) { - vec[i].iov_base = page_address(*pages); - vec[i].iov_len = min_t(size_t, total, PAGE_SIZE); - total -= vec[i].iov_len; - ++i; - ++pages; - } - - WARN_ON_ONCE(i > ARRAY_SIZE(rqstp->rq_vec)); - return i; -} -EXPORT_SYMBOL_GPL(svc_fill_write_vector); - -/** * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call * @rqstp: svc_rqst to operate on * @first: buffer containing first section of pathname diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ae25405d8bd22..8b1837228799c 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -488,6 +488,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) pool = svc_pool_for_cpu(xprt->xpt_server); percpu_counter_inc(&pool->sp_sockets_queued); + xprt->xpt_qtime = ktime_get(); lwq_enqueue(&xprt->xpt_ready, &pool->sp_xprts); svc_pool_wake_idle_thread(pool); @@ -651,18 +652,10 @@ static void svc_check_conn_limits(struct svc_serv *serv) static bool svc_alloc_arg(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; unsigned long pages, filled, ret; - pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; - if (pages > RPCSVC_MAXPAGES) { - pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", - pages, RPCSVC_MAXPAGES); - /* use as many pages as possible */ - pages = RPCSVC_MAXPAGES; - } - + pages = rqstp->rq_maxpages; for (filled = 0; filled < pages; filled = ret) { ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); if (ret > filled) @@ -929,7 +922,7 @@ void svc_send(struct svc_rqst *rqstp) */ static void svc_age_temp_xprts(struct timer_list *t) { - struct svc_serv *serv = from_timer(serv, t, sv_temptimer); + struct svc_serv *serv = timer_container_of(serv, t, sv_temptimer); struct svc_xprt *xprt; struct list_head *le, *next; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 72e5a01df3d35..46c156b121db4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -713,8 +713,7 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - count = xdr_buf_to_bvec(rqstp->rq_bvec, - ARRAY_SIZE(rqstp->rq_bvec), xdr); + count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, count, rqstp->rq_res.len); @@ -1198,7 +1197,7 @@ err_noclose: * that the pages backing @xdr are unchanging. 
*/ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, - rpc_fraghdr marker, unsigned int *sentp) + rpc_fraghdr marker, int *sentp) { struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, @@ -1219,8 +1218,8 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, memcpy(buf, &marker, sizeof(marker)); bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, - ARRAY_SIZE(rqstp->rq_bvec) - 1, &rqstp->rq_res); + count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages, + &rqstp->rq_res); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); @@ -1248,8 +1247,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) struct xdr_buf *xdr = &rqstp->rq_res; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); - unsigned int sent; - int err; + int sent, err; svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); rqstp->rq_xprt_ctxt = NULL; @@ -1340,7 +1338,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svsk->sk_marker = xdr_zero; svsk->sk_tcplen = 0; svsk->sk_datalen = 0; - memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); + memset(&svsk->sk_pages[0], 0, + svsk->sk_maxpages * sizeof(struct page *)); tcp_sock_set_nodelay(sk); @@ -1379,10 +1378,13 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + unsigned long pages; - svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); + pages = svc_serv_maxpages(serv); + svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + svsk->sk_maxpages = pages; inet = sock->sk; @@ -1542,7 +1544,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, if (protocol == IPPROTO_TCP) { sk_net_refcnt_upgrade(sock->sk); - if ((error = kernel_listen(sock, 64)) < 0) + if ((error = kernel_listen(sock, SOMAXCONN)) < 0) goto bummer; } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 4e003cb516fe4..70efc727a9cd8 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -37,19 +37,6 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj) } EXPORT_SYMBOL_GPL(xdr_encode_netobj); -__be32 * -xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj) -{ - unsigned int len; - - if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ) - return NULL; - obj->len = len; - obj->data = (u8 *) p; - return p + XDR_QUADLEN(len); -} -EXPORT_SYMBOL_GPL(xdr_decode_netobj); - /** * xdr_encode_opaque_fixed - Encode fixed length opaque data * @p: pointer to current position in XDR buffer. 
@@ -102,21 +89,6 @@ xdr_encode_string(__be32 *p, const char *string) } EXPORT_SYMBOL_GPL(xdr_encode_string); -__be32 * -xdr_decode_string_inplace(__be32 *p, char **sp, - unsigned int *lenp, unsigned int maxlen) -{ - u32 len; - - len = be32_to_cpu(*p++); - if (len > maxlen) - return NULL; - *lenp = len; - *sp = (char *) p; - return p + XDR_QUADLEN(len); -} -EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); - /** * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf * @buf: XDR buffer where string resides @@ -213,6 +185,7 @@ bvec_overflow: pr_warn_once("%s: bio_vec array overflow\n", __func__); return count - 1; } +EXPORT_SYMBOL_GPL(xdr_buf_to_bvec); /** * xdr_inline_pages - Prepare receive buffer for a large reply @@ -992,21 +965,18 @@ EXPORT_SYMBOL_GPL(xdr_init_encode); * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer into which to encode data - * @pages: list of pages to decode into - * @rqst: pointer to controlling rpc_rqst, for debugging * */ -void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, - struct page **pages, struct rpc_rqst *rqst) +void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf) { xdr_reset_scratch_buffer(xdr); xdr->buf = buf; - xdr->page_ptr = pages; + xdr->page_ptr = buf->pages; xdr->iov = NULL; - xdr->p = page_address(*pages); + xdr->p = page_address(*xdr->page_ptr); xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); - xdr->rqst = rqst; + xdr->rqst = NULL; } EXPORT_SYMBOL_GPL(xdr_init_encode_pages); @@ -2247,88 +2217,6 @@ out: EXPORT_SYMBOL_GPL(xdr_process_buf); /** - * xdr_stream_decode_opaque - Decode variable length opaque - * @xdr: pointer to xdr_stream - * @ptr: location to store opaque data - * @size: size of storage buffer @ptr - * - * Return values: - * On success, returns size of object stored in *@ptr - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE on overflow of storage buffer @ptr - */ -ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, size); - if (ret <= 0) - return ret; - memcpy(ptr, p, ret); - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque); - -/** - * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque - * @xdr: pointer to xdr_stream - * @ptr: location to store pointer to opaque data - * @maxlen: maximum acceptable object size - * @gfp_flags: GFP mask to use - * - * Return values: - * On success, returns size of object stored in *@ptr - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE if the size of the object would exceed @maxlen - * %-ENOMEM on memory allocation failure - */ -ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, - size_t maxlen, gfp_t gfp_flags) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); - if (ret > 0) { - *ptr = kmemdup(p, ret, gfp_flags); - if (*ptr != NULL) - return ret; - ret = -ENOMEM; - } - *ptr = NULL; - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup); - -/** - * xdr_stream_decode_string - Decode variable length string - * @xdr: pointer to xdr_stream - * @str: location to store string - * @size: size of storage buffer @str - * - * Return values: - * On success, returns length of NUL-terminated string stored in *@str - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE on overflow of storage buffer @str - */ -ssize_t 
xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, size); - if (ret > 0) { - memcpy(str, p, ret); - str[ret] = '\0'; - return strlen(str); - } - *str = '\0'; - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_string); - -/** * xdr_stream_decode_string_dup - Decode and duplicate variable length string * @xdr: pointer to xdr_stream * @str: location to store pointer to string diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 0eab154655116..1023361845f9f 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -854,7 +854,7 @@ xprt_schedule_autodisconnect(struct rpc_xprt *xprt) static void xprt_init_autodisconnect(struct timer_list *t) { - struct rpc_xprt *xprt = from_timer(xprt, t, timer); + struct rpc_xprt *xprt = timer_container_of(xprt, t, timer); if (!RB_EMPTY_ROOT(&xprt->recv_queue)) return; @@ -1365,7 +1365,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task) INIT_LIST_HEAD(&req->rq_xmit2); goto out; } - } else if (!req->rq_seqno) { + } else if (req->rq_seqno_count == 0) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) continue; @@ -1898,6 +1898,7 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; + req->rq_seqno_count = 0; xprt_init_majortimeo(task, req, task->tk_client->cl_timeout); trace_xprt_reserve(req); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 292022f0976e1..e7e4a39ca6c65 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -120,12 +120,16 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_recv_ctxt *ctxt; + unsigned long pages; dma_addr_t addr; void *buffer; - ctxt = kzalloc_node(sizeof(*ctxt), GFP_KERNEL, node); + pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server); + ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages), + GFP_KERNEL, node); if (!ctxt) goto fail0; + ctxt->rc_maxpages = pages; buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; @@ -497,7 +501,7 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) * a computation, perform a simple range check. This is an * arbitrary but sensible limit (ie, not architectural). 
*/ - if (unlikely(segcount > RPCSVC_MAXPAGES)) + if (unlikely(segcount > rctxt->rc_maxpages)) return false; p = xdr_inline_decode(&rctxt->rc_stream, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 40797114d50a4..661b3fe2779f0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -765,7 +765,7 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, } len -= seg_len; - if (len && ((head->rc_curpage + 1) > ARRAY_SIZE(rqstp->rq_pages))) + if (len && ((head->rc_curpage + 1) > rqstp->rq_maxpages)) goto out_overrun; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 96154a2367a11..914cd263c2f17 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -118,6 +118,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_send_ctxt *ctxt; + unsigned long pages; dma_addr_t addr; void *buffer; int i; @@ -126,13 +127,19 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) GFP_KERNEL, node); if (!ctxt) goto fail0; + pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server); + ctxt->sc_pages = kcalloc_node(pages, sizeof(struct page *), + GFP_KERNEL, node); + if (!ctxt->sc_pages) + goto fail1; + ctxt->sc_maxpages = pages; buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) - goto fail1; + goto fail2; addr = ib_dma_map_single(rdma->sc_pd->device, buffer, rdma->sc_max_req_size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) - goto fail2; + goto fail3; svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); @@ -151,8 +158,10 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; return ctxt; -fail2: +fail3: kfree(buffer); +fail2: + kfree(ctxt->sc_pages); fail1: kfree(ctxt); fail0: @@ -176,6 +185,7 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) rdma->sc_max_req_size, DMA_TO_DEVICE); kfree(ctxt->sc_xprt_buf); + kfree(ctxt->sc_pages); kfree(ctxt); } } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index aca8bdf65d729..3d7f1413df023 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -406,12 +406,12 @@ static void svc_rdma_xprt_done(struct rpcrdma_notification *rn) */ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) { + unsigned int ctxts, rq_depth, maxpayload; struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; - unsigned int ctxts, rq_depth; struct ib_device *dev; int ret = 0; RPC_IFDEBUG(struct sockaddr *sap); @@ -462,12 +462,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_bc_requests = 2; } - /* Arbitrarily estimate the number of rw_ctxs needed for - * this transport. This is enough rw_ctxs to make forward - * progress even if the client is using one rkey per page - * in each Read chunk. + /* Arbitrary estimate of the needed number of rdma_rw contexts. 
*/ - ctxts = 3 * RPCSVC_MAXPAGES; + maxpayload = min(xprt->xpt_server->sv_max_payload, + RPCSVC_MAXPAYLOAD_RDMA); + ctxts = newxprt->sc_max_requests * 3 * + rdma_rw_mr_factor(dev, newxprt->sc_port_num, + maxpayload >> PAGE_SHIFT); + newxprt->sc_sq_depth = rq_depth + ctxts; if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; @@ -575,6 +577,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) ib_destroy_qp(newxprt->sc_qp); rdma_destroy_id(newxprt->sc_cm_id); + rpcrdma_rn_unregister(dev, &newxprt->sc_rn); /* This call to put will destroy the transport */ svc_xprt_put(&newxprt->sc_xprt); return NULL; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 83cc095846d35..c5f7bbf5775ff 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -358,7 +358,7 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, - struct cmsghdr *cmsg, int ret) + unsigned int *msg_flags, struct cmsghdr *cmsg, int ret) { u8 content_type = tls_get_record_type(sock->sk, cmsg); u8 level, description; @@ -371,7 +371,7 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, * record, even though there might be more frames * waiting to be decrypted. */ - msg->msg_flags &= ~MSG_EOR; + *msg_flags &= ~MSG_EOR; break; case TLS_RECORD_TYPE_ALERT: tls_alert_recv(sock->sk, msg, &level, &description); @@ -386,19 +386,33 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, } static int -xs_sock_recv_cmsg(struct socket *sock, struct msghdr *msg, int flags) +xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; int ret; - msg->msg_control = &u; - msg->msg_controllen = sizeof(u); - ret = sock_recvmsg(sock, msg, flags); - if (msg->msg_controllen != sizeof(u)) - ret = xs_sock_process_cmsg(sock, msg, &u.cmsg, ret); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, flags); + if (ret > 0 && + tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, ret); + ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, + -EAGAIN); + } return ret; } @@ -408,7 +422,13 @@ xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) ssize_t ret; if (seek != 0) iov_iter_advance(&msg->msg_iter, seek); - ret = xs_sock_recv_cmsg(sock, msg, flags); + ret = sock_recvmsg(sock, msg, flags); + /* Handle TLS inband control message lazily */ + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags); + } return ret > 0 ? 
ret + seek : ret; } @@ -434,7 +454,7 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { iov_iter_discard(&msg->msg_iter, ITER_DEST, count); - return xs_sock_recv_cmsg(sock, msg, flags); + return xs_sock_recvmsg(sock, msg, flags, 0); } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE @@ -2726,20 +2746,14 @@ static void xs_tcp_tls_setup_socket(struct work_struct *work) if (status) goto out_close; xprt_release_write(lower_xprt, NULL); - trace_rpc_socket_connect(upper_xprt, upper_transport->sock, 0); - if (!xprt_test_and_set_connected(upper_xprt)) { - upper_xprt->connect_cookie++; - clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); - xprt_clear_connecting(upper_xprt); - - upper_xprt->stat.connect_count++; - upper_xprt->stat.connect_time += (long)jiffies - - upper_xprt->stat.connect_start; - xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); - } rpc_shutdown_client(lower_clnt); + /* Check for ingress data that arrived before the socket's + * ->data_ready callback was set up. + */ + xs_poll_check_readable(upper_transport); + out_unlock: current_restore_flags(pflags, PF_MEMALLOC); upper_transport->clnt = NULL; diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 8584893b47851..ea5bb131ebd06 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -425,7 +425,7 @@ static void tipc_aead_free(struct rcu_head *rp) } free_percpu(aead->tfm_entry); kfree_sensitive(aead->key); - kfree(aead); + kfree_sensitive(aead); } static int tipc_aead_users(struct tipc_aead __rcu *aead) @@ -818,7 +818,11 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, } /* Get net to avoid freed tipc_crypto when delete namespace */ - get_net(aead->crypto->net); + if (!maybe_get_net(aead->crypto->net)) { + tipc_bearer_put(b); + rc = -ENODEV; + goto exit; + } /* Now, do encrypt */ rc = crypto_aead_encrypt(req); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 685389d4b245e..775fd4f3f0725 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -292,7 +292,7 @@ void tipc_disc_remove_dest(struct tipc_discoverer *d) */ static void tipc_disc_timeout(struct timer_list *t) { - struct tipc_discoverer *d = from_timer(d, t, timer); + struct tipc_discoverer *d = timer_container_of(d, t, timer); struct tipc_net *tn = tipc_net(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; diff --git a/net/tipc/link.c b/net/tipc/link.c index 18be6ff4c3db0..3ee44d7317004 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2228,7 +2228,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) break; - strncpy(if_name, data, TIPC_MAX_IF_NAME); + strscpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index b45c5b91bc7af..572b79bf76ce1 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -630,7 +630,7 @@ void tipc_mon_get_state(struct net *net, u32 addr, static void mon_timeout(struct timer_list *t) { - struct tipc_monitor *mon = from_timer(mon, t, timer); + struct tipc_monitor *mon = timer_container_of(mon, t, timer); struct tipc_peer *self; int best_member_cnt = dom_size(mon->peer_cnt) - 1; diff --git a/net/tipc/node.c b/net/tipc/node.c index ccf5e427f43eb..a07fb073368ca 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -800,7 +800,7 @@ static bool tipc_node_cleanup(struct tipc_node *peer) */ static 
void tipc_node_timeout(struct timer_list *t) { - struct tipc_node *n = from_timer(n, t, timer); + struct tipc_node *n = timer_container_of(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int remains = n->link_cnt; @@ -1581,7 +1581,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { - strncpy(linkname, tipc_link_name(link), len); + strscpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 65dcbb54f55d9..e028bf6584992 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2862,7 +2862,7 @@ static void tipc_sk_retry_connect(struct sock *sk, struct sk_buff_head *list) static void tipc_sk_timeout(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); struct tipc_sock *tsk = tipc_sk(sk); u32 pnode = tsk_peer_node(tsk); struct sk_buff_head list; @@ -3642,7 +3642,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) || nla_put_u32(skb, TIPC_NLA_SOCK_UID, from_kuid_munged(sk_user_ns(NETLINK_CB(cb->skb).sk), - sock_i_uid(sk))) || + sk_uid(sk))) || nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE, tipc_diag_gen_cookie(sk), TIPC_NLA_SOCK_PAD)) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 621addab28349..f8490d94e3236 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -105,7 +105,7 @@ void tipc_sub_report_overlap(struct tipc_subscription *sub, static void tipc_sub_timeout(struct timer_list *t) { - struct tipc_subscription *sub = from_timer(sub, t, timer); + struct tipc_subscription *sub = timer_container_of(sub, t, timer); spin_lock(&sub->lock); tipc_sub_send_event(sub, NULL, TIPC_SUBSCR_TIMEOUT); diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 8ee0c07d00e9b..ffe577bf6b515 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -704,8 +704,10 @@ static void tipc_topsrv_stop(struct net *net) for (id = 0; srv->idr_in_use; id++) { con = idr_find(&srv->conn_idr, id); if (con) { + conn_get(con); spin_unlock_bh(&srv->idr_lock); tipc_conn_close(con); + conn_put(con); spin_lock_bh(&srv->idr_lock); } } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 108a4cc2e0010..b85ab0fb3b8cc 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -172,7 +172,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, struct udp_media_addr *dst, struct dst_cache *cache) { struct dst_entry *ndst; - int ttl, err = 0; + int ttl, err; local_bh_disable(); ndst = dst_cache_get(cache); @@ -197,7 +197,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->port, - dst->port, false, true); + dst->port, false, true, 0); #if IS_ENABLED(CONFIG_IPV6) } else { if (!ndst) { @@ -217,13 +217,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, dst_cache_set_ip6(cache, ndst, &fl6.saddr); } ttl = ip6_dst_hoplimit(ndst); - err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, - &src->ipv6, &dst->ipv6, 0, ttl, 0, - src->port, dst->port, false); + udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, + &src->ipv6, &dst->ipv6, 0, ttl, 0, + src->port, dst->port, false, 0); #endif } local_bh_enable(); - return err; + return 0; tx_error: local_bh_enable(); @@ -489,7 +489,7 @@ int 
tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); b = tipc_bearer_find(net, bname); - if (!b) { + if (!b || b->bcast_addr.media_id != TIPC_MEDIA_TYPE_UDP) { rtnl_unlock(); return -EINVAL; } @@ -500,7 +500,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) rtnl_lock(); b = rtnl_dereference(tn->bearer_list[bid]); - if (!b) { + if (!b || b->bcast_addr.media_id != TIPC_MEDIA_TYPE_UDP) { rtnl_unlock(); return -EINVAL; } diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 65b0da6fdf6a7..095cf31bae0ba 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -512,9 +512,8 @@ static int tls_strp_read_sock(struct tls_strparser *strp) if (inq < strp->stm.full_len) return tls_strp_read_copy(strp, true); + tls_strp_load_anchor_with_queue(strp, inq); if (!strp->stm.full_len) { - tls_strp_load_anchor_with_queue(strp, inq); - sz = tls_rx_msg_size(strp, strp->anchor); if (sz < 0) { tls_strp_abort_strp(strp, sz); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 914d4e1516a3c..549d1ea01a72a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -872,6 +872,19 @@ more_data: delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); delta -= msg->sg.size; + + if ((s32)delta > 0) { + /* It indicates that we executed bpf_msg_pop_data(), + * causing the plaintext data size to decrease. + * Therefore the encrypted data size also needs to + * correspondingly decrease. We only need to subtract + * delta to calculate the new ciphertext length since + * ktls does not support block encryption. + */ + struct sk_msg *enc = &ctx->open_rec->msg_encrypted; + + sk_msg_trim(sk, enc, enc->sg.size - delta); + } } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc && !full_record) { @@ -908,6 +921,13 @@ more_data: &msg_redir, send, flags); lock_sock(sk); if (err < 0) { + /* Regardless of whether the data represented by + * msg_redir is sent successfully, we have already + * uncharged it via sk_msg_return_zero(). The + * msg->sg.size represents the remaining unprocessed + * data, which needs to be uncharged here. 
+ */ + sk_mem_uncharge(sk, msg->sg.size); *copied -= sk_msg_free_nocharge(sk, &msg_redir); msg->sg.size = 0; } @@ -1120,9 +1140,13 @@ alloc_encrypted: num_async++; else if (ret == -ENOMEM) goto wait_for_memory; - else if (ctx->open_rec && ret == -ENOSPC) + else if (ctx->open_rec && ret == -ENOSPC) { + if (msg_pl->cork_bytes) { + ret = 0; + goto send_end; + } goto rollback_iter; - else if (ret != -EAGAIN) + } else if (ret != -EAGAIN) goto send_end; } continue; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f78a2492826f9..6d7c110814ffa 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -85,10 +85,13 @@ #include <linux/file.h> #include <linux/filter.h> #include <linux/fs.h> +#include <linux/fs_struct.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mount.h> #include <linux/namei.h> +#include <linux/net.h> +#include <linux/pidfs.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/sched/signal.h> @@ -654,6 +657,11 @@ static void unix_sock_destructor(struct sock *sk) #endif } +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); @@ -688,10 +696,16 @@ static void unix_release_sock(struct sock *sk, int embrion) if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb && !unix_skb_len(skb)) + skb = skb_peek_next(skb, &sk->sk_receive_queue); +#endif unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); - if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) + if (skb || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); @@ -734,13 +748,47 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_gc(); /* Garbage collect fds */ } -static void init_peercred(struct sock *sk) +struct unix_peercred { + struct pid *peer_pid; + const struct cred *peer_cred; +}; + +static inline int prepare_peercred(struct unix_peercred *peercred) +{ + struct pid *pid; + int err; + + pid = task_tgid(current); + err = pidfs_register_pid(pid); + if (likely(!err)) { + peercred->peer_pid = get_pid(pid); + peercred->peer_cred = get_current_cred(); + } + return err; +} + +static void drop_peercred(struct unix_peercred *peercred) { - sk->sk_peer_pid = get_pid(task_tgid(current)); - sk->sk_peer_cred = get_current_cred(); + const struct cred *cred = NULL; + struct pid *pid = NULL; + + might_sleep(); + + swap(peercred->peer_pid, pid); + swap(peercred->peer_cred, cred); + + put_pid(pid); + put_cred(cred); } -static void update_peercred(struct sock *sk) +static inline void init_peercred(struct sock *sk, + const struct unix_peercred *peercred) +{ + sk->sk_peer_pid = peercred->peer_pid; + sk->sk_peer_cred = peercred->peer_cred; +} + +static void update_peercred(struct sock *sk, struct unix_peercred *peercred) { const struct cred *old_cred; struct pid *old_pid; @@ -748,11 +796,11 @@ static void update_peercred(struct sock *sk) spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; - init_peercred(sk); + init_peercred(sk, peercred); spin_unlock(&sk->sk_peer_lock); - put_pid(old_pid); - put_cred(old_cred); + peercred->peer_pid = old_pid; + peercred->peer_cred = old_cred; } static void copy_peercred(struct sock *sk, struct sock *peersk) @@ -765,11 
+813,17 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_unlock(&sk->sk_peer_lock); } +static bool unix_may_passcred(const struct sock *sk) +{ + return sk->sk_scm_credentials || sk->sk_scm_pidfd; +} + static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); + struct unix_peercred peercred = {}; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) @@ -777,6 +831,9 @@ static int unix_listen(struct socket *sock, int backlog) err = -EINVAL; if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ + err = prepare_peercred(&peercred); + if (err) + goto out; unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; @@ -786,11 +843,12 @@ static int unix_listen(struct socket *sock, int backlog) WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ - update_peercred(sk); + update_peercred(sk, &peercred); err = 0; out_unlock: unix_state_unlock(sk); + drop_peercred(&peercred); out: return err; } @@ -871,6 +929,52 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) #define unix_show_fdinfo NULL #endif +static bool unix_custom_sockopt(int optname) +{ + switch (optname) { + case SO_INQ: + return true; + default: + return false; + } +} + +static int unix_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct unix_sock *u = unix_sk(sock->sk); + struct sock *sk = sock->sk; + int val; + + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + if (!unix_custom_sockopt(optname)) + return sock_setsockopt(sock, level, optname, optval, optlen); + + if (optlen != sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + switch (optname) { + case SO_INQ: + if (sk->sk_type != SOCK_STREAM) + return -EINVAL; + + if (val > 1 || val < 0) + return -EINVAL; + + WRITE_ONCE(u->recvmsg_inq, val); + break; + default: + return -ENOPROTOOPT; + } + + return 0; +} + static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, @@ -887,6 +991,7 @@ static const struct proto_ops unix_stream_ops = { #endif .listen = unix_listen, .shutdown = unix_shutdown, + .setsockopt = unix_setsockopt, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, @@ -950,13 +1055,6 @@ static void unix_close(struct sock *sk, long timeout) */ } -static void unix_unhash(struct sock *sk) -{ - /* Nothing to do here, unix socket does not need a ->unhash(). - * This is merely for sockmap. 
- */ -} - static bool unix_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SOCKET) { @@ -987,7 +1085,6 @@ struct proto unix_stream_proto = { .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, - .unhash = unix_unhash, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, @@ -1018,6 +1115,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sock_init_data(sock, sk); + sk->sk_scm_rights = 1; sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; @@ -1060,6 +1158,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, switch (sock->type) { case SOCK_STREAM: + set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); sock->ops = &unix_stream_ops; break; /* @@ -1101,7 +1200,7 @@ static int unix_release(struct socket *sock) } static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, - int type) + int type, int flags) { struct inode *inode; struct path path; @@ -1109,13 +1208,39 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, int err; unix_mkname_bsd(sunaddr, addr_len); - err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); - if (err) - goto fail; - err = path_permission(&path, MAY_WRITE); - if (err) - goto path_put; + if (flags & SOCK_COREDUMP) { + const struct cred *cred; + struct cred *kcred; + struct path root; + + kcred = prepare_kernel_cred(&init_task); + if (!kcred) { + err = -ENOMEM; + goto fail; + } + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + + cred = override_creds(kcred); + err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, + LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | + LOOKUP_NO_MAGICLINKS, &path); + put_cred(revert_creds(cred)); + path_put(&root); + if (err) + goto fail; + } else { + err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); + if (err) + goto fail; + + err = path_permission(&path, MAY_WRITE); + if (err) + goto path_put; + } err = -ECONNREFUSED; inode = d_backing_inode(path.dentry); @@ -1165,12 +1290,12 @@ static struct sock *unix_find_abstract(struct net *net, static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type) + int addr_len, int type, int flags) { struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(sunaddr, addr_len, type); + sk = unix_find_bsd(sunaddr, addr_len, type, flags); else sk = unix_find_abstract(net, sunaddr, addr_len, type); @@ -1419,16 +1544,14 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(unix_sk(sk)->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; } restart: - other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); + other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; @@ -1525,6 +1648,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; + struct unix_peercred peercred = {}; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; unsigned char state; @@ 
-1539,9 +1663,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(u->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1561,6 +1683,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; } + err = prepare_peercred(&peercred); + if (err) + goto out; + /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (!skb) { @@ -1570,7 +1696,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. */ - other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags); if (IS_ERR(other)) { err = PTR_ERR(other); goto out_free_skb; @@ -1633,10 +1759,12 @@ restart: /* The way is open! Fastly set all the necessary fields... */ sock_hold(sk); - unix_peer(newsk) = sk; - newsk->sk_state = TCP_ESTABLISHED; - newsk->sk_type = sk->sk_type; - init_peercred(newsk); + unix_peer(newsk) = sk; + newsk->sk_state = TCP_ESTABLISHED; + newsk->sk_type = sk->sk_type; + newsk->sk_scm_recv_flags = other->sk_scm_recv_flags; + init_peercred(newsk, &peercred); + newu = unix_sk(newsk); newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); @@ -1695,20 +1823,33 @@ out_free_skb: out_free_sk: unix_release_sock(newsk, 0); out: + drop_peercred(&peercred); return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { + struct unix_peercred ska_peercred = {}, skb_peercred = {}; struct sock *ska = socka->sk, *skb = sockb->sk; + int err; + + err = prepare_peercred(&ska_peercred); + if (err) + return err; + + err = prepare_peercred(&skb_peercred); + if (err) { + drop_peercred(&ska_peercred); + return err; + } /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; - init_peercred(ska); - init_peercred(skb); + init_peercred(ska, &ska_peercred); + init_peercred(skb, &skb_peercred); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; @@ -1717,17 +1858,6 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) return 0; } -static void unix_sock_inherit_flags(const struct socket *old, - struct socket *new) -{ - if (test_bit(SOCK_PASSCRED, &old->flags)) - set_bit(SOCK_PASSCRED, &new->flags); - if (test_bit(SOCK_PASSPIDFD, &old->flags)) - set_bit(SOCK_PASSPIDFD, &new->flags); - if (test_bit(SOCK_PASSSEC, &old->flags)) - set_bit(SOCK_PASSSEC, &new->flags); -} - static int unix_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { @@ -1760,11 +1890,13 @@ static int unix_accept(struct socket *sock, struct socket *newsock, skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); + if (tsk->sk_type == SOCK_STREAM) + set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + /* attach accepted sock to socket */ unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; - unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; @@ -1859,7 +1991,7 @@ static void unix_destruct_scm(struct sk_buff *skb) struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; + scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); @@ -1873,7 +2005,7 @@ static int unix_scm_to_skb(struct 
scm_cookie *scm, struct sk_buff *skb, bool sen { int err = 0; - UNIXCB(skb).pid = get_pid(scm->pid); + UNIXCB(skb).pid = get_pid(scm->pid); UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; @@ -1885,30 +2017,46 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } -static bool unix_passcred_enabled(const struct socket *sock, - const struct sock *other) +static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm) { - return test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || - test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); + scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); + unix_set_secdata(scm, skb); } -/* +/** + * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed. + * @skb: skb to attach creds to. + * @sk: Sender sock. + * @other: Receiver sock. + * * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. + * + * Context: May sleep. + * Return: On success zero, on error a negative error code is returned. */ -static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, - const struct sock *other) +static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, + const struct sock *other) { if (UNIXCB(skb).pid) - return; - if (unix_passcred_enabled(sock, other)) { - UNIXCB(skb).pid = get_pid(task_tgid(current)); + return 0; + + if (unix_may_passcred(sk) || unix_may_passcred(other) || + !other->sk_socket) { + struct pid *pid; + int err; + + pid = task_tgid(current); + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + UNIXCB(skb).pid = get_pid(pid); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } + + return 0; } static bool unix_skb_scm_eq(struct sk_buff *skb, @@ -1982,9 +2130,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(u->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -2026,7 +2172,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) { lookup: other = unix_find_other(sock_net(sk), msg->msg_name, - msg->msg_namelen, sk->sk_type); + msg->msg_namelen, sk->sk_type, 0); if (IS_ERR(other)) { err = PTR_ERR(other); goto out_free; @@ -2045,6 +2191,10 @@ lookup: goto out_sock_put; } + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out_sock_put; + restart: sk_locked = 0; unix_state_lock(other); @@ -2101,6 +2251,11 @@ restart_locked: goto out_unlock; } + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; + } + if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) @@ -2147,7 +2302,7 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); - maybe_add_creds(skb, sock, other); + scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2175,14 +2330,14 @@ out: #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) -static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, +static int queue_oob(struct sock *sk, struct msghdr *msg, 
struct sock *other, struct scm_cookie *scm, bool fds_sent) { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; int err; - skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; @@ -2191,6 +2346,10 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other if (err < 0) goto out; + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out; + skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); @@ -2201,16 +2360,20 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { - unix_state_unlock(other); err = -EPIPE; - goto out; + goto out_unlock; + } + + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; } - maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); WRITE_ONCE(ousk->oob_skb, skb); + WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); @@ -2219,6 +2382,8 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other other->sk_data_ready(other); return 0; +out_unlock: + unix_state_unlock(other); out: consume_skb(skb); return err; @@ -2231,6 +2396,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sock *other = NULL; + struct unix_sock *otheru; struct scm_cookie scm; bool fds_sent = false; int err, sent = 0; @@ -2254,14 +2420,16 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) { err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; goto out_err; - } else { - other = unix_peer(sk); - if (!other) { - err = -ENOTCONN; - goto out_err; - } } + other = unix_peer(sk); + if (!other) { + err = -ENOTCONN; + goto out_err; + } + + otheru = unix_sk(other); + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto out_pipe; @@ -2298,10 +2466,13 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, fds_sent = true; + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out_free; + if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; - err = skb_splice_from_iter(skb, &msg->msg_iter, size, - sk->sk_allocation); + err = skb_splice_from_iter(skb, &msg->msg_iter, size); if (err < 0) goto out_free; @@ -2322,9 +2493,19 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, (other->sk_shutdown & RCV_SHUTDOWN)) goto out_pipe_unlock; - maybe_add_creds(skb, sock, other); + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + unix_state_unlock(other); + err = -EPERM; + goto out_free; + } + scm_stat_add(other, skb); - skb_queue_tail(&other->sk_receive_queue, skb); + + spin_lock(&other->sk_receive_queue.lock); + WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); + unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2332,7 +2513,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { - err = queue_oob(sock, msg, other, &scm, fds_sent); + err = queue_oob(sk, msg, other, &scm, fds_sent); if (err) goto out_err; sent++; @@ -2434,12 +2615,10 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ - unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. 
*/ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && - (sk->sk_shutdown & RCV_SHUTDOWN)) + (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN)) err = 0; - unix_state_unlock(sk); goto out; } @@ -2470,8 +2649,7 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, memset(&scm, 0, sizeof(scm)); - scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(&scm, skb); + unix_skb_to_scm(skb, &scm); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) @@ -2578,11 +2756,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, return timeo; } -static unsigned int unix_skb_len(const struct sk_buff *skb) -{ - return skb->len - UNIXCB(skb).consumed; -} - struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); @@ -2597,11 +2770,11 @@ struct unix_stream_read_state { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { + struct sk_buff *oob_skb, *read_skb = NULL; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; - struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); @@ -2616,8 +2789,16 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) oob_skb = u->oob_skb; - if (!(state->flags & MSG_PEEK)) + if (!(state->flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); + WRITE_ONCE(u->inq_len, u->inq_len - 1); + + if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && + !unix_skb_len(oob_skb->prev)) { + read_skb = oob_skb->prev; + __skb_unlink(read_skb, &sk->sk_receive_queue); + } + } spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); @@ -2629,6 +2810,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) mutex_unlock(&u->iolock); + consume_skb(read_skb); + if (chunk < 0) return -EFAULT; @@ -2691,6 +2874,7 @@ unlock: static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { + struct sk_buff_head *queue = &sk->sk_receive_queue; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int err; @@ -2698,60 +2882,57 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) return -ENOTCONN; - mutex_lock(&u->iolock); - skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); - mutex_unlock(&u->iolock); - if (!skb) + err = sock_error(sk); + if (err) return err; -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (unlikely(skb == READ_ONCE(u->oob_skb))) { - bool drop = false; - - unix_state_lock(sk); + mutex_lock(&u->iolock); + spin_lock(&queue->lock); - if (sock_flag(sk, SOCK_DEAD)) { - unix_state_unlock(sk); - kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); - return -ECONNRESET; - } + skb = __skb_dequeue(queue); + if (!skb) { + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); + return -EAGAIN; + } - spin_lock(&sk->sk_receive_queue.lock); - if (likely(skb == u->oob_skb)) { - WRITE_ONCE(u->oob_skb, NULL); - drop = true; - } - spin_unlock(&sk->sk_receive_queue.lock); + WRITE_ONCE(u->inq_len, u->inq_len - skb->len); - unix_state_unlock(sk); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb == u->oob_skb) { + WRITE_ONCE(u->oob_skb, NULL); + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); - if (drop) { - kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); - return -EAGAIN; - } + kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); + return -EAGAIN; } #endif + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); + return 
recv_actor(sk, skb); } static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { - struct scm_cookie scm; + int noblock = state->flags & MSG_DONTWAIT; struct socket *sock = state->socket; + struct msghdr *msg = state->msg; struct sock *sk = sock->sk; - struct unix_sock *u = unix_sk(sk); - int copied = 0; + size_t size = state->size; int flags = state->flags; - int noblock = flags & MSG_DONTWAIT; bool check_creds = false; - int target; + struct scm_cookie scm; + unsigned int last_len; + struct unix_sock *u; + int copied = 0; int err = 0; long timeo; + int target; int skip; - size_t size = state->size; - unsigned int last_len; if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { err = -EINVAL; @@ -2771,6 +2952,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, memset(&scm, 0, sizeof(scm)); + u = unix_sk(sk); + /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ @@ -2854,23 +3037,19 @@ unlock: /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; - } else if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) { + } else if (unix_may_passcred(sk)) { /* Copy credentials */ - scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(&scm, skb); + unix_skb_to_scm(skb, &scm); check_creds = true; } /* Copy address just once */ - if (state->msg && state->msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, - state->msg->msg_name); - unix_copy_addr(state->msg, skb->sk); + if (msg && msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); - BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, - state->msg->msg_name, - &state->msg->msg_namelen); + unix_copy_addr(msg, skb->sk); + BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, + &msg->msg_namelen); sunaddr = NULL; } @@ -2899,7 +3078,11 @@ unlock: if (unix_skb_len(skb)) break; - skb_unlink(skb, &sk->sk_receive_queue); + spin_lock(&sk->sk_receive_queue.lock); + WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + __skb_unlink(skb, &sk->sk_receive_queue); + spin_unlock(&sk->sk_receive_queue.lock); + consume_skb(skb); if (scm.fp) @@ -2928,10 +3111,17 @@ unlock: } while (size); mutex_unlock(&u->iolock); - if (state->msg) - scm_recv_unix(sock, state->msg, &scm, flags); - else + if (msg) { + scm_recv_unix(sock, msg, &scm, flags); + + if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) { + msg->msg_inq = READ_ONCE(u->inq_len); + put_cmsg(msg, SOL_SOCKET, SCM_INQ, + sizeof(msg->msg_inq), &msg->msg_inq); + } + } else { scm_destroy(&scm); + } out: return copied ? 
: err; } @@ -3070,9 +3260,11 @@ long unix_inq_len(struct sock *sk) if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; + if (sk->sk_type == SOCK_STREAM) + return READ_ONCE(unix_sk(sk)->inq_len); + spin_lock(&sk->sk_receive_queue.lock); - if (sk->sk_type == SOCK_STREAM || - sk->sk_type == SOCK_SEQPACKET) { + if (sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { @@ -3094,7 +3286,6 @@ EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { - struct path path; struct file *f; int fd; @@ -3104,27 +3295,20 @@ static int unix_open_file(struct sock *sk) if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; - path = unix_sk(sk)->path; - if (!path.dentry) + if (!unix_sk(sk)->path.dentry) return -ENOENT; - path_get(&path); - fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - goto out; + return fd; - f = dentry_open(&path, O_PATH, current_cred()); + f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); if (IS_ERR(f)) { put_unused_fd(fd); - fd = PTR_ERR(f); - goto out; + return PTR_ERR(f); } fd_install(fd, f); -out: - path_put(&path); - return fd; } @@ -3600,7 +3784,7 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) goto unlock; } - uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); diff --git a/net/unix/diag.c b/net/unix/diag.c index 79b182d0e62ae..ca34730261510 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -106,7 +106,7 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb, struct user_namespace *user_ns) { - uid_t uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + uid_t uid = from_kuid_munged(user_ns, sk_uid(sk)); return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid); } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index fc6afbc8d6806..ead6a3c14b879 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -407,6 +407,8 @@ EXPORT_SYMBOL_GPL(vsock_enqueue_accept); static bool vsock_use_local_transport(unsigned int remote_cid) { + lockdep_assert_held(&vsock_register_mutex); + if (!transport_local) return false; @@ -464,6 +466,8 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) remote_flags = vsk->remote_addr.svm_flags; + mutex_lock(&vsock_register_mutex); + switch (sk->sk_type) { case SOCK_DGRAM: new_transport = transport_dgram; @@ -479,12 +483,15 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) new_transport = transport_h2g; break; default: - return -ESOCKTNOSUPPORT; + ret = -ESOCKTNOSUPPORT; + goto err; } if (vsk->transport) { - if (vsk->transport == new_transport) - return 0; + if (vsk->transport == new_transport) { + ret = 0; + goto err; + } /* transport->release() must be called with sock lock acquired. * This path can only be taken during vsock_connect(), where we @@ -508,8 +515,16 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) /* We increase the module refcnt to prevent the transport unloading * while there are open sockets assigned to it. */ - if (!new_transport || !try_module_get(new_transport->module)) - return -ENODEV; + if (!new_transport || !try_module_get(new_transport->module)) { + ret = -ENODEV; + goto err; + } + + /* It's safe to release the mutex after a successful try_module_get(). 
+ * Whichever transport `new_transport` points at, it won't go away until + * the last module_put() below or in vsock_deassign_transport(). + */ + mutex_unlock(&vsock_register_mutex); if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || @@ -528,12 +543,31 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) vsk->transport = new_transport; return 0; +err: + mutex_unlock(&vsock_register_mutex); + return ret; } EXPORT_SYMBOL_GPL(vsock_assign_transport); +/* + * Provide safe access to static transport_{h2g,g2h,dgram,local} callbacks. + * Otherwise we may race with module removal. Do not use on `vsk->transport`. + */ +static u32 vsock_registered_transport_cid(const struct vsock_transport **transport) +{ + u32 cid = VMADDR_CID_ANY; + + mutex_lock(&vsock_register_mutex); + if (*transport) + cid = (*transport)->get_local_cid(); + mutex_unlock(&vsock_register_mutex); + + return cid; +} + bool vsock_find_cid(unsigned int cid) { - if (transport_g2h && cid == transport_g2h->get_local_cid()) + if (cid == vsock_registered_transport_cid(&transport_g2h)) return true; if (transport_h2g && cid == VMADDR_CID_HOST) @@ -994,11 +1028,6 @@ static int vsock_getname(struct socket *sock, vm_addr = &vsk->local_addr; } - if (!vm_addr) { - err = -EINVAL; - goto out; - } - /* sys_getsockname() and sys_getpeername() pass us a * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately * that macro is defined in socket.c instead of .h, so we hardcode its @@ -1013,6 +1042,39 @@ out: return err; } +void vsock_linger(struct sock *sk) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + ssize_t (*unsent)(struct vsock_sock *vsk); + struct vsock_sock *vsk = vsock_sk(sk); + long timeout; + + if (!sock_flag(sk, SOCK_LINGER)) + return; + + timeout = sk->sk_lingertime; + if (!timeout) + return; + + /* Transports must implement `unsent_bytes` if they want to support + * SOCK_LINGER through `vsock_linger()` since we use it to check when + * the socket can be closed. + */ + unsent = vsk->transport->unsent_bytes; + if (!unsent) + return; + + add_wait_queue(sk_sleep(sk), &wait); + + do { + if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait)) + break; + } while (!signal_pending(current) && timeout); + + remove_wait_queue(sk_sleep(sk), &wait); +} +EXPORT_SYMBOL_GPL(vsock_linger); + static int vsock_shutdown(struct socket *sock, int mode) { int err; @@ -1356,6 +1418,28 @@ static int vsock_do_ioctl(struct socket *sock, unsigned int cmd, vsk = vsock_sk(sk); switch (cmd) { + case SIOCINQ: { + ssize_t n_bytes; + + if (!vsk->transport) { + ret = -EOPNOTSUPP; + break; + } + + if (sock_type_connectible(sk->sk_type) && + sk->sk_state == TCP_LISTEN) { + ret = -EINVAL; + break; + } + + n_bytes = vsock_stream_has_data(vsk); + if (n_bytes < 0) { + ret = n_bytes; + break; + } + ret = put_user(n_bytes, arg); + break; + } case SIOCOUTQ: { ssize_t n_bytes; @@ -2503,18 +2587,19 @@ static long vsock_dev_do_ioctl(struct file *filp, unsigned int cmd, void __user *ptr) { u32 __user *p = ptr; - u32 cid = VMADDR_CID_ANY; int retval = 0; + u32 cid; switch (cmd) { case IOCTL_VM_SOCKETS_GET_LOCAL_CID: /* To be compatible with the VMCI behavior, we prioritize the * guest CID instead of well-know host CID (VMADDR_CID_HOST). 
*/ - if (transport_g2h) - cid = transport_g2h->get_local_cid(); - else if (transport_h2g) - cid = transport_h2g->get_local_cid(); + cid = vsock_registered_transport_cid(&transport_g2h); + if (cid == VMADDR_CID_ANY) + cid = vsock_registered_transport_cid(&transport_h2g); + if (cid == VMADDR_CID_ANY) + cid = vsock_registered_transport_cid(&transport_local); if (put_user(cid, p) != 0) retval = -EFAULT; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 31342ab502b4f..432fcbbd14d4f 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -694,15 +694,26 @@ out: static s64 hvs_stream_has_data(struct vsock_sock *vsk) { struct hvsock *hvs = vsk->trans; + bool need_refill; s64 ret; if (hvs->recv_data_len > 0) - return 1; + return hvs->recv_data_len; switch (hvs_channel_readable_payload(hvs->chan)) { case 1: - ret = 1; - break; + need_refill = !hvs->recv_desc; + if (!need_refill) + return -EIO; + + hvs->recv_desc = hv_pkt_iter_first(hvs->chan); + if (!hvs->recv_desc) + return -ENOBUFS; + + ret = hvs_update_recv_data(hvs); + if (ret) + return ret; + return hvs->recv_data_len; case 0: vsk->peer_shutdown |= SEND_SHUTDOWN; ret = 0; diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index f0e48e6911fc4..b6569b0ca2bb7 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -307,7 +307,7 @@ out_rcu: static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) { - int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM; + int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; struct scatterlist pkt, *p; struct virtqueue *vq; struct sk_buff *skb; @@ -316,7 +316,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) vq = vsock->vqs[VSOCK_VQ_RX]; do { - skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL); + skb = virtio_vsock_alloc_linear_skb(total_len, GFP_KERNEL); if (!skb) break; @@ -624,8 +624,9 @@ static void virtio_transport_rx_work(struct work_struct *work) do { virtqueue_disable_cb(vq); for (;;) { + unsigned int len, payload_len; + struct virtio_vsock_hdr *hdr; struct sk_buff *skb; - unsigned int len; if (!virtio_transport_more_replies(vsock)) { /* Stop rx until the device processes already @@ -642,13 +643,22 @@ static void virtio_transport_rx_work(struct work_struct *work) vsock->rx_buf_nr--; /* Drop short/long packets */ - if (unlikely(len < sizeof(struct virtio_vsock_hdr) || + if (unlikely(len < sizeof(*hdr) || len > virtio_vsock_skb_len(skb))) { kfree_skb(skb); continue; } - virtio_vsock_skb_rx_put(skb); + hdr = virtio_vsock_hdr(skb); + payload_len = le32_to_cpu(hdr->len); + if (unlikely(payload_len > len - sizeof(*hdr))) { + kfree_skb(skb); + continue; + } + + if (payload_len) + virtio_vsock_skb_put(skb, payload_len); + virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&virtio_transport, skb); } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d880965..fe92e5fa95b4d 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, uarg = msg_zerocopy_realloc(sk_vsock(vsk), iter->count, - NULL); + NULL, false); if (!uarg) return -1; @@ -107,10 +107,10 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, { if (zcopy) return __zerocopy_sg_from_iter(info->msg, NULL, skb, - &info->msg->msg_iter, - len); + &info->msg->msg_iter, len, NULL); - return 
memcpy_from_msg(skb_put(skb, len), info->msg, len); + virtio_vsock_skb_put(skb, len); + return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len); } static void virtio_transport_init_hdr(struct sk_buff *skb, @@ -441,18 +441,20 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { - if (vvs->rx_bytes + len > vvs->buf_alloc) + if (vvs->buf_used + len > vvs->buf_alloc) return false; vvs->rx_bytes += len; + vvs->buf_used += len; return true; } static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, - u32 len) + u32 bytes_read, u32 bytes_dequeued) { - vvs->rx_bytes -= len; - vvs->fwd_cnt += len; + vvs->rx_bytes -= bytes_read; + vvs->buf_used -= bytes_dequeued; + vvs->fwd_cnt += bytes_dequeued; } void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) @@ -581,11 +583,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - size_t bytes, total = 0; struct sk_buff *skb; u32 fwd_cnt_delta; bool low_rx_bytes; int err = -EFAULT; + size_t total = 0; u32 free_space; spin_lock_bh(&vvs->rx_lock); @@ -597,6 +599,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, } while (total < len && !skb_queue_empty(&vvs->rx_queue)) { + size_t bytes, dequeued = 0; + skb = skb_peek(&vvs->rx_queue); bytes = min_t(size_t, len - total, @@ -620,12 +624,12 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes; if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) { - u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len); - - virtio_transport_dec_rx_pkt(vvs, pkt_len); + dequeued = le32_to_cpu(virtio_vsock_hdr(skb)->len); __skb_unlink(skb, &vvs->rx_queue); consume_skb(skb); } + + virtio_transport_dec_rx_pkt(vvs, bytes, dequeued); } fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt; @@ -781,7 +785,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, msg->msg_flags |= MSG_EOR; } - virtio_transport_dec_rx_pkt(vvs, pkt_len); + virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len); kfree_skb(skb); } @@ -1192,23 +1196,6 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk) vsock_remove_sock(vsk); } -static void virtio_transport_wait_close(struct sock *sk, long timeout) -{ - if (timeout) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(sk_sleep(sk), &wait); - - do { - if (sk_wait_event(sk, &timeout, - sock_flag(sk, SOCK_DONE), &wait)) - break; - } while (!signal_pending(current) && timeout); - - remove_wait_queue(sk_sleep(sk), &wait); - } -} - static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, bool cancel_timeout) { @@ -1278,8 +1265,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk) if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); - if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) - virtio_transport_wait_close(sk, sk->sk_lingertime); + if (!(current->flags & PF_EXITING)) + vsock_linger(sk); if (sock_flag(sk, SOCK_DONE)) { return true; @@ -1735,6 +1722,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto struct sock *sk = sk_vsock(vsk); struct virtio_vsock_hdr *hdr; struct sk_buff *skb; + u32 pkt_len; int off = 0; int err; @@ -1752,7 +1740,8 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto if (le32_to_cpu(hdr->flags) & 
VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count--; - virtio_transport_dec_rx_pkt(vvs, le32_to_cpu(hdr->len)); + pkt_len = le32_to_cpu(hdr->len); + virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len); spin_unlock_bh(&vvs->rx_lock); virtio_transport_send_credit_update(vsk); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index b370070194fa4..7eccd6708d664 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -119,6 +119,8 @@ vmci_transport_packet_init(struct vmci_transport_packet *pkt, u16 proto, struct vmci_handle handle) { + memset(pkt, 0, sizeof(*pkt)); + /* We register the stream control handler as an any cid handle so we * must always send from a source address of VMADDR_CID_ANY */ @@ -131,8 +133,6 @@ vmci_transport_packet_init(struct vmci_transport_packet *pkt, pkt->type = type; pkt->src_port = src->svm_port; pkt->dst_port = dst->svm_port; - memset(&pkt->proto, 0, sizeof(pkt->proto)); - memset(&pkt->_reserved2, 0, sizeof(pkt->_reserved2)); switch (pkt->type) { case VMCI_TRANSPORT_PACKET_TYPE_INVALID: diff --git a/net/wireless/core.c b/net/wireless/core.c index dcce326fdb8ca..a7e2931ffb2ef 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -239,7 +239,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, rdev->opencount--; - if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (rdev->scan_req && rdev->scan_req->req.wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) @@ -995,6 +995,24 @@ int wiphy_register(struct wiphy *wiphy) wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES) return -EINVAL; + /* Allocate radio configuration space for multi-radio wiphy */ + if (wiphy->n_radio > 0) { + int idx; + + wiphy->radio_cfg = kcalloc(wiphy->n_radio, + sizeof(*wiphy->radio_cfg), + GFP_KERNEL); + if (!wiphy->radio_cfg) + return -ENOMEM; + /* + * Initialize wiphy radio parameters to IEEE 802.11 + * MIB default values. RTS threshold is disabled by + * default with the special -1 value. 
+ */ + for (idx = 0; idx < wiphy->n_radio; idx++) + wiphy->radio_cfg[idx].rts_threshold = (u32)-1; + } + /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); @@ -1222,6 +1240,7 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) void wiphy_free(struct wiphy *wiphy) { + kfree(wiphy->radio_cfg); put_device(&wiphy->dev); } EXPORT_SYMBOL(wiphy_free); @@ -1555,7 +1574,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, case NETDEV_DOWN: wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, -1); - if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (rdev->scan_req && rdev->scan_req->req.wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) @@ -1709,7 +1728,7 @@ EXPORT_SYMBOL_GPL(wiphy_work_flush); void wiphy_delayed_work_timer(struct timer_list *t) { - struct wiphy_delayed_work *dwork = from_timer(dwork, t, timer); + struct wiphy_delayed_work *dwork = timer_container_of(dwork, t, timer); wiphy_work_queue(dwork->wiphy, &dwork->work); } diff --git a/net/wireless/core.h b/net/wireless/core.h index c56a35040caa1..b6bd7f4d6385a 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -21,6 +21,13 @@ #define WIPHY_IDX_INVALID -1 +struct cfg80211_scan_request_int { + struct cfg80211_scan_info info; + bool notified; + /* must be last - variable members */ + struct cfg80211_scan_request req; +}; + struct cfg80211_registered_device { const struct cfg80211_ops *ops; struct list_head list; @@ -70,8 +77,8 @@ struct cfg80211_registered_device { struct rb_root bss_tree; u32 bss_generation; u32 bss_entries; - struct cfg80211_scan_request *scan_req; /* protected by RTNL */ - struct cfg80211_scan_request *int_scan_req; + struct cfg80211_scan_request_int *scan_req; /* protected by RTNL */ + struct cfg80211_scan_request_int *int_scan_req; struct sk_buff *scan_msg; struct list_head sched_scan_req_list; time64_t suspend_at; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 05d44a4435189..46394eb2086f6 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -352,8 +352,25 @@ cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a, return -EINVAL; } - if (ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_a) != - ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_b)) { + /* + * Only verify the values in Extended MLD Capabilities that are + * not reserved when transmitted by an AP (and expected to remain the + * same over time). + * The Recommended Max Simultaneous Links subfield in particular is + * reserved when included in a unicast Probe Response frame and may + * also change when the AP adds/removes links. The BTM MLD + * Recommendation For Multiple APs Support subfield is reserved when + * transmitted by an AP. All other bits are currently reserved. + * See IEEE P802.11be/D7.0, Table 9-417o. 
+ */ + if ((ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_a) & + (IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE | + IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE | + IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK)) != + (ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_b) & + (IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE | + IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE | + IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK))) { NL_SET_ERR_MSG(extack, "extended link MLD capabilities/ops mismatch"); return -EINVAL; @@ -850,7 +867,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, mgmt = (const struct ieee80211_mgmt *)params->buf; - if (!ieee80211_is_mgmt(mgmt->frame_control)) + if (!ieee80211_is_mgmt(mgmt->frame_control) || + ieee80211_has_order(mgmt->frame_control)) return -EINVAL; stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; @@ -1331,7 +1349,8 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev, lockdep_assert_wiphy(wiphy); trace_cfg80211_mlo_reconf_add_done(dev, data->added_links, - data->buf, data->len); + data->buf, data->len, + data->driver_initiated); if (WARN_ON(!wdev->valid_links)) return; @@ -1361,11 +1380,16 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev, wdev->links[link_id].client.current_bss = bss_from_pub(bss); + if (data->driver_initiated) + cfg80211_hold_bss(bss_from_pub(bss)); + memcpy(wdev->links[link_id].addr, data->links[link_id].addr, ETH_ALEN); } else { - cfg80211_unhold_bss(bss_from_pub(bss)); + if (!data->driver_initiated) + cfg80211_unhold_bss(bss_from_pub(bss)); + cfg80211_put_bss(wiphy, bss); } } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f039a7d0d6f73..89519aa52893e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -229,6 +229,7 @@ static int validate_beacon_head(const struct nlattr *attr, unsigned int len = nla_len(attr); const struct element *elem; const struct ieee80211_mgmt *mgmt = (void *)data; + const struct ieee80211_ext *ext; unsigned int fixedlen, hdrlen; bool s1g_bcn; @@ -237,8 +238,10 @@ static int validate_beacon_head(const struct nlattr *attr, s1g_bcn = ieee80211_is_s1g_beacon(mgmt->frame_control); if (s1g_bcn) { - fixedlen = offsetof(struct ieee80211_ext, - u.s1g_beacon.variable); + ext = (struct ieee80211_ext *)mgmt; + fixedlen = + offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + + ieee80211_s1g_optional_len(ext->frame_control); hdrlen = offsetof(struct ieee80211_ext, u.s1g_beacon); } else { fixedlen = offsetof(struct ieee80211_mgmt, @@ -469,6 +472,8 @@ nl80211_mbssid_config_policy[NL80211_MBSSID_CONFIG_ATTR_MAX + 1] = { [NL80211_MBSSID_CONFIG_ATTR_INDEX] = { .type = NLA_U8 }, [NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX] = { .type = NLA_U32 }, [NL80211_MBSSID_CONFIG_ATTR_EMA] = { .type = NLA_FLAG }, + [NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID] = + NLA_POLICY_MAX(NLA_U8, IEEE80211_MLD_MAX_NUM_LINKS), }; static const struct nla_policy @@ -477,6 +482,16 @@ nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = { [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, }; +static const struct nla_policy +nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { + [NL80211_S1G_SHORT_BEACON_ATTR_HEAD] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head, + IEEE80211_MAX_DATA_LEN), + [NL80211_S1G_SHORT_BEACON_ATTR_TAIL] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -833,6 +848,7 @@ static const struct 
nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN), [NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG }, [NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT }, + [NL80211_ATTR_EML_CAPABILITY] = { .type = NLA_U16 }, [NL80211_ATTR_PUNCT_BITMAP] = NLA_POLICY_FULL_RANGE(NLA_U32, &nl80211_punct_bitmap_range), @@ -851,6 +867,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLO_RECONF_REM_LINKS] = { .type = NLA_U16 }, [NL80211_ATTR_EPCS] = { .type = NLA_FLAG }, [NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 }, + [NL80211_ATTR_WIPHY_RADIO_INDEX] = { .type = NLA_U8 }, + [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), + [NL80211_ATTR_S1G_SHORT_BEACON] = + NLA_POLICY_NESTED(nl80211_s1g_short_beacon), }; /* policy for the key attributes */ @@ -1580,7 +1600,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, return result; error: - kfree(result); + kfree_sensitive(result); return ERR_PTR(err); } @@ -2443,6 +2463,7 @@ fail: static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx) { const struct wiphy_radio *r = &wiphy->radio[idx]; + const struct wiphy_radio_cfg *rcfg = &wiphy->radio_cfg[idx]; struct nlattr *radio, *freq; int i; @@ -2453,6 +2474,11 @@ static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx) if (nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_INDEX, idx)) goto nla_put_failure; + if (rcfg->rts_threshold && + nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD, + rcfg->rts_threshold)) + goto nla_put_failure; + if (r->antenna_mask && nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, r->antenna_mask)) @@ -2636,7 +2662,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, u32 tx_ant = 0, rx_ant = 0; int res; - res = rdev_get_antenna(rdev, &tx_ant, &rx_ant); + res = rdev_get_antenna(rdev, -1, &tx_ant, &rx_ant); if (!res) { if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_TX, @@ -3605,6 +3631,33 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) return __nl80211_set_channel(rdev, netdev, info, link_id); } +static int nl80211_set_wiphy_radio(struct genl_info *info, + struct cfg80211_registered_device *rdev, + int radio_idx) +{ + u32 rts_threshold = 0, old_rts, changed = 0; + int result; + + if (!rdev->ops->set_wiphy_params) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]) { + rts_threshold = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]); + changed |= WIPHY_PARAM_RTS_THRESHOLD; + } + + old_rts = rdev->wiphy.radio_cfg[radio_idx].rts_threshold; + + rdev->wiphy.radio_cfg[radio_idx].rts_threshold = rts_threshold; + + result = rdev_set_wiphy_params(rdev, radio_idx, changed); + if (result) + rdev->wiphy.radio_cfg[radio_idx].rts_threshold = old_rts; + + return 0; +} + static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = NULL; @@ -3617,6 +3670,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u32 frag_threshold = 0, rts_threshold = 0; u8 coverage_class = 0; u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0; + int radio_idx = -1; rtnl_lock(); /* @@ -3667,6 +3721,19 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (result) return result; + if (info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]) { + /* Radio idx is not expected for non-multi radio wiphy */ + if (rdev->wiphy.n_radio <= 0) + return -EINVAL; 
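As a rough, self-contained illustration of the idiom used by nl80211_set_wiphy_radio() above (save the old per-radio RTS threshold, apply the new one, call into the driver, and restore the cached value if the driver rejects it), a minimal C sketch follows. It is not part of the patch; the structs and function names are hypothetical stand-ins rather than kernel APIs.

#include <stdio.h>

/* Hypothetical stand-ins for the wiphy/radio_cfg state touched above. */
struct radio_cfg {
	unsigned int rts_threshold;
};

struct dev_state {
	struct radio_cfg radio_cfg[2];
};

/* Pretend driver hook; rejects the request for radio 1 to show the rollback. */
static int driver_set_params(struct dev_state *d, int radio_idx)
{
	(void)d;
	return radio_idx == 1 ? -1 : 0;
}

static int apply_radio_rts(struct dev_state *d, int radio_idx, unsigned int rts)
{
	unsigned int old = d->radio_cfg[radio_idx].rts_threshold;
	int err;

	d->radio_cfg[radio_idx].rts_threshold = rts;	/* apply optimistically */
	err = driver_set_params(d, radio_idx);		/* hand off to the driver */
	if (err)					/* restore on failure */
		d->radio_cfg[radio_idx].rts_threshold = old;

	return err;
}

int main(void)
{
	/* (u32)-1 mirrors the "RTS disabled" default used for radio_cfg above. */
	struct dev_state d = {
		.radio_cfg = { { (unsigned int)-1 }, { (unsigned int)-1 } },
	};

	apply_radio_rts(&d, 0, 2347);
	apply_radio_rts(&d, 1, 2347);	/* driver rejects; value rolls back */

	printf("radio0 rts=%u radio1 rts=%u\n",
	       d.radio_cfg[0].rts_threshold,
	       d.radio_cfg[1].rts_threshold);
	return 0;
}

(The hunk above rolls the cached value back when rdev_set_wiphy_params() fails but still returns 0; the sketch propagates the error instead, which is simply one possible design choice for review.)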
+ + radio_idx = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]); + if (radio_idx >= rdev->wiphy.n_radio) + return -EINVAL; + + return nl80211_set_wiphy_radio(info, rdev, radio_idx); + } + if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { struct ieee80211_txq_params txq_params; struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1]; @@ -3756,7 +3823,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) mbm = nla_get_u32(info->attrs[idx]); } - result = rdev_set_tx_power(rdev, txp_wdev, type, mbm); + result = rdev_set_tx_power(rdev, txp_wdev, radio_idx, type, + mbm); if (result) return result; } @@ -3782,7 +3850,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) tx_ant = tx_ant & rdev->wiphy.available_antennas_tx; rx_ant = rx_ant & rdev->wiphy.available_antennas_rx; - result = rdev_set_antenna(rdev, tx_ant, rx_ant); + result = rdev_set_antenna(rdev, radio_idx, tx_ant, rx_ant); if (result) return result; } @@ -3876,16 +3944,30 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (changed) { u8 old_retry_short, old_retry_long; u32 old_frag_threshold, old_rts_threshold; - u8 old_coverage_class; + u8 old_coverage_class, i; u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum; + u32 *old_radio_rts_threshold = NULL; if (!rdev->ops->set_wiphy_params) return -EOPNOTSUPP; + if (rdev->wiphy.n_radio) { + old_radio_rts_threshold = kcalloc(rdev->wiphy.n_radio, + sizeof(u32), + GFP_KERNEL); + if (!old_radio_rts_threshold) + return -ENOMEM; + } + old_retry_short = rdev->wiphy.retry_short; old_retry_long = rdev->wiphy.retry_long; old_frag_threshold = rdev->wiphy.frag_threshold; old_rts_threshold = rdev->wiphy.rts_threshold; + if (old_radio_rts_threshold) { + for (i = 0 ; i < rdev->wiphy.n_radio; i++) + old_radio_rts_threshold[i] = + rdev->wiphy.radio_cfg[i].rts_threshold; + } old_coverage_class = rdev->wiphy.coverage_class; old_txq_limit = rdev->wiphy.txq_limit; old_txq_memory_limit = rdev->wiphy.txq_memory_limit; @@ -3897,8 +3979,13 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.retry_long = retry_long; if (changed & WIPHY_PARAM_FRAG_THRESHOLD) rdev->wiphy.frag_threshold = frag_threshold; - if (changed & WIPHY_PARAM_RTS_THRESHOLD) + if ((changed & WIPHY_PARAM_RTS_THRESHOLD) && + old_radio_rts_threshold) { rdev->wiphy.rts_threshold = rts_threshold; + for (i = 0 ; i < rdev->wiphy.n_radio; i++) + rdev->wiphy.radio_cfg[i].rts_threshold = + rdev->wiphy.rts_threshold; + } if (changed & WIPHY_PARAM_COVERAGE_CLASS) rdev->wiphy.coverage_class = coverage_class; if (changed & WIPHY_PARAM_TXQ_LIMIT) @@ -3908,18 +3995,26 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (changed & WIPHY_PARAM_TXQ_QUANTUM) rdev->wiphy.txq_quantum = txq_quantum; - result = rdev_set_wiphy_params(rdev, changed); + result = rdev_set_wiphy_params(rdev, radio_idx, changed); if (result) { rdev->wiphy.retry_short = old_retry_short; rdev->wiphy.retry_long = old_retry_long; rdev->wiphy.frag_threshold = old_frag_threshold; rdev->wiphy.rts_threshold = old_rts_threshold; + if (old_radio_rts_threshold) { + for (i = 0 ; i < rdev->wiphy.n_radio; i++) + rdev->wiphy.radio_cfg[i].rts_threshold = + old_radio_rts_threshold[i]; + } rdev->wiphy.coverage_class = old_coverage_class; rdev->wiphy.txq_limit = old_txq_limit; rdev->wiphy.txq_memory_limit = old_txq_memory_limit; rdev->wiphy.txq_quantum = old_txq_quantum; - return result; } + + if (old_rts_threshold) + kfree(old_radio_rts_threshold); + return 
result; } return 0; @@ -4009,7 +4104,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_tx_power && !wdev->valid_links) { int dbm, ret; - ret = rdev_get_tx_power(rdev, wdev, 0, &dbm); + ret = rdev_get_tx_power(rdev, wdev, -1, 0, &dbm); if (ret == 0 && nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, DBM_TO_MBM(dbm))) @@ -4081,7 +4176,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_tx_power) { int dbm, ret; - ret = rdev_get_tx_power(rdev, wdev, link_id, &dbm); + ret = rdev_get_tx_power(rdev, wdev, -1, link_id, &dbm); if (ret == 0 && nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, DBM_TO_MBM(dbm))) @@ -5523,11 +5618,13 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, static int nl80211_parse_mbssid_config(struct wiphy *wiphy, struct net_device *dev, + unsigned int link_id, struct nlattr *attrs, struct cfg80211_mbssid_config *config, u8 num_elems) { struct nlattr *tb[NL80211_MBSSID_CONFIG_ATTR_MAX + 1]; + int tx_link_id = -1; if (!wiphy->mbssid_max_interfaces) return -EOPNOTSUPP; @@ -5551,6 +5648,9 @@ static int nl80211_parse_mbssid_config(struct wiphy *wiphy, (!config->index && !num_elems)) return -EINVAL; + if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID]) + tx_link_id = nla_get_u8(tb[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID]); + if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]) { u32 tx_ifindex = nla_get_u32(tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]); @@ -5572,10 +5672,25 @@ static int nl80211_parse_mbssid_config(struct wiphy *wiphy, } config->tx_wdev = tx_netdev->ieee80211_ptr; + /* Caller should call dev_put(config->tx_wdev) from this point */ + + if (config->tx_wdev->valid_links) { + if (tx_link_id == -1 || + !(config->tx_wdev->valid_links & BIT(tx_link_id))) + return -ENOLINK; + + config->tx_link_id = tx_link_id; + } } else { + if (tx_link_id >= 0 && tx_link_id != link_id) + return -EINVAL; + config->tx_wdev = dev->ieee80211_ptr; } } else if (!config->index) { + if (tx_link_id >= 0 && tx_link_id != link_id) + return -EINVAL; + config->tx_wdev = dev->ieee80211_ptr; } else { return -EINVAL; @@ -6100,6 +6215,41 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params return 0; } +static int +nl80211_parse_s1g_short_beacon(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_s1g_short_beacon *sb) +{ + struct nlattr *tb[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1]; + int ret; + + if (!rdev->wiphy.bands[NL80211_BAND_S1GHZ]) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_S1G_SHORT_BEACON_ATTR_MAX, attrs, + NULL, NULL); + if (ret) + return ret; + + /* Short beacon tail is optional (i.e might only include the TIM) */ + if (!tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]) + return -EINVAL; + + sb->short_head = nla_data(tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]); + sb->short_head_len = nla_len(tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]); + sb->short_tail_len = 0; + + if (tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]) { + sb->short_tail = + nla_data(tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]); + sb->short_tail_len = + nla_len(tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]); + } + + sb->update = true; + return 0; +} + static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6325,7 +6475,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_MBSSID_CONFIG]) { - err = 
nl80211_parse_mbssid_config(&rdev->wiphy, dev, + err = nl80211_parse_mbssid_config(&rdev->wiphy, dev, link_id, info->attrs[NL80211_ATTR_MBSSID_CONFIG], ¶ms->mbssid_config, params->beacon.mbssid_ies ? @@ -6340,6 +6490,22 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_S1G_SHORT_BEACON]) { + if (!info->attrs[NL80211_ATTR_S1G_LONG_BEACON_PERIOD]) { + err = -EINVAL; + goto out; + } + + params->s1g_long_beacon_period = nla_get_u8( + info->attrs[NL80211_ATTR_S1G_LONG_BEACON_PERIOD]); + + err = nl80211_parse_s1g_short_beacon( + rdev, info->attrs[NL80211_ATTR_S1G_SHORT_BEACON], + ¶ms->s1g_short_beacon); + if (err) + goto out; + } + err = nl80211_calculate_ap_params(params); if (err) goto out; @@ -6448,6 +6614,14 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) goto out; } + attr = info->attrs[NL80211_ATTR_S1G_SHORT_BEACON]; + if (attr) { + err = nl80211_parse_s1g_short_beacon(rdev, attr, + ¶ms->s1g_short_beacon); + if (err) + goto out; + } + err = rdev_change_beacon(rdev, dev, params); out: @@ -6705,6 +6879,185 @@ static bool nl80211_put_signal(struct sk_buff *msg, u8 mask, s8 *signal, return true; } +static int nl80211_fill_link_station(struct sk_buff *msg, + struct cfg80211_registered_device *rdev, + struct link_station_info *link_sinfo) +{ + struct nlattr *bss_param, *link_sinfoattr; + +#define PUT_LINK_SINFO(attr, memb, type) do { \ + BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \ + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \ + nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \ + link_sinfo->memb)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_LINK_SINFO_U64(attr, memb) do { \ + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ ## attr) && \ + nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \ + link_sinfo->memb, NL80211_STA_INFO_PAD)) \ + goto nla_put_failure; \ + } while (0) + + link_sinfoattr = nla_nest_start_noflag(msg, NL80211_ATTR_STA_INFO); + if (!link_sinfoattr) + goto nla_put_failure; + + PUT_LINK_SINFO(INACTIVE_TIME, inactive_time, u32); + + if (link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES64)) && + nla_put_u32(msg, NL80211_STA_INFO_RX_BYTES, + (u32)link_sinfo->rx_bytes)) + goto nla_put_failure; + + if (link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES64)) && + nla_put_u32(msg, NL80211_STA_INFO_TX_BYTES, + (u32)link_sinfo->tx_bytes)) + goto nla_put_failure; + + PUT_LINK_SINFO_U64(RX_BYTES64, rx_bytes); + PUT_LINK_SINFO_U64(TX_BYTES64, tx_bytes); + PUT_LINK_SINFO_U64(RX_DURATION, rx_duration); + PUT_LINK_SINFO_U64(TX_DURATION, tx_duration); + + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) + PUT_LINK_SINFO(AIRTIME_WEIGHT, airtime_weight, u16); + + switch (rdev->wiphy.signal_type) { + case CFG80211_SIGNAL_TYPE_MBM: + PUT_LINK_SINFO(SIGNAL, signal, u8); + PUT_LINK_SINFO(SIGNAL_AVG, signal_avg, u8); + break; + default: + break; + } + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL)) { + if (!nl80211_put_signal(msg, link_sinfo->chains, + link_sinfo->chain_signal, + NL80211_STA_INFO_CHAIN_SIGNAL)) + goto nla_put_failure; + } + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) { + if (!nl80211_put_signal(msg, link_sinfo->chains, + link_sinfo->chain_signal_avg, + NL80211_STA_INFO_CHAIN_SIGNAL_AVG)) + goto nla_put_failure; + } + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) { + if 
(!nl80211_put_sta_rate(msg, &link_sinfo->txrate, + NL80211_STA_INFO_TX_BITRATE)) + goto nla_put_failure; + } + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) { + if (!nl80211_put_sta_rate(msg, &link_sinfo->rxrate, + NL80211_STA_INFO_RX_BITRATE)) + goto nla_put_failure; + } + + PUT_LINK_SINFO(RX_PACKETS, rx_packets, u32); + PUT_LINK_SINFO(TX_PACKETS, tx_packets, u32); + PUT_LINK_SINFO(TX_RETRIES, tx_retries, u32); + PUT_LINK_SINFO(TX_FAILED, tx_failed, u32); + PUT_LINK_SINFO(EXPECTED_THROUGHPUT, expected_throughput, u32); + PUT_LINK_SINFO(BEACON_LOSS, beacon_loss_count, u32); + + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) { + bss_param = nla_nest_start_noflag(msg, + NL80211_STA_INFO_BSS_PARAM); + if (!bss_param) + goto nla_put_failure; + + if (((link_sinfo->bss_param.flags & + BSS_PARAM_FLAGS_CTS_PROT) && + nla_put_flag(msg, NL80211_STA_BSS_PARAM_CTS_PROT)) || + ((link_sinfo->bss_param.flags & + BSS_PARAM_FLAGS_SHORT_PREAMBLE) && + nla_put_flag(msg, + NL80211_STA_BSS_PARAM_SHORT_PREAMBLE)) || + ((link_sinfo->bss_param.flags & + BSS_PARAM_FLAGS_SHORT_SLOT_TIME) && + nla_put_flag(msg, + NL80211_STA_BSS_PARAM_SHORT_SLOT_TIME)) || + nla_put_u8(msg, NL80211_STA_BSS_PARAM_DTIM_PERIOD, + link_sinfo->bss_param.dtim_period) || + nla_put_u16(msg, NL80211_STA_BSS_PARAM_BEACON_INTERVAL, + link_sinfo->bss_param.beacon_interval)) + goto nla_put_failure; + + nla_nest_end(msg, bss_param); + } + + PUT_LINK_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); + PUT_LINK_SINFO_U64(BEACON_RX, rx_beacon); + PUT_LINK_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); + PUT_LINK_SINFO(RX_MPDUS, rx_mpdu_count, u32); + PUT_LINK_SINFO(FCS_ERROR_COUNT, fcs_err_count, u32); + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) { + PUT_LINK_SINFO(ACK_SIGNAL, ack_signal, u8); + PUT_LINK_SINFO(ACK_SIGNAL_AVG, avg_ack_signal, s8); + } + +#undef PUT_LINK_SINFO +#undef PUT_LINK_SINFO_U64 + + if (link_sinfo->pertid) { + struct nlattr *tidsattr; + int tid; + + tidsattr = nla_nest_start_noflag(msg, + NL80211_STA_INFO_TID_STATS); + if (!tidsattr) + goto nla_put_failure; + + for (tid = 0; tid < IEEE80211_NUM_TIDS + 1; tid++) { + struct cfg80211_tid_stats *tidstats; + struct nlattr *tidattr; + + tidstats = &link_sinfo->pertid[tid]; + + if (!tidstats->filled) + continue; + + tidattr = nla_nest_start_noflag(msg, tid + 1); + if (!tidattr) + goto nla_put_failure; + +#define PUT_TIDVAL_U64(attr, memb) do { \ + if (tidstats->filled & BIT(NL80211_TID_STATS_ ## attr) && \ + nla_put_u64_64bit(msg, NL80211_TID_STATS_ ## attr, \ + tidstats->memb, NL80211_TID_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + PUT_TIDVAL_U64(RX_MSDU, rx_msdu); + PUT_TIDVAL_U64(TX_MSDU, tx_msdu); + PUT_TIDVAL_U64(TX_MSDU_RETRIES, tx_msdu_retries); + PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed); + +#undef PUT_TIDVAL_U64 + if ((tidstats->filled & + BIT(NL80211_TID_STATS_TXQ_STATS)) && + !nl80211_put_txq_stats(msg, &tidstats->txq_stats, + NL80211_TID_STATS_TXQ_STATS)) + goto nla_put_failure; + + nla_nest_end(msg, tidattr); + } + + nla_nest_end(msg, tidsattr); + } + + nla_nest_end(msg, link_sinfoattr); + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, @@ -6713,6 +7066,9 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, { void *hdr; struct nlattr *sinfoattr, *bss_param; + struct link_station_info *link_sinfo; + struct nlattr *links, 
*link; + int link_id; hdr = nl80211hdr_put(msg, portid, seq, flags, cmd); if (!hdr) { @@ -6927,6 +7283,40 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; } + if (sinfo->valid_links) { + links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); + if (!links) + goto nla_put_failure; + + for_each_valid_link(sinfo, link_id) { + link_sinfo = sinfo->links[link_id]; + + if (WARN_ON_ONCE(!link_sinfo)) + continue; + + if (!is_valid_ether_addr(link_sinfo->addr)) + continue; + + link = nla_nest_start(msg, link_id + 1); + if (!link) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, + link_id)) + goto nla_put_failure; + + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, + link_sinfo->addr)) + goto nla_put_failure; + + if (nl80211_fill_link_station(msg, rdev, link_sinfo)) + goto nla_put_failure; + + nla_nest_end(msg, link); + } + nla_nest_end(msg, links); + } + cfg80211_sinfo_release_content(sinfo); genlmsg_end(msg, hdr); return 0; @@ -6937,6 +7327,194 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, return -EMSGSIZE; } +static void cfg80211_sta_set_mld_sinfo(struct station_info *sinfo) +{ + struct link_station_info *link_sinfo; + int link_id, init = 0; + u32 link_inactive_time; + + sinfo->signal = -99; + + for_each_valid_link(sinfo, link_id) { + link_sinfo = sinfo->links[link_id]; + if (!link_sinfo) + continue; + + if ((link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { + sinfo->tx_packets += link_sinfo->tx_packets; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); + } + + if ((link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { + sinfo->rx_packets += link_sinfo->rx_packets; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); + } + + if (link_sinfo->filled & + (BIT_ULL(NL80211_STA_INFO_TX_BYTES) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES64))) { + sinfo->tx_bytes += link_sinfo->tx_bytes; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES); + } + + if (link_sinfo->filled & + (BIT_ULL(NL80211_STA_INFO_RX_BYTES) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES64))) { + sinfo->rx_bytes += link_sinfo->rx_bytes; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES); + } + + if (link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_TX_RETRIES)) { + sinfo->tx_retries += link_sinfo->tx_retries; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); + } + + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED)) { + sinfo->tx_failed += link_sinfo->tx_failed; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); + } + + if (link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC)) { + sinfo->rx_dropped_misc += link_sinfo->rx_dropped_misc; + sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); + } + + if (link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_BEACON_LOSS)) { + sinfo->beacon_loss_count += + link_sinfo->beacon_loss_count; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); + } + + if (link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) { + sinfo->expected_throughput += + link_sinfo->expected_throughput; + sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); + } + + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_MPDUS)) { + sinfo->rx_mpdu_count += link_sinfo->rx_mpdu_count; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_MPDUS); + } + + if (link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_FCS_ERROR_COUNT)) { + sinfo->fcs_err_count += link_sinfo->fcs_err_count; + sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_FCS_ERROR_COUNT); + } + + if 
(link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_BEACON_RX)) { + sinfo->rx_beacon += link_sinfo->rx_beacon; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX); + } + + /* Update MLO signal, signal_avg as best among links */ + if ((link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL)) && + link_sinfo->signal > sinfo->signal) { + sinfo->signal = link_sinfo->signal; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); + } + + if ((link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) && + link_sinfo->signal_avg > sinfo->signal_avg) { + sinfo->signal_avg = link_sinfo->signal_avg; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); + } + + /* Update MLO inactive_time, bss_param based on least + * value for corresponding field of link. + */ + if ((link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME)) && + (!init || + link_inactive_time > link_sinfo->inactive_time)) { + link_inactive_time = link_sinfo->inactive_time; + sinfo->inactive_time = link_sinfo->inactive_time; + sinfo->filled |= NL80211_STA_INFO_INACTIVE_TIME; + } + + if (link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM) && + (!init || + sinfo->bss_param.dtim_period > + link_sinfo->bss_param.dtim_period)) { + sinfo->bss_param.dtim_period = + link_sinfo->bss_param.dtim_period; + sinfo->filled |= NL80211_STA_BSS_PARAM_DTIM_PERIOD; + sinfo->bss_param.beacon_interval = + link_sinfo->bss_param.beacon_interval; + sinfo->filled |= NL80211_STA_BSS_PARAM_BEACON_INTERVAL; + } + + /* Update MLO rates as per last updated link rate */ + if ((link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && + (!init || + link_inactive_time > link_sinfo->inactive_time)) { + sinfo->txrate = link_sinfo->txrate; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); + } + if ((link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && + (!init || + link_inactive_time > link_sinfo->inactive_time)) { + sinfo->rxrate = link_sinfo->rxrate; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); + } + + if (link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_TX_DURATION) && + (!init || + link_inactive_time > link_sinfo->inactive_time)) { + sinfo->tx_duration += link_sinfo->tx_duration; + sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_TX_DURATION); + } + if (link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_RX_DURATION) && + (!init || + link_inactive_time > link_sinfo->inactive_time)) { + sinfo->rx_duration += link_sinfo->rx_duration; + sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_RX_DURATION); + } + init++; + + /* pertid stats accumulate for rx/tx fields */ + if (sinfo->pertid) { + sinfo->pertid->rx_msdu += + link_sinfo->pertid->rx_msdu; + sinfo->pertid->tx_msdu += + link_sinfo->pertid->tx_msdu; + sinfo->pertid->tx_msdu_retries += + link_sinfo->pertid->tx_msdu_retries; + sinfo->pertid->tx_msdu_failed += + link_sinfo->pertid->tx_msdu_failed; + + sinfo->pertid->filled |= + BIT(NL80211_TID_STATS_RX_MSDU) | + BIT(NL80211_TID_STATS_TX_MSDU) | + BIT(NL80211_TID_STATS_TX_MSDU_RETRIES) | + BIT(NL80211_TID_STATS_TX_MSDU_FAILED); + } + } + + /* Reset sinfo->filled bits to exclude fields which don't make + * much sense at the MLO level. 
+ */ + sinfo->filled &= ~BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); + sinfo->filled &= ~BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); +} + static int nl80211_dump_station(struct sk_buff *skb, struct netlink_callback *cb) { @@ -6945,7 +7523,8 @@ static int nl80211_dump_station(struct sk_buff *skb, struct wireless_dev *wdev; u8 mac_addr[ETH_ALEN]; int sta_idx = cb->args[2]; - int err; + bool sinfo_alloc = false; + int err, i; err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL); if (err) @@ -6965,6 +7544,17 @@ static int nl80211_dump_station(struct sk_buff *skb, while (1) { memset(&sinfo, 0, sizeof(sinfo)); + + for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { + sinfo.links[i] = + kzalloc(sizeof(*sinfo.links[0]), GFP_KERNEL); + if (!sinfo.links[i]) { + err = -ENOMEM; + goto out_err; + } + sinfo_alloc = true; + } + err = rdev_dump_station(rdev, wdev->netdev, sta_idx, mac_addr, &sinfo); if (err == -ENOENT) @@ -6972,6 +7562,14 @@ static int nl80211_dump_station(struct sk_buff *skb, if (err) goto out_err; + if (sinfo.valid_links) + cfg80211_sta_set_mld_sinfo(&sinfo); + + /* reset the sinfo_alloc flag as nl80211_send_station() + * always releases sinfo + */ + sinfo_alloc = false; + if (nl80211_send_station(skb, NL80211_CMD_NEW_STATION, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, @@ -6986,6 +7584,8 @@ static int nl80211_dump_station(struct sk_buff *skb, cb->args[2] = sta_idx; err = skb->len; out_err: + if (sinfo_alloc) + cfg80211_sinfo_release_content(&sinfo); wiphy_unlock(&rdev->wiphy); return err; @@ -6998,7 +7598,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) struct station_info sinfo; struct sk_buff *msg; u8 *mac_addr = NULL; - int err; + int err, i; memset(&sinfo, 0, sizeof(sinfo)); @@ -7010,9 +7610,19 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->get_station) return -EOPNOTSUPP; + for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { + sinfo.links[i] = kzalloc(sizeof(*sinfo.links[0]), GFP_KERNEL); + if (!sinfo.links[i]) { + cfg80211_sinfo_release_content(&sinfo); + return -ENOMEM; + } + } + err = rdev_get_station(rdev, dev, mac_addr, &sinfo); - if (err) + if (err) { + cfg80211_sinfo_release_content(&sinfo); return err; + } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { @@ -7020,6 +7630,9 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; } + if (sinfo.valid_links) + cfg80211_sta_set_mld_sinfo(&sinfo); + if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, rdev, dev, mac_addr, &sinfo) < 0) { @@ -7118,6 +7731,11 @@ int cfg80211_check_station_change(struct wiphy *wiphy, return -EINVAL; } + /* Accept EMLSR capabilities only for AP client before association */ + if (statype != CFG80211_STA_AP_CLIENT_UNASSOC && + params->eml_cap_present) + return -EINVAL; + switch (statype) { case CFG80211_STA_AP_MLME_CLIENT: /* Use this only for authorizing/unauthorizing a station */ @@ -7321,6 +7939,10 @@ static int nl80211_set_station_tdls(struct genl_info *info, } } + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) + params->link_sta_params.s1g_capa = + nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]); + err = nl80211_parse_sta_channel_info(info, params); if (err) return err; @@ -7473,6 +8095,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) params.link_sta_params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) { + 
params.eml_cap_present = true; + params.eml_cap = + nla_get_u16(info->attrs[NL80211_ATTR_EML_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) params.airtime_weight = nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]); @@ -7631,10 +8259,20 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) { + params.eml_cap_present = true; + params.eml_cap = + nla_get_u16(info->attrs[NL80211_ATTR_EML_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) params.link_sta_params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) + params.link_sta_params.s1g_capa = + nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]); + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.link_sta_params.opmode_notif_used = true; params.link_sta_params.opmode_notif = @@ -9203,6 +9841,7 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, { unsigned int link_id; bool all_ok = true; + int radio_idx; lockdep_assert_wiphy(wdev->wiphy); @@ -9212,8 +9851,10 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, if (!cfg80211_beaconing_iface_active(wdev)) return true; + radio_idx = cfg80211_get_radio_idx_by_chan(wdev->wiphy, chan); + /* - * FIXME: check if we have a free HW resource/link for chan + * FIXME: check if we have a free radio/link for chan * * This, as well as the FIXME below, requires knowing the link * capabilities of the hardware. @@ -9222,20 +9863,28 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, /* we cannot leave radar channels */ for_each_valid_link(wdev, link_id) { struct cfg80211_chan_def *chandef; + int link_radio_idx; chandef = wdev_chandef(wdev, link_id); if (!chandef || !chandef->chan) continue; + if (!(chandef->chan->flags & IEEE80211_CHAN_RADAR)) + continue; + /* - * FIXME: don't require all_ok, but rather check only the - * correct HW resource/link onto which 'chan' falls, - * as only that link leaves the channel for doing - * the off-channel operation. + * chandef->chan is a radar channel. If the radio/link onto + * which this radar channel falls is the same radio/link onto + * which the input 'chan' falls, off-channel operation should + * not be allowed. Hence, set 'all_ok' to false. */ - if (chandef->chan->flags & IEEE80211_CHAN_RADAR) + link_radio_idx = cfg80211_get_radio_idx_by_chan(wdev->wiphy, + chandef->chan); + if (link_radio_idx == radio_idx) { all_ok = false; + break; + } } if (all_ok) @@ -9256,34 +9905,12 @@ static bool nl80211_check_scan_feat(struct wiphy *wiphy, u32 flags, u32 flag, static int nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, - void *request, struct nlattr **attrs, - bool is_sched_scan) + struct nlattr **attrs, u8 *mac_addr, u8 *mac_addr_mask, + u32 *flags, enum nl80211_feature_flags randomness_flag) { - u8 *mac_addr, *mac_addr_mask; - u32 *flags; - enum nl80211_feature_flags randomness_flag; - if (!attrs[NL80211_ATTR_SCAN_FLAGS]) return 0; - if (is_sched_scan) { - struct cfg80211_sched_scan_request *req = request; - - randomness_flag = wdev ? 
- NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR : - NL80211_FEATURE_ND_RANDOM_MAC_ADDR; - flags = &req->flags; - mac_addr = req->mac_addr; - mac_addr_mask = req->mac_addr_mask; - } else { - struct cfg80211_scan_request *req = request; - - randomness_flag = NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR; - flags = &req->flags; - mac_addr = req->mac_addr; - mac_addr_mask = req->mac_addr_mask; - } - *flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]); if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && @@ -9332,11 +9959,35 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, return 0; } +static int +nl80211_check_scan_flags_sched(struct wiphy *wiphy, struct wireless_dev *wdev, + struct nlattr **attrs, + struct cfg80211_sched_scan_request *req) +{ + return nl80211_check_scan_flags(wiphy, wdev, attrs, + req->mac_addr, req->mac_addr_mask, + &req->flags, + wdev ? NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR : + NL80211_FEATURE_ND_RANDOM_MAC_ADDR); +} + +static int +nl80211_check_scan_flags_reg(struct wiphy *wiphy, struct wireless_dev *wdev, + struct nlattr **attrs, + struct cfg80211_scan_request_int *req) +{ + return nl80211_check_scan_flags(wiphy, wdev, attrs, + req->req.mac_addr, + req->req.mac_addr_mask, + &req->req.flags, + NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR); +} + static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; - struct cfg80211_scan_request *request; + struct cfg80211_scan_request_int *request; struct nlattr *scan_freqs = NULL; bool scan_freqs_khz = false; struct nlattr *attr; @@ -9388,21 +10039,20 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (ie_len > wiphy->max_scan_ie_len) return -EINVAL; - size = struct_size(request, channels, n_channels); + size = struct_size(request, req.channels, n_channels); ssids_offset = size; - size = size_add(size, array_size(sizeof(*request->ssids), n_ssids)); + size = size_add(size, array_size(sizeof(*request->req.ssids), n_ssids)); ie_offset = size; size = size_add(size, ie_len); request = kzalloc(size, GFP_KERNEL); if (!request) return -ENOMEM; - request->n_channels = n_channels; if (n_ssids) - request->ssids = (void *)request + ssids_offset; - request->n_ssids = n_ssids; + request->req.ssids = (void *)request + ssids_offset; + request->req.n_ssids = n_ssids; if (ie_len) - request->ie = (void *)request + ie_offset; + request->req.ie = (void *)request + ie_offset; i = 0; if (scan_freqs) { @@ -9425,7 +10075,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) !cfg80211_wdev_channel_allowed(wdev, chan)) continue; - request->channels[i] = chan; + request->req.channels[i] = chan; i++; } } else { @@ -9446,7 +10096,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) !cfg80211_wdev_channel_allowed(wdev, chan)) continue; - request->channels[i] = chan; + request->req.channels[i] = chan; i++; } } @@ -9457,10 +10107,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; } - request->n_channels = i; + request->req.n_channels = i; - for (i = 0; i < request->n_channels; i++) { - struct ieee80211_channel *chan = request->channels[i]; + for (i = 0; i < request->req.n_channels; i++) { + struct ieee80211_channel *chan = request->req.channels[i]; /* if we can go off-channel to the target channel we're good */ if (cfg80211_off_channel_oper_allowed(wdev, chan)) @@ -9479,22 +10129,23 @@ static int 
nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) err = -EINVAL; goto out_free; } - request->ssids[i].ssid_len = nla_len(attr); - memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr)); + request->req.ssids[i].ssid_len = nla_len(attr); + memcpy(request->req.ssids[i].ssid, + nla_data(attr), nla_len(attr)); i++; } } if (info->attrs[NL80211_ATTR_IE]) { - request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); - memcpy((void *)request->ie, + request->req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + memcpy((void *)request->req.ie, nla_data(info->attrs[NL80211_ATTR_IE]), - request->ie_len); + request->req.ie_len); } for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) - request->rates[i] = + request->req.rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; if (info->attrs[NL80211_ATTR_SCAN_SUPP_RATES]) { @@ -9514,25 +10165,24 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) err = ieee80211_get_ratemask(wiphy->bands[band], nla_data(attr), nla_len(attr), - &request->rates[band]); + &request->req.rates[band]); if (err) goto out_free; } } if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) { - request->duration = + request->req.duration = nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]); - request->duration_mandatory = + request->req.duration_mandatory = nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]); } - err = nl80211_check_scan_flags(wiphy, wdev, request, info->attrs, - false); + err = nl80211_check_scan_flags_reg(wiphy, wdev, info->attrs, request); if (err) goto out_free; - request->no_cck = + request->req.no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]); /* Initial implementation used NL80211_ATTR_MAC to set the specific @@ -9545,19 +10195,21 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) * (NL80211_ATTR_SCAN_FLAGS is used to enable random MAC address use). 
*/ if (info->attrs[NL80211_ATTR_BSSID]) - memcpy(request->bssid, + memcpy(request->req.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN); - else if (!(request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) && + else if (!(request->req.flags & NL80211_SCAN_FLAG_RANDOM_ADDR) && info->attrs[NL80211_ATTR_MAC]) - memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]), + memcpy(request->req.bssid, + nla_data(info->attrs[NL80211_ATTR_MAC]), ETH_ALEN); else - eth_broadcast_addr(request->bssid); + eth_broadcast_addr(request->req.bssid); - request->tsf_report_link_id = nl80211_link_id_or_invalid(info->attrs); - request->wdev = wdev; - request->wiphy = &rdev->wiphy; - request->scan_start = jiffies; + request->req.tsf_report_link_id = + nl80211_link_id_or_invalid(info->attrs); + request->req.wdev = wdev; + request->req.wiphy = &rdev->wiphy; + request->req.scan_start = jiffies; rdev->scan_req = request; err = cfg80211_scan(rdev); @@ -9979,7 +10631,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request->ie_len); } - err = nl80211_check_scan_flags(wiphy, wdev, request, attrs, true); + err = nl80211_check_scan_flags_sched(wiphy, wdev, attrs, request); if (err) goto out_free; @@ -10433,6 +11085,16 @@ skip_beacons: if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX]) params.block_tx = true; + if ((wdev->iftype == NL80211_IFTYPE_AP || + wdev->iftype == NL80211_IFTYPE_P2P_GO) && + info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { + err = nl80211_parse_unsol_bcast_probe_resp( + rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], + &params.unsol_bcast_probe_resp); + if (err) + goto free; + } + params.link_id = link_id; + err = rdev_channel_switch(rdev, dev, &params); @@ -16235,6 +16897,14 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) params.counter_offset_presp = offset; } + if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { + err = nl80211_parse_unsol_bcast_probe_resp( + rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], + &params.unsol_bcast_probe_resp); + if (err) + goto out; + } + params.link_id = nl80211_link_id(info->attrs); + err = rdev_color_change(rdev, dev, &params); @@ -16889,6 +17559,7 @@ static int nl80211_set_sar_specs(struct sk_buff *skb, struct genl_info *info) if (!sar_spec) return -ENOMEM; + sar_spec->num_sub_specs = specs; sar_spec->type = type; specs = 0; nla_for_each_nested(spec_list, tb[NL80211_SAR_ATTR_SPECS], rem) { @@ -17859,7 +18530,7 @@ void nl80211_notify_iface(struct cfg80211_registered_device *rdev, static int nl80211_add_scan_req(struct sk_buff *msg, struct cfg80211_registered_device *rdev) { - struct cfg80211_scan_request *req = rdev->scan_req; + struct cfg80211_scan_request_int *req = rdev->scan_req; struct nlattr *nest; int i; struct cfg80211_scan_info *info; @@ -17870,19 +18541,20 @@ static int nl80211_add_scan_req(struct sk_buff *msg, nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_SSIDS); if (!nest) goto nla_put_failure; - for (i = 0; i < req->n_ssids; i++) { - if (nla_put(msg, i, req->ssids[i].ssid_len, req->ssids[i].ssid)) + for (i = 0; i < req->req.n_ssids; i++) { + if (nla_put(msg, i, req->req.ssids[i].ssid_len, + req->req.ssids[i].ssid)) goto nla_put_failure; } nla_nest_end(msg, nest); - if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) { + if (req->req.flags & NL80211_SCAN_FLAG_FREQ_KHZ) { nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ); if (!nest) goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { + for (i = 0; i < req->req.n_channels; i++) { if (nla_put_u32(msg, i, - 
ieee80211_channel_to_khz(req->channels[i]))) + ieee80211_channel_to_khz(req->req.channels[i]))) goto nla_put_failure; } nla_nest_end(msg, nest); @@ -17891,19 +18563,20 @@ static int nl80211_add_scan_req(struct sk_buff *msg, NL80211_ATTR_SCAN_FREQUENCIES); if (!nest) goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { - if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + for (i = 0; i < req->req.n_channels; i++) { + if (nla_put_u32(msg, i, + req->req.channels[i]->center_freq)) goto nla_put_failure; } nla_nest_end(msg, nest); } - if (req->ie && - nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie)) + if (req->req.ie && + nla_put(msg, NL80211_ATTR_IE, req->req.ie_len, req->req.ie)) goto nla_put_failure; - if (req->flags && - nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags)) + if (req->req.flags && + nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->req.flags)) goto nla_put_failure; info = rdev->int_scan_req ? &rdev->int_scan_req->info : @@ -19083,7 +19756,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, EXPORT_SYMBOL(cfg80211_conn_failed); static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, - const u8 *addr, gfp_t gfp) + const u8 *addr, int link_id, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -19106,7 +19779,9 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr)) + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || + (link_id >= 0 && + nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -19118,13 +19793,13 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, return true; } -bool cfg80211_rx_spurious_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp) +bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; bool ret; - trace_cfg80211_rx_spurious_frame(dev, addr); + trace_cfg80211_rx_spurious_frame(dev, addr, link_id); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO)) { @@ -19132,19 +19807,19 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev, return false; } ret = __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_FRAME, - addr, gfp); + addr, link_id, gfp); trace_cfg80211_return_bool(ret); return ret; } EXPORT_SYMBOL(cfg80211_rx_spurious_frame); -bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp) +bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; bool ret; - trace_cfg80211_rx_unexpected_4addr_frame(dev, addr); + trace_cfg80211_rx_unexpected_4addr_frame(dev, addr, link_id); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO && @@ -19154,7 +19829,7 @@ bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, } ret = __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_4ADDR_FRAME, - addr, gfp); + addr, link_id, gfp); trace_cfg80211_return_bool(ret); return ret; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 9f4783c2354c9..ac6884bacf3fa 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -456,15 +456,15 @@ 
rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, } static inline int rdev_scan(struct cfg80211_registered_device *rdev, - struct cfg80211_scan_request *request) + struct cfg80211_scan_request_int *request) { int ret; - if (WARN_ON_ONCE(!request->n_ssids && request->ssids)) + if (WARN_ON_ONCE(!request->req.n_ssids && request->req.ssids)) return -EINVAL; trace_rdev_scan(&rdev->wiphy, request); - ret = rdev->ops->scan(&rdev->wiphy, request); + ret = rdev->ops->scan(&rdev->wiphy, &request->req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -577,35 +577,40 @@ static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev, } static inline int -rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) +rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, int radio_idx, + u32 changed) { int ret = -EOPNOTSUPP; - trace_rdev_set_wiphy_params(&rdev->wiphy, changed); + trace_rdev_set_wiphy_params(&rdev->wiphy, radio_idx, changed); if (rdev->ops->set_wiphy_params) - ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); + ret = rdev->ops->set_wiphy_params(&rdev->wiphy, radio_idx, + changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm) + struct wireless_dev *wdev, int radio_idx, + enum nl80211_tx_power_setting type, + int mbm) { int ret; - trace_rdev_set_tx_power(&rdev->wiphy, wdev, type, mbm); - ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, type, mbm); + trace_rdev_set_tx_power(&rdev->wiphy, wdev, radio_idx, type, mbm); + ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, radio_idx, type, + mbm); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, unsigned int link_id, - int *dbm) + struct wireless_dev *wdev, int radio_idx, + unsigned int link_id, int *dbm) { int ret; - trace_rdev_get_tx_power(&rdev->wiphy, wdev, link_id); - ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, link_id, dbm); + trace_rdev_get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id); + ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id, + dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } @@ -857,21 +862,21 @@ rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, } static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev, - u32 tx_ant, u32 rx_ant) + int radio_idx, u32 tx_ant, u32 rx_ant) { int ret; - trace_rdev_set_antenna(&rdev->wiphy, tx_ant, rx_ant); - ret = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant); + trace_rdev_set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); + ret = rdev->ops->set_antenna(&rdev->wiphy, -1, tx_ant, rx_ant); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, - u32 *tx_ant, u32 *rx_ant) + int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret; - trace_rdev_get_antenna(&rdev->wiphy); - ret = rdev->ops->get_antenna(&rdev->wiphy, tx_ant, rx_ant); + trace_rdev_get_antenna(&rdev->wiphy, radio_idx); + ret = rdev->ops->get_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); if (ret) trace_rdev_return_int(&rdev->wiphy, ret); else diff --git a/net/wireless/reg.c b/net/wireless/reg.c index c1752b31734fa..3b0ac3437f819 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -53,7 +53,7 @@ #include 
<linux/list.h> #include <linux/ctype.h> #include <linux/nl80211.h> -#include <linux/platform_device.h> +#include <linux/device/faux.h> #include <linux/verification.h> #include <linux/moduleparam.h> #include <linux/firmware.h> @@ -105,7 +105,7 @@ static struct regulatory_request __rcu *last_request = (void __force __rcu *)&core_request_world; /* To trigger userspace events and load firmware */ -static struct platform_device *reg_pdev; +static struct faux_device *reg_fdev; /* * Central wireless core regulatory domains, we only need two, @@ -583,7 +583,7 @@ static int call_crda(const char *alpha2) else pr_debug("Calling CRDA to update world regulatory domain\n"); - ret = kobject_uevent_env(&reg_pdev->dev.kobj, KOBJ_CHANGE, env); + ret = kobject_uevent_env(&reg_fdev->dev.kobj, KOBJ_CHANGE, env); if (ret) return ret; @@ -779,7 +779,7 @@ static bool regdb_has_valid_signature(const u8 *data, unsigned int size) const struct firmware *sig; bool result; - if (request_firmware(&sig, "regulatory.db.p7s", &reg_pdev->dev)) + if (request_firmware(&sig, "regulatory.db.p7s", &reg_fdev->dev)) return false; result = verify_pkcs7_signature(data, size, sig->data, sig->size, @@ -1061,7 +1061,7 @@ static int query_regdb_file(const char *alpha2) return -ENOMEM; err = request_firmware_nowait(THIS_MODULE, true, "regulatory.db", - &reg_pdev->dev, GFP_KERNEL, + &reg_fdev->dev, GFP_KERNEL, (void *)alpha2, regdb_fw_cb); if (err) kfree(alpha2); @@ -1077,7 +1077,7 @@ int reg_reload_regdb(void) const struct ieee80211_regdomain *current_regdomain; struct regulatory_request *request; - err = request_firmware(&fw, "regulatory.db", &reg_pdev->dev); + err = request_firmware(&fw, "regulatory.db", &reg_fdev->dev); if (err) return err; @@ -4229,6 +4229,8 @@ static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev) struct wireless_dev *wdev; unsigned int link_id; + guard(wiphy)(&rdev->wiphy); + /* If we finished CAC or received radar, we should end any * CAC running on the same channels. * the check !cfg80211_chandef_dfs_usable contain 2 options: @@ -4300,12 +4302,12 @@ static int __init regulatory_init_db(void) * in that case, don't try to do any further work here as * it's doomed to lead to crashes. 
*/ - if (IS_ERR_OR_NULL(reg_pdev)) + if (!reg_fdev) return -EINVAL; err = load_builtin_regdb_keys(); if (err) { - platform_device_unregister(reg_pdev); + faux_device_destroy(reg_fdev); return err; } @@ -4313,7 +4315,7 @@ err = regulatory_hint_core(cfg80211_world_regdom->alpha2); if (err) { if (err == -ENOMEM) { - platform_device_unregister(reg_pdev); + faux_device_destroy(reg_fdev); return err; } /* @@ -4342,9 +4344,9 @@ late_initcall(regulatory_init_db); int __init regulatory_init(void) { - reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); - if (IS_ERR(reg_pdev)) - return PTR_ERR(reg_pdev); + reg_fdev = faux_device_create("regulatory", NULL, NULL); + if (!reg_fdev) + return -ENODEV; rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); @@ -4372,9 +4374,9 @@ void regulatory_exit(void) reset_regdomains(true, NULL); rtnl_unlock(); - dev_set_uevent_suppress(&reg_pdev->dev, true); + dev_set_uevent_suppress(&reg_fdev->dev, true); - platform_device_unregister(reg_pdev); + faux_device_destroy(reg_fdev); list_for_each_entry_safe(reg_beacon, btmp, &reg_pending_beacons, list) { list_del(&reg_beacon->list); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index ddd3a97f6609d..a8339ed52404c 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -782,9 +782,9 @@ cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, } EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_parse_colocated_ap); -static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, - struct ieee80211_channel *chan, - bool add_to_6ghz) +static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, + struct ieee80211_channel *chan, + bool add_to_6ghz) { int i; u32 n_channels = request->n_channels; @@ -838,30 +838,32 @@ static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, return false; } -static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) +static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev, + bool first_part) { u8 i; struct cfg80211_colocated_ap *ap; int n_channels, count = 0, err; - struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req; + struct cfg80211_scan_request_int *request, *rdev_req = rdev->scan_req; LIST_HEAD(coloc_ap_list); bool need_scan_psc = true; const struct ieee80211_sband_iftype_data *iftd; size_t size, offs_ssids, offs_6ghz_params, offs_ies; - rdev_req->scan_6ghz = true; + rdev_req->req.scan_6ghz = true; + rdev_req->req.first_part = first_part; if (!rdev->wiphy.bands[NL80211_BAND_6GHZ]) return -EOPNOTSUPP; iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ], - rdev_req->wdev->iftype); + rdev_req->req.wdev->iftype); if (!iftd || !iftd->he_cap.has_he) return -EOPNOTSUPP; n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels; - if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { + if (rdev_req->req.flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { struct cfg80211_internal_bss *intbss; spin_lock_bh(&rdev->bss_lock); @@ -883,8 +885,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) * This is relevant for ML probe requests when the lower * band APs have not been discovered. 
*/ - if (is_broadcast_ether_addr(rdev_req->bssid) || - !ether_addr_equal(rdev_req->bssid, res->bssid) || + if (is_broadcast_ether_addr(rdev_req->req.bssid) || + !ether_addr_equal(rdev_req->req.bssid, res->bssid) || res->channel->band != NL80211_BAND_6GHZ) continue; @@ -911,13 +913,13 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) spin_unlock_bh(&rdev->bss_lock); } - size = struct_size(request, channels, n_channels); + size = struct_size(request, req.channels, n_channels); offs_ssids = size; - size += sizeof(*request->ssids) * rdev_req->n_ssids; + size += sizeof(*request->req.ssids) * rdev_req->req.n_ssids; offs_6ghz_params = size; - size += sizeof(*request->scan_6ghz_params) * count; + size += sizeof(*request->req.scan_6ghz_params) * count; offs_ies = size; - size += rdev_req->ie_len; + size += rdev_req->req.ie_len; request = kzalloc(size, GFP_KERNEL); if (!request) { @@ -926,26 +928,26 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) } *request = *rdev_req; - request->n_channels = 0; - request->n_6ghz_params = 0; - if (rdev_req->n_ssids) { + request->req.n_channels = 0; + request->req.n_6ghz_params = 0; + if (rdev_req->req.n_ssids) { /* * Add the ssids from the parent scan request to the new * scan request, so the driver would be able to use them * in its probe requests to discover hidden APs on PSC * channels. */ - request->ssids = (void *)request + offs_ssids; - memcpy(request->ssids, rdev_req->ssids, - sizeof(*request->ssids) * request->n_ssids); + request->req.ssids = (void *)request + offs_ssids; + memcpy(request->req.ssids, rdev_req->req.ssids, + sizeof(*request->req.ssids) * request->req.n_ssids); } - request->scan_6ghz_params = (void *)request + offs_6ghz_params; + request->req.scan_6ghz_params = (void *)request + offs_6ghz_params; - if (rdev_req->ie_len) { + if (rdev_req->req.ie_len) { void *ie = (void *)request + offs_ies; - memcpy(ie, rdev_req->ie, rdev_req->ie_len); - request->ie = ie; + memcpy(ie, rdev_req->req.ie, rdev_req->req.ie_len); + request->req.ie = ie; } /* @@ -953,10 +955,12 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) * and at least one of the reported co-located APs with same SSID * indicating that all APs in the same ESS are co-located */ - if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) { + if (count && + request->req.n_ssids == 1 && + request->req.ssids[0].ssid_len) { list_for_each_entry(ap, &coloc_ap_list, list) { if (ap->colocated_ess && - cfg80211_find_ssid_match(ap, request)) { + cfg80211_find_ssid_match(ap, &request->req)) { need_scan_psc = false; break; } @@ -968,51 +972,52 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) * regardless of the collocated APs (PSC channels or all channels * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set) */ - for (i = 0; i < rdev_req->n_channels; i++) { - if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ && + for (i = 0; i < rdev_req->req.n_channels; i++) { + if (rdev_req->req.channels[i]->band == NL80211_BAND_6GHZ && ((need_scan_psc && - cfg80211_channel_is_psc(rdev_req->channels[i])) || - !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { - cfg80211_scan_req_add_chan(request, - rdev_req->channels[i], + cfg80211_channel_is_psc(rdev_req->req.channels[i])) || + !(rdev_req->req.flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { + cfg80211_scan_req_add_chan(&request->req, + rdev_req->req.channels[i], false); } } - if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) + if 
(!(rdev_req->req.flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) goto skip; list_for_each_entry(ap, &coloc_ap_list, list) { bool found = false; struct cfg80211_scan_6ghz_params *scan_6ghz_params = - &request->scan_6ghz_params[request->n_6ghz_params]; + &request->req.scan_6ghz_params[request->req.n_6ghz_params]; struct ieee80211_channel *chan = ieee80211_get_channel(&rdev->wiphy, ap->center_freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED || - !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan)) + !cfg80211_wdev_channel_allowed(rdev_req->req.wdev, chan)) continue; - for (i = 0; i < rdev_req->n_channels; i++) { - if (rdev_req->channels[i] == chan) + for (i = 0; i < rdev_req->req.n_channels; i++) { + if (rdev_req->req.channels[i] == chan) found = true; } if (!found) continue; - if (request->n_ssids > 0 && - !cfg80211_find_ssid_match(ap, request)) + if (request->req.n_ssids > 0 && + !cfg80211_find_ssid_match(ap, &request->req)) continue; - if (!is_broadcast_ether_addr(request->bssid) && - !ether_addr_equal(request->bssid, ap->bssid)) + if (!is_broadcast_ether_addr(request->req.bssid) && + !ether_addr_equal(request->req.bssid, ap->bssid)) continue; - if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid) + if (!request->req.n_ssids && ap->multi_bss && + !ap->transmitted_bssid) continue; - cfg80211_scan_req_add_chan(request, chan, true); + cfg80211_scan_req_add_chan(&request->req, chan, true); memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN); scan_6ghz_params->short_ssid = ap->short_ssid; scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid; @@ -1028,14 +1033,14 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) if (cfg80211_channel_is_psc(chan) && !need_scan_psc) scan_6ghz_params->psc_no_listen = true; - request->n_6ghz_params++; + request->req.n_6ghz_params++; } skip: cfg80211_free_coloc_ap_list(&coloc_ap_list); - if (request->n_channels) { - struct cfg80211_scan_request *old = rdev->int_scan_req; + if (request->req.n_channels) { + struct cfg80211_scan_request_int *old = rdev->int_scan_req; rdev->int_scan_req = request; @@ -1043,7 +1048,7 @@ skip: * If this scan follows a previous scan, save the scan start * info from the first part of the scan */ - if (old) + if (!first_part && !WARN_ON(!old)) rdev->int_scan_req->info = old->info; err = rdev_scan(rdev, request); @@ -1063,35 +1068,39 @@ skip: int cfg80211_scan(struct cfg80211_registered_device *rdev) { - struct cfg80211_scan_request *request; - struct cfg80211_scan_request *rdev_req = rdev->scan_req; + struct cfg80211_scan_request_int *request; + struct cfg80211_scan_request_int *rdev_req = rdev->scan_req; u32 n_channels = 0, idx, i; - if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) + if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) { + rdev_req->req.first_part = true; return rdev_scan(rdev, rdev_req); + } - for (i = 0; i < rdev_req->n_channels; i++) { - if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) + for (i = 0; i < rdev_req->req.n_channels; i++) { + if (rdev_req->req.channels[i]->band != NL80211_BAND_6GHZ) n_channels++; } if (!n_channels) - return cfg80211_scan_6ghz(rdev); + return cfg80211_scan_6ghz(rdev, true); - request = kzalloc(struct_size(request, channels, n_channels), + request = kzalloc(struct_size(request, req.channels, n_channels), GFP_KERNEL); if (!request) return -ENOMEM; *request = *rdev_req; - request->n_channels = n_channels; + request->req.n_channels = n_channels; - for (i = idx = 0; i < rdev_req->n_channels; i++) { - if (rdev_req->channels[i]->band != 
NL80211_BAND_6GHZ) - request->channels[idx++] = rdev_req->channels[i]; + for (i = idx = 0; i < rdev_req->req.n_channels; i++) { + if (rdev_req->req.channels[i]->band != NL80211_BAND_6GHZ) + request->req.channels[idx++] = + rdev_req->req.channels[i]; } - rdev_req->scan_6ghz = false; + rdev_req->req.scan_6ghz = false; + rdev_req->req.first_part = true; rdev->int_scan_req = request; return rdev_scan(rdev, request); } @@ -1099,7 +1108,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev) void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message) { - struct cfg80211_scan_request *request, *rdev_req; + struct cfg80211_scan_request_int *request, *rdev_req; struct wireless_dev *wdev; struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT @@ -1118,13 +1127,13 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (!rdev_req) return; - wdev = rdev_req->wdev; + wdev = rdev_req->req.wdev; request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req; if (wdev_running(wdev) && (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) && - !rdev_req->scan_6ghz && !request->info.aborted && - !cfg80211_scan_6ghz(rdev)) + !rdev_req->req.scan_6ghz && !request->info.aborted && + !cfg80211_scan_6ghz(rdev, false)) return; /* @@ -1136,10 +1145,10 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, cfg80211_sme_scan_done(wdev->netdev); if (!request->info.aborted && - request->flags & NL80211_SCAN_FLAG_FLUSH) { + request->req.flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); - __cfg80211_bss_expire(rdev, request->scan_start); + __cfg80211_bss_expire(rdev, request->req.scan_start); spin_unlock_bh(&rdev->bss_lock); } @@ -1175,13 +1184,16 @@ void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk) void cfg80211_scan_done(struct cfg80211_scan_request *request, struct cfg80211_scan_info *info) { - struct cfg80211_scan_info old_info = request->info; + struct cfg80211_scan_request_int *intreq = + container_of(request, struct cfg80211_scan_request_int, req); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(request->wiphy); + struct cfg80211_scan_info old_info = intreq->info; - trace_cfg80211_scan_done(request, info); - WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req && - request != wiphy_to_rdev(request->wiphy)->int_scan_req); + trace_cfg80211_scan_done(intreq, info); + WARN_ON(intreq != rdev->scan_req && + intreq != rdev->int_scan_req); - request->info = *info; + intreq->info = *info; /* * In case the scan is split, the scan_start_tsf and tsf_bssid should @@ -1189,14 +1201,13 @@ void cfg80211_scan_done(struct cfg80211_scan_request *request, * be non zero. 
*/ if (request->scan_6ghz && old_info.scan_start_tsf) { - request->info.scan_start_tsf = old_info.scan_start_tsf; - memcpy(request->info.tsf_bssid, old_info.tsf_bssid, - sizeof(request->info.tsf_bssid)); + intreq->info.scan_start_tsf = old_info.scan_start_tsf; + memcpy(intreq->info.tsf_bssid, old_info.tsf_bssid, + sizeof(intreq->info.tsf_bssid)); } - request->notified = true; - wiphy_work_queue(request->wiphy, - &wiphy_to_rdev(request->wiphy)->scan_done_wk); + intreq->notified = true; + wiphy_work_queue(request->wiphy, &rdev->scan_done_wk); } EXPORT_SYMBOL(cfg80211_scan_done); @@ -2220,6 +2231,7 @@ cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len) return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: return IEEE80211_REG_SP_AP; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return IEEE80211_REG_VLP_AP; @@ -3250,6 +3262,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, const u8 *ie; size_t ielen; u64 tsf; + size_t s1g_optional_len; if (WARN_ON(!mgmt)) return NULL; @@ -3264,12 +3277,11 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_short_beacon.variable); - else - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_beacon.variable); + s1g_optional_len = + ieee80211_s1g_optional_len(ext->frame_control); + min_hdr_len = + offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + + s1g_optional_len; } else { /* same for beacons */ min_hdr_len = offsetof(struct ieee80211_mgmt, @@ -3285,11 +3297,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, const struct ieee80211_s1g_bcn_compat_ie *compat; const struct element *elem; - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - ie = ext->u.s1g_short_beacon.variable; - else - ie = ext->u.s1g_beacon.variable; - + ie = ext->u.s1g_beacon.variable + s1g_optional_len; elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, ie, ielen); if (!elem) return NULL; @@ -3500,7 +3508,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, struct cfg80211_registered_device *rdev; struct wiphy *wiphy; struct iw_scan_req *wreq = NULL; - struct cfg80211_scan_request *creq; + struct cfg80211_scan_request_int *creq; int i, err, n_channels = 0; enum nl80211_band band; @@ -3530,19 +3538,20 @@ int cfg80211_wext_siwscan(struct net_device *dev, n_channels = ieee80211_get_num_supported_channels(wiphy); } - creq = kzalloc(struct_size(creq, channels, n_channels) + + creq = kzalloc(struct_size(creq, req.channels, n_channels) + sizeof(struct cfg80211_ssid), GFP_ATOMIC); if (!creq) return -ENOMEM; - creq->wiphy = wiphy; - creq->wdev = dev->ieee80211_ptr; + creq->req.wiphy = wiphy; + creq->req.wdev = dev->ieee80211_ptr; /* SSIDs come after channels */ - creq->ssids = (void *)creq + struct_size(creq, channels, n_channels); - creq->n_channels = n_channels; - creq->n_ssids = 1; - creq->scan_start = jiffies; + creq->req.ssids = (void *)creq + + struct_size(creq, req.channels, n_channels); + creq->req.n_channels = n_channels; + creq->req.n_ssids = 1; + creq->req.scan_start = jiffies; /* translate "Scan on frequencies" request */ i = 0; @@ -3558,7 +3567,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, /* ignore disabled channels */ chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || - !cfg80211_wdev_channel_allowed(creq->wdev, chan)) + 
!cfg80211_wdev_channel_allowed(creq->req.wdev, chan)) continue; /* If we have a wireless request structure and the @@ -3581,7 +3590,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, } wext_freq_found: - creq->channels[i] = &wiphy->bands[band]->channels[j]; + creq->req.channels[i] = + &wiphy->bands[band]->channels[j]; i++; wext_freq_not_found: ; } @@ -3592,28 +3602,30 @@ int cfg80211_wext_siwscan(struct net_device *dev, goto out; } - /* Set real number of channels specified in creq->channels[] */ - creq->n_channels = i; + /* Set real number of channels specified in creq->req.channels[] */ + creq->req.n_channels = i; /* translate "Scan for SSID" request */ if (wreq) { if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) return -EINVAL; - memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); - creq->ssids[0].ssid_len = wreq->essid_len; + memcpy(creq->req.ssids[0].ssid, wreq->essid, + wreq->essid_len); + creq->req.ssids[0].ssid_len = wreq->essid_len; } if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) { - creq->ssids = NULL; - creq->n_ssids = 0; + creq->req.ssids = NULL; + creq->req.n_ssids = 0; } } for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) - creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; + creq->req.rates[i] = + (1 << wiphy->bands[i]->n_bitrates) - 1; - eth_broadcast_addr(creq->bssid); + eth_broadcast_addr(creq->req.bssid); scoped_guard(wiphy, &rdev->wiphy) { rdev->scan_req = creq; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index cf998500a9654..826ec0a6355f1 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2009, 2020, 2022-2024 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020, 2022-2025 Intel Corporation. All rights reserved. 
* Copyright 2017 Intel Deutschland GmbH */ @@ -64,7 +64,7 @@ static void cfg80211_sme_free(struct wireless_dev *wdev) static int cfg80211_conn_scan(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - struct cfg80211_scan_request *request; + struct cfg80211_scan_request_int *request; int n_channels, err; lockdep_assert_wiphy(wdev->wiphy); @@ -77,13 +77,12 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) else n_channels = ieee80211_get_num_supported_channels(wdev->wiphy); - request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) + - sizeof(request->channels[0]) * n_channels, + request = kzalloc(sizeof(*request) + sizeof(request->req.ssids[0]) + + sizeof(request->req.channels[0]) * n_channels, GFP_KERNEL); if (!request) return -ENOMEM; - request->n_channels = n_channels; if (wdev->conn->params.channel) { enum nl80211_band band = wdev->conn->params.channel->band; struct ieee80211_supported_band *sband = @@ -93,8 +92,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) kfree(request); return -EINVAL; } - request->channels[0] = wdev->conn->params.channel; - request->rates[band] = (1 << sband->n_bitrates) - 1; + request->req.channels[0] = wdev->conn->params.channel; + request->req.rates[band] = (1 << sband->n_bitrates) - 1; } else { int i = 0, j; enum nl80211_band band; @@ -109,26 +108,26 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) channel = &bands->channels[j]; if (channel->flags & IEEE80211_CHAN_DISABLED) continue; - request->channels[i++] = channel; + request->req.channels[i++] = channel; } - request->rates[band] = (1 << bands->n_bitrates) - 1; + request->req.rates[band] = (1 << bands->n_bitrates) - 1; } n_channels = i; } - request->n_channels = n_channels; - request->ssids = (void *)request + - struct_size(request, channels, n_channels); - request->n_ssids = 1; + request->req.n_channels = n_channels; + request->req.ssids = (void *)request + + struct_size(request, req.channels, n_channels); + request->req.n_ssids = 1; - memcpy(request->ssids[0].ssid, wdev->conn->params.ssid, - wdev->conn->params.ssid_len); - request->ssids[0].ssid_len = wdev->conn->params.ssid_len; + memcpy(request->req.ssids[0].ssid, wdev->conn->params.ssid, + wdev->conn->params.ssid_len); + request->req.ssids[0].ssid_len = wdev->conn->params.ssid_len; - eth_broadcast_addr(request->bssid); + eth_broadcast_addr(request->req.bssid); - request->wdev = wdev; - request->wiphy = &rdev->wiphy; - request->scan_start = jiffies; + request->req.wdev = wdev; + request->req.wiphy = &rdev->wiphy; + request->req.scan_start = jiffies; rdev->scan_req = request; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 4ed9fada4ec0e..34c584a215e5d 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -373,7 +373,8 @@ TRACE_EVENT(rdev_return_int, ); TRACE_EVENT(rdev_scan, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_scan_request *request), + TP_PROTO(struct wiphy *wiphy, + struct cfg80211_scan_request_int *request), TP_ARGS(wiphy, request), TP_STRUCT__entry( WIPHY_ENTRY @@ -406,9 +407,19 @@ DEFINE_EVENT(wiphy_only_evt, rdev_return_void, TP_ARGS(wiphy) ); -DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna, - TP_PROTO(struct wiphy *wiphy), - TP_ARGS(wiphy) +TRACE_EVENT(rdev_get_antenna, + TP_PROTO(struct wiphy *wiphy, int radio_idx), + TP_ARGS(wiphy, radio_idx), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(int, radio_idx) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; + ), + TP_printk(WIPHY_PR_FMT ", radio_idx: %d", + 
WIPHY_PR_ARG, __entry->radio_idx) ); DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll, @@ -1678,18 +1689,20 @@ TRACE_EVENT(rdev_join_ocb, ); TRACE_EVENT(rdev_set_wiphy_params, - TP_PROTO(struct wiphy *wiphy, u32 changed), - TP_ARGS(wiphy, changed), + TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 changed), + TP_ARGS(wiphy, radio_idx, changed), TP_STRUCT__entry( WIPHY_ENTRY + __field(int, radio_idx) __field(u32, changed) ), TP_fast_assign( WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; __entry->changed = changed; ), - TP_printk(WIPHY_PR_FMT ", changed: %u", - WIPHY_PR_ARG, __entry->changed) + TP_printk(WIPHY_PR_FMT ", radio_idx: %d, changed: %u", + WIPHY_PR_ARG, __entry->radio_idx, __entry->changed) ); DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, @@ -1710,30 +1723,51 @@ DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) ); -DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_tx_power, +TRACE_EVENT(rdev_get_tx_power, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id), - TP_ARGS(wiphy, wdev, link_id) + int radio_idx, unsigned int link_id), + TP_ARGS(wiphy, wdev, radio_idx, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(int, radio_idx) + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", radio_idx: %d, link_id: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->radio_idx, __entry->link_id) ); TRACE_EVENT(rdev_set_tx_power, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm), - TP_ARGS(wiphy, wdev, type, mbm), + int radio_idx, enum nl80211_tx_power_setting type, + int mbm), + TP_ARGS(wiphy, wdev, radio_idx, type, mbm), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY + __field(int, radio_idx) __field(enum nl80211_tx_power_setting, type) __field(int, mbm) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; + __entry->radio_idx = radio_idx; __entry->type = type; __entry->mbm = mbm; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type: %u, mbm: %d", - WIPHY_PR_ARG, WDEV_PR_ARG,__entry->type, __entry->mbm) + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", radio_idx: %d, type: %u, mbm: %d", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->radio_idx, __entry->type, __entry->mbm) ); TRACE_EVENT(rdev_return_int_int, @@ -1866,26 +1900,24 @@ TRACE_EVENT(rdev_return_void_tx_rx, __entry->rx_max) ); -DECLARE_EVENT_CLASS(tx_rx_evt, - TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, tx, rx), +TRACE_EVENT(rdev_set_antenna, + TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 tx, u32 rx), + TP_ARGS(wiphy, radio_idx, tx, rx), TP_STRUCT__entry( WIPHY_ENTRY + __field(int, radio_idx) __field(u32, tx) __field(u32, rx) ), TP_fast_assign( WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; __entry->tx = tx; __entry->rx = rx; ), - TP_printk(WIPHY_PR_FMT ", tx: %u, rx: %u ", - WIPHY_PR_ARG, __entry->tx, __entry->rx) -); - -DEFINE_EVENT(tx_rx_evt, rdev_set_antenna, - TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, tx, rx) + TP_printk(WIPHY_PR_FMT ", radio_idx: %d, tx: %u, rx: %u ", + WIPHY_PR_ARG, __entry->radio_idx, + __entry->tx, __entry->rx) ); DECLARE_EVENT_CLASS(wiphy_netdev_id_evt, @@ -3538,27 +3570,30 @@ TRACE_EVENT(cfg80211_cac_event, ); DECLARE_EVENT_CLASS(cfg80211_rx_evt, - TP_PROTO(struct net_device *netdev, const u8 *addr), - TP_ARGS(netdev, addr), + TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), + TP_ARGS(netdev, addr, 
link_id), TP_STRUCT__entry( NETDEV_ENTRY MAC_ENTRY(addr) + __field(int, link_id) ), TP_fast_assign( NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); + __entry->link_id = link_id; ), - TP_printk(NETDEV_PR_FMT ", %pM", NETDEV_PR_ARG, __entry->addr) + TP_printk(NETDEV_PR_FMT ", %pM, link_id:%d", NETDEV_PR_ARG, + __entry->addr, __entry->link_id) ); DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame, - TP_PROTO(struct net_device *netdev, const u8 *addr), - TP_ARGS(netdev, addr) + TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), + TP_ARGS(netdev, addr, link_id) ); DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame, - TP_PROTO(struct net_device *netdev, const u8 *addr), - TP_ARGS(netdev, addr) + TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), + TP_ARGS(netdev, addr, link_id) ); TRACE_EVENT(cfg80211_ibss_joined, @@ -3685,12 +3720,12 @@ TRACE_EVENT(cfg80211_tdls_oper_request, ); TRACE_EVENT(cfg80211_scan_done, - TP_PROTO(struct cfg80211_scan_request *request, + TP_PROTO(struct cfg80211_scan_request_int *request, struct cfg80211_scan_info *info), TP_ARGS(request, info), TP_STRUCT__entry( __field(u32, n_channels) - __dynamic_array(u8, ie, request ? request->ie_len : 0) + __dynamic_array(u8, ie, request ? request->req.ie_len : 0) __array(u32, rates, NUM_NL80211_BANDS) __field(u32, wdev_id) MAC_ENTRY(wiphy_mac) @@ -3701,16 +3736,16 @@ TRACE_EVENT(cfg80211_scan_done, ), TP_fast_assign( if (request) { - memcpy(__get_dynamic_array(ie), request->ie, - request->ie_len); - memcpy(__entry->rates, request->rates, + memcpy(__get_dynamic_array(ie), request->req.ie, + request->req.ie_len); + memcpy(__entry->rates, request->req.rates, NUM_NL80211_BANDS); - __entry->wdev_id = request->wdev ? - request->wdev->identifier : 0; - if (request->wiphy) + __entry->wdev_id = request->req.wdev ? + request->req.wdev->identifier : 0; + if (request->req.wiphy) MAC_ASSIGN(wiphy_mac, - request->wiphy->perm_addr); - __entry->no_cck = request->no_cck; + request->req.wiphy->perm_addr); + __entry->no_cck = request->req.no_cck; } if (info) { __entry->aborted = info->aborted; @@ -4126,20 +4161,22 @@ TRACE_EVENT(cfg80211_links_removed, TRACE_EVENT(cfg80211_mlo_reconf_add_done, TP_PROTO(struct net_device *netdev, u16 link_mask, - const u8 *buf, size_t len), - TP_ARGS(netdev, link_mask, buf, len), + const u8 *buf, size_t len, bool driver_initiated), + TP_ARGS(netdev, link_mask, buf, len, driver_initiated), TP_STRUCT__entry( NETDEV_ENTRY __field(u16, link_mask) __dynamic_array(u8, buf, len) + __field(bool, driver_initiated) ), TP_fast_assign( NETDEV_ASSIGN; __entry->link_mask = link_mask; memcpy(__get_dynamic_array(buf), buf, len); + __entry->driver_initiated = driver_initiated; ), - TP_printk(NETDEV_PR_FMT ", link_mask:0x%x", - NETDEV_PR_ARG, __entry->link_mask) + TP_printk(NETDEV_PR_FMT ", link_mask:0x%x, driver_initiated:%d", + NETDEV_PR_ARG, __entry->link_mask, __entry->driver_initiated) ); TRACE_EVENT(rdev_assoc_ml_reconf, diff --git a/net/wireless/util.c b/net/wireless/util.c index ed868c0f7ca8e..240c68baa3d1f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -820,6 +820,52 @@ bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) } EXPORT_SYMBOL(ieee80211_is_valid_amsdu); + +/* + * Detects if an MSDU frame was maliciously converted into an A-MSDU + * frame by an adversary. This is done by parsing the received frame + * as if it were a regular MSDU, even though the A-MSDU flag is set. 
+ * + * For non-mesh interfaces, detection involves checking whether the + * payload, when interpreted as an MSDU, begins with a valid RFC1042 + * header. This is done by comparing the A-MSDU subheader's destination + * address to the start of the RFC1042 header. + * + * For mesh interfaces, the MSDU includes a 6-byte Mesh Control field + * and an optional variable-length Mesh Address Extension field before + * the RFC1042 header. The position of the RFC1042 header must therefore + * be calculated based on the mesh header length. + * + * Since this function intentionally parses an A-MSDU frame as an MSDU, + * it only assumes that the A-MSDU subframe header is present, and + * beyond this it performs its own bounds checks under the assumption + * that the frame is instead parsed as a non-aggregated MSDU. + */ +static bool +is_amsdu_aggregation_attack(struct ethhdr *eth, struct sk_buff *skb, + enum nl80211_iftype iftype) +{ + int offset; + + /* Non-mesh case can be directly compared */ + if (iftype != NL80211_IFTYPE_MESH_POINT) + return ether_addr_equal(eth->h_dest, rfc1042_header); + + offset = __ieee80211_get_mesh_hdrlen(eth->h_dest[0]); + if (offset == 6) { + /* Mesh case with empty address extension field */ + return ether_addr_equal(eth->h_source, rfc1042_header); + } else if (offset + ETH_ALEN <= skb->len) { + /* Mesh case with non-empty address extension field */ + u8 temp[ETH_ALEN]; + + skb_copy_bits(skb, offset, temp, ETH_ALEN); + return ether_addr_equal(temp, rfc1042_header); + } + + return false; +} + void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, @@ -861,8 +907,10 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, /* the last MSDU has no padding */ if (subframe_len > remaining) goto purge; - /* mitigate A-MSDU aggregation injection attacks */ - if (ether_addr_equal(hdr.eth.h_dest, rfc1042_header)) + /* mitigate A-MSDU aggregation injection attacks, to be + * checked when processing first subframe (offset == 0). 
+ */ + if (offset == 0 && is_amsdu_aggregation_attack(&hdr.eth, skb, iftype)) goto purge; offset += sizeof(struct ethhdr); @@ -2516,6 +2564,30 @@ int cfg80211_check_combinations(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_check_combinations); +int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, + const struct ieee80211_channel *chan) +{ + const struct wiphy_radio *radio; + int i, j; + u32 freq; + + if (!chan) + return -EINVAL; + + freq = ieee80211_channel_to_khz(chan); + for (i = 0; i < wiphy->n_radio; i++) { + radio = &wiphy->radio[i]; + for (j = 0; j < radio->n_freq_range; j++) { + if (freq >= radio->freq_range[j].start_freq && + freq < radio->freq_range[j].end_freq) + return i; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan); + int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask) @@ -2626,6 +2698,18 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, return false; } +int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo, + gfp_t gfp) +{ + link_sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, + sizeof(*link_sinfo->pertid), gfp); + if (!link_sinfo->pertid) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(cfg80211_link_sinfo_alloc_tid_stats); + int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index a74b1afc594e5..1241fda78a68c 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -263,7 +263,7 @@ int cfg80211_wext_siwrts(struct net_device *dev, else wdev->wiphy->rts_threshold = rts->value; - err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); + err = rdev_set_wiphy_params(rdev, -1, WIPHY_PARAM_RTS_THRESHOLD); if (err) wdev->wiphy->rts_threshold = orts; return err; @@ -304,7 +304,7 @@ int cfg80211_wext_siwfrag(struct net_device *dev, wdev->wiphy->frag_threshold = frag->value & ~0x1; } - err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); + err = rdev_set_wiphy_params(rdev, -1, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; return err; @@ -355,7 +355,7 @@ static int cfg80211_wext_siwretry(struct net_device *dev, changed |= WIPHY_PARAM_RETRY_SHORT; } - err = rdev_set_wiphy_params(rdev, changed); + err = rdev_set_wiphy_params(rdev, -1, changed); if (err) { wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; @@ -890,7 +890,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, guard(wiphy)(&rdev->wiphy); - return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); + return rdev_set_tx_power(rdev, wdev, -1, type, DBM_TO_MBM(dbm)); } static int cfg80211_wext_giwtxpower(struct net_device *dev, @@ -910,7 +910,7 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, return -EOPNOTSUPP; scoped_guard(wiphy, &rdev->wiphy) { - err = rdev_get_tx_power(rdev, wdev, 0, &val); + err = rdev_get_tx_power(rdev, wdev, -1, 0, &val); } if (err) return err; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index bea70eb6f0345..c32a7c6903d53 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -431,7 +431,7 @@ static struct nlmsghdr *rtnetlink_ifinfo_prep(struct net_device *dev, r->__ifi_pad = 0; r->ifi_type = dev->type; r->ifi_index = dev->ifindex; - r->ifi_flags = dev_get_flags(dev); + r->ifi_flags = netif_get_flags(dev); r->ifi_change = 0; /* Wireless changes don't affect those flags 
*/ if (nla_put_string(skb, IFLA_IFNAME, dev->name)) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 8dda4178497c9..655d1e0ae25f7 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -359,7 +359,7 @@ static void __x25_destroy_socket(struct sock *); */ static void x25_destroy_timer(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); x25_destroy_socket_from_timer(sk); } @@ -891,7 +891,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, if (sk->sk_state != TCP_LISTEN) goto out2; - rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); + rc = x25_wait_for_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 748d8630ab58b..fb8ac1aa58269 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -170,28 +170,6 @@ void x25_establish_link(struct x25_neigh *nb) dev_queue_xmit(skb); } -void x25_terminate_link(struct x25_neigh *nb) -{ - struct sk_buff *skb; - unsigned char *ptr; - - if (nb->dev->type != ARPHRD_X25) - return; - - skb = alloc_skb(1, GFP_ATOMIC); - if (!skb) { - pr_err("x25_dev: out of memory\n"); - return; - } - - ptr = skb_put(skb, 1); - *ptr = X25_IFACE_DISCONNECT; - - skb->protocol = htons(ETH_P_X25); - skb->dev = nb->dev; - dev_queue_xmit(skb); -} - void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb) { unsigned char *dptr; diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 37b190499405e..4608aa5b4f318 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -46,7 +46,7 @@ static inline void x25_start_t20timer(struct x25_neigh *nb) static void x25_t20timer_expiry(struct timer_list *t) { - struct x25_neigh *nb = from_timer(nb, t, t20timer); + struct x25_neigh *nb = timer_container_of(nb, t, t20timer); x25_transmit_restart_request(nb); diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index e4c5ad5b070f5..2ec63a1f4c6d4 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -89,7 +89,7 @@ unsigned long x25_display_timer(struct sock *sk) static void x25_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sock *sk = timer_container_of(sk, t, sk_timer); bh_lock_sock(sk); if (sock_owned_by_user(sk)) /* can currently only occur in state 3 */ @@ -156,7 +156,7 @@ static inline void x25_do_timer_expiry(struct sock * sk) static void x25_timer_expiry(struct timer_list *t) { - struct x25_sock *x25 = from_timer(x25, t, timer); + struct x25_sock *x25 = timer_container_of(x25, t, timer); struct sock *sk = &x25->sk; bh_lock_sock(sk); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 72c000c0ae5f5..9c3acecc14b1a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -34,7 +34,7 @@ #include "xsk.h" #define TX_BATCH_SIZE 32 -#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) +#define MAX_PER_SOCKET_BUDGET 32 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { @@ -300,6 +300,13 @@ static bool xsk_tx_writeable(struct xdp_sock *xs) return true; } +static void __xsk_tx_release(struct xdp_sock *xs) +{ + __xskq_cons_release(xs->tx); + if (xsk_tx_writeable(xs)) + xs->sk.sk_write_space(&xs->sk); +} + static bool xsk_is_bound(struct xdp_sock *xs) { if (READ_ONCE(xs->state) == XSK_BOUND) { @@ -407,11 +414,8 @@ void xsk_tx_release(struct xsk_buff_pool *pool) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { - __xskq_cons_release(xs->tx); - if (xsk_tx_writeable(xs)) - 
xs->sk.sk_write_space(&xs->sk); - } + list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) + __xsk_tx_release(xs); rcu_read_unlock(); } EXPORT_SYMBOL(xsk_tx_release); @@ -779,10 +783,10 @@ free_err: static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); - u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + u32 max_batch; int err = 0; mutex_lock(&xs->mutex); @@ -796,6 +800,7 @@ static int __xsk_generic_xmit(struct sock *sk) if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; + max_batch = READ_ONCE(xs->max_tx_budget); while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { if (max_batch-- == 0) { err = -EAGAIN; @@ -858,8 +863,7 @@ static int __xsk_generic_xmit(struct sock *sk) out: if (sent_frame) - if (xsk_tx_writeable(xs)) - sk->sk_write_space(sk); + __xsk_tx_release(xs); mutex_unlock(&xs->mutex); return err; @@ -1437,6 +1441,21 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return err; } + case XDP_MAX_TX_SKB_BUDGET: + { + unsigned int budget; + + if (optlen != sizeof(budget)) + return -EINVAL; + if (copy_from_sockptr(&budget, optval, sizeof(budget))) + return -EFAULT; + if (!xs->tx || + budget < TX_BATCH_SIZE || budget > xs->tx->nentries) + return -EACCES; + + WRITE_ONCE(xs->max_tx_budget, budget); + return 0; + } default: break; } @@ -1734,6 +1753,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); xs->state = XSK_READY; + xs->max_tx_budget = TX_BATCH_SIZE; mutex_init(&xs->mutex); INIT_LIST_HEAD(&xs->map_list); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index c5181a9044add..aa9788f20d0db 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -267,13 +267,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, void xp_clear_dev(struct xsk_buff_pool *pool) { + struct net_device *netdev = pool->netdev; + if (!pool->netdev) return; + netdev_lock_ops(netdev); xp_disable_drv_zc(pool); xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); - dev_put(pool->netdev); pool->netdev = NULL; + netdev_unlock_ops(netdev); + dev_put(netdev); } static void xp_release_deferred(struct work_struct *work) diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 09dcea0cbbed9..0e0bca031c039 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -119,7 +119,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, if ((req->xdiag_show & XDP_SHOW_INFO) && nla_put_u32(nlskb, XDP_DIAG_UID, - from_kuid_munged(user_ns, sock_i_uid(sk)))) + from_kuid_munged(user_ns, sk_uid(sk)))) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_RING_CFG) && diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index d62f76161d83e..d2819baea414f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -145,10 +145,6 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return NULL; } - /* This skb was already validated on the upper/virtual dev */ - if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) - return skb; - local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); @@ -159,8 +155,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || - unlikely(xmit_xfrm_check_overflow(skb)))) { + if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) { struct sk_buff *segs; /* 
Packet got rerouted, fixup features and segment it. */ @@ -256,6 +251,11 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } + if (xuo->flags & XFRM_OFFLOAD_INBOUND && x->if_id) { + NL_SET_ERR_MSG(extack, "XFRM if_id is not supported in RX path"); + return -EINVAL; + } + is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; /* We don't yet support TFC padding. */ @@ -305,7 +305,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - xfrm_set_type_offload(x); if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); dev_put(dev); @@ -314,7 +313,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->dev = dev; netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); - xso->real_dev = dev; if (xuo->flags & XFRM_OFFLOAD_INBOUND) xso->dir = XFRM_DEV_OFFLOAD_IN; @@ -326,11 +324,10 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->type = XFRM_DEV_OFFLOAD_CRYPTO; - err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack); + err = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, extack); if (err) { xso->dev = NULL; xso->dir = 0; - xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; @@ -378,7 +375,6 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, xdo->dev = dev; netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); - xdo->real_dev = dev; xdo->type = XFRM_DEV_OFFLOAD_PACKET; switch (dir) { case XFRM_POLICY_IN: @@ -400,7 +396,6 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack); if (err) { xdo->dev = NULL; - xdo->real_dev = NULL; xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xdo->dir = 0; netdev_put(dev, &xdo->dev_tracker); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 7e6a71b9d6a3a..c9ddef869aa55 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -503,6 +503,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) /* An encap_type of -1 indicates async resumption. 
*/ if (encap_type == -1) { async = 1; + dev_put(skb->dev); seq = XFRM_SKB_CB(skb)->seq.input.low; goto resume; } @@ -649,18 +650,18 @@ lock: XFRM_SKB_CB(skb)->seq.input.low = seq; XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; - dev_hold(skb->dev); - - if (crypto_done) + if (crypto_done) { nexthdr = x->type_offload->input_tail(x, skb); - else + } else { + dev_hold(skb->dev); + nexthdr = x->type->input(x, skb); + if (nexthdr == -EINPROGRESS) + return 0; - if (nexthdr == -EINPROGRESS) - return 0; + dev_put(skb->dev); + } resume: - dev_put(skb->dev); - spin_lock(&x->lock); if (nexthdr < 0) { if (nexthdr == -EBADMSG) { diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 622445f041d32..330a05286a56f 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -875,7 +875,7 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], return -EINVAL; } - if (p.collect_md) { + if (p.collect_md || xi->p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } @@ -886,11 +886,6 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], } else { if (xi->dev != dev) return -EEXIST; - if (xi->p.collect_md) { - NL_SET_ERR_MSG(extack, - "device can't be changed to collect_md"); - return -EINVAL; - } } return xfrmi_update(xi, &p); @@ -952,32 +947,28 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list, - struct list_head *dev_to_kill) +static void __net_exit xfrmi_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + int i; - ASSERT_RTNL(); - list_for_each_entry(net, net_exit_list, exit_list) { - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - struct xfrm_if __rcu **xip; - struct xfrm_if *xi; - int i; - - for (i = 0; i < XFRMI_HASH_SIZE; i++) { - for (xip = &xfrmn->xfrmi[i]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - unregister_netdevice_queue(xi->dev, dev_to_kill); - } - xi = rtnl_dereference(xfrmn->collect_md_xfrmi); - if (xi) + for (i = 0; i < XFRMI_HASH_SIZE; i++) { + for (xip = &xfrmn->xfrmi[i]; + (xi = rtnl_net_dereference(net, *xip)) != NULL; + xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } + + xi = rtnl_net_dereference(net, xfrmn->collect_md_xfrmi); + if (xi) + unregister_netdevice_queue(xi->dev, dev_to_kill); } static struct pernet_operations xfrmi_net_ops = { - .exit_batch_rtnl = xfrmi_exit_batch_rtnl, + .exit_rtnl = xfrmi_exit_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 907c3ccb440da..43fdc6ed8dd17 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -97,7 +97,7 @@ static int ipcomp_input_done2(struct sk_buff *skb, int err) struct ip_comp_hdr *ipch = ip_comp_hdr(skb); const int plen = skb->len; - skb_reset_transport_header(skb); + skb->transport_header = skb->network_header + sizeof(*ipch); return ipcomp_post_acomp(skb, err, 0) ?: skb->len < (plen + sizeof(ip_comp_hdr)) ? 
-EINVAL : @@ -313,7 +313,6 @@ void ipcomp_destroy(struct xfrm_state *x) struct ipcomp_data *ipcd = x->data; if (!ipcd) return; - xfrm_state_delete_tunnel(x); ipcomp_free_data(ipcd); kfree(ipcd); } diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c index 82f0a301683f0..ebf95d48e86c1 100644 --- a/net/xfrm/xfrm_nat_keepalive.c +++ b/net/xfrm/xfrm_nat_keepalive.c @@ -9,9 +9,13 @@ #include <net/ip6_checksum.h> #include <net/xfrm.h> -static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4); +static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #if IS_ENABLED(CONFIG_IPV6) -static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6); +static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #endif struct nat_keepalive { @@ -56,10 +60,12 @@ static int nat_keepalive_send_ipv4(struct sk_buff *skb, skb_dst_set(skb, &rt->dst); - sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4); + local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); + sk = this_cpu_read(nat_keepalive_sk_ipv4.sock); sock_net_set(sk, net); err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos); sock_net_set(sk, &init_net); + local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); return err; } @@ -89,15 +95,19 @@ static int nat_keepalive_send_ipv6(struct sk_buff *skb, fl6.fl6_sport = ka->encap_sport; fl6.fl6_dport = ka->encap_dport; - sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6); + local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); + sk = this_cpu_read(nat_keepalive_sk_ipv6.sock); sock_net_set(sk, net); dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL); - if (IS_ERR(dst)) + if (IS_ERR(dst)) { + local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); return PTR_ERR(dst); + } skb_dst_set(skb, dst); err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0); sock_net_set(sk, &init_net); + local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); return err; } #endif @@ -202,7 +212,7 @@ static void nat_keepalive_work(struct work_struct *work) (ctx.next_run - ctx.now) * HZ); } -static int nat_keepalive_sk_init(struct sock * __percpu *socks, +static int nat_keepalive_sk_init(struct sock_bh_locked __percpu *socks, unsigned short family) { struct sock *sk; @@ -214,22 +224,22 @@ static int nat_keepalive_sk_init(struct sock * __percpu *socks, if (err < 0) goto err; - *per_cpu_ptr(socks, i) = sk; + per_cpu_ptr(socks, i)->sock = sk; } return 0; err: for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); + inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); return err; } -static void nat_keepalive_sk_fini(struct sock * __percpu *socks) +static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks) { int i; for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); + inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); } void xfrm_nat_keepalive_state_updated(struct xfrm_state *x) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f4bad8c895d66..c5035a9bc3bb2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -353,7 +353,7 @@ static inline unsigned long make_jiffies(long secs) static void xfrm_policy_timer(struct timer_list *t) { - struct xfrm_policy *xp = from_timer(xp, t, timer); + struct xfrm_policy *xp = timer_container_of(xp, t, timer); time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; @@ -2898,7 +2898,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) struct 
sk_buff *skb; struct sock *sk; struct dst_entry *dst; - struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); + struct xfrm_policy *pol = timer_container_of(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; @@ -3925,7 +3925,7 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. */ - if (dst->obsolete < 0 && !stale_bundle(dst)) + if (READ_ONCE(dst->obsolete) < 0 && !stale_bundle(dst)) return dst; return NULL; @@ -3953,7 +3953,7 @@ static void xfrm_link_failure(struct sk_buff *skb) static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { - if (dst->obsolete) + if (READ_ONCE(dst->obsolete)) sk_dst_reset(sk); } @@ -4633,7 +4633,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4666,7 +4666,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; - xc = xfrm_state_migrate(x, mp, encap); + xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack); if (xc) { x_new[nx_new] = xc; nx_new++; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 07fe8e5daa32b..77db3b5fe4aca 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -424,11 +424,10 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, } EXPORT_SYMBOL(xfrm_unregister_type_offload); -void xfrm_set_type_offload(struct xfrm_state *x) +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load) { const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; - bool try_load = true; retry: afinfo = xfrm_state_get_afinfo(x->props.family); @@ -593,20 +592,21 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); -static void ___xfrm_state_destroy(struct xfrm_state *x) +static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (x->mode_cbs && x->mode_cbs->destroy_state) x->mode_cbs->destroy_state(x); hrtimer_cancel(&x->mtimer); timer_delete_sync(&x->rtimer); - kfree(x->aead); - kfree(x->aalg); - kfree(x->ealg); + kfree_sensitive(x->aead); + kfree_sensitive(x->aalg); + kfree_sensitive(x->ealg); kfree(x->calg); kfree(x->encap); kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); + xfrm_unset_type_offload(x); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -631,7 +631,7 @@ static void xfrm_state_gc_task(struct work_struct *work) synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) - ___xfrm_state_destroy(x); + xfrm_state_gc_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) @@ -767,7 +767,7 @@ void xfrm_dev_state_delete(struct xfrm_state *x) struct net_device *dev = READ_ONCE(xso->dev); if (dev) { - dev->xfrmdev_ops->xdo_dev_state_delete(x); + dev->xfrmdev_ops->xdo_dev_state_delete(dev, x); spin_lock_bh(&xfrm_state_dev_gc_lock); hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list); spin_unlock_bh(&xfrm_state_dev_gc_lock); @@ -780,8 +780,6 @@ void xfrm_dev_state_free(struct xfrm_state *x) struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = 
READ_ONCE(xso->dev); - xfrm_unset_type_offload(x); - if (dev && dev->xfrmdev_ops) { spin_lock_bh(&xfrm_state_dev_gc_lock); if (!hlist_unhashed(&x->dev_gclist)) @@ -789,7 +787,7 @@ void xfrm_dev_state_free(struct xfrm_state *x) spin_unlock_bh(&xfrm_state_dev_gc_lock); if (dev->xfrmdev_ops->xdo_dev_state_free) - dev->xfrmdev_ops->xdo_dev_state_free(x); + dev->xfrmdev_ops->xdo_dev_state_free(dev, x); WRITE_ONCE(xso->dev, NULL); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); @@ -797,22 +795,18 @@ void xfrm_dev_state_free(struct xfrm_state *x) } #endif -void __xfrm_state_destroy(struct xfrm_state *x, bool sync) +void __xfrm_state_destroy(struct xfrm_state *x) { WARN_ON(x->km.state != XFRM_STATE_DEAD); - if (sync) { - synchronize_rcu(); - ___xfrm_state_destroy(x); - } else { - spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &xfrm_state_gc_list); - spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&xfrm_state_gc_work); - } + spin_lock_bh(&xfrm_state_gc_lock); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(__xfrm_state_destroy); +static void xfrm_state_delete_tunnel(struct xfrm_state *x); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -840,6 +834,8 @@ int __xfrm_state_delete(struct xfrm_state *x) xfrm_dev_state_delete(x); + xfrm_state_delete_tunnel(x); + /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. @@ -921,7 +917,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool } #endif -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) { int i, err = 0, cnt = 0; @@ -943,10 +939,7 @@ restart: err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); - if (sync) - xfrm_state_put_sync(x); - else - xfrm_state_put(x); + xfrm_state_put(x); if (!err) cnt++; @@ -1307,14 +1300,8 @@ static void xfrm_hash_grow_check(struct net *net, int have_hash_collision) static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, const struct flowi *fl, unsigned short family, struct xfrm_state **best, int *acq_in_progress, - int *error) + int *error, unsigned int pcpu_id) { - /* We need the cpu id just as a lookup key, - * we don't require it to be stable. - */ - unsigned int pcpu_id = get_cpu(); - put_cpu(); - /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. @@ -1381,14 +1368,15 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, /* We need the cpu id just as a lookup key, * we don't require it to be stable. 
*/ - pcpu_id = get_cpu(); - put_cpu(); + pcpu_id = raw_smp_processor_id(); to_put = NULL; sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); + xfrm_hash_ptrs_get(net, &state_ptrs); + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && @@ -1400,7 +1388,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, encap_family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } if (best) @@ -1417,7 +1405,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } cached: @@ -1429,8 +1417,6 @@ cached: else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); - xfrm_hash_ptrs_get(net, &state_ptrs); - h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask); hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD @@ -1460,7 +1446,7 @@ cached: tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } if (best || acquire_in_progress) goto found; @@ -1495,7 +1481,7 @@ cached: tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, - &best, &acquire_in_progress, &error); + &best, &acquire_in_progress, &error, pcpu_id); } found: @@ -1548,19 +1534,19 @@ found: if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { struct xfrm_dev_offload *xdo = &pol->xdo; struct xfrm_dev_offload *xso = &x->xso; + struct net_device *dev = xdo->dev; xso->type = XFRM_DEV_OFFLOAD_PACKET; xso->dir = xdo->dir; - xso->dev = xdo->dev; - xso->real_dev = xdo->real_dev; + xso->dev = dev; xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ; - netdev_hold(xso->dev, &xso->dev_tracker, GFP_ATOMIC); - error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL); + netdev_hold(dev, &xso->dev_tracker, GFP_ATOMIC); + error = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, + NULL); if (error) { xso->dir = 0; - netdev_put(xso->dev, &xso->dev_tracker); + netdev_put(dev, &xso->dev_tracker); xso->dev = NULL; - xso->real_dev = NULL; xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; x->km.state = XFRM_STATE_DEAD; to_put = x; @@ -1711,6 +1697,26 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, } EXPORT_SYMBOL(xfrm_state_lookup_byspi); +static struct xfrm_state *xfrm_state_lookup_spi_proto(struct net *net, __be32 spi, u8 proto) +{ + struct xfrm_state *x; + unsigned int i; + + rcu_read_lock(); + for (i = 0; i <= net->xfrm.state_hmask; i++) { + hlist_for_each_entry_rcu(x, &net->xfrm.state_byspi[i], byspi) { + if (x->id.spi == spi && x->id.proto == proto) { + if (!xfrm_state_hold_rcu(x)) + continue; + rcu_read_unlock(); + return x; + } + } + } + rcu_read_unlock(); + return NULL; +} + static void __xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -1958,8 +1964,9 @@ static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *secu return 0; } -static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, - 
struct xfrm_encap_tmpl *encap) +static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig, + struct xfrm_encap_tmpl *encap, + struct xfrm_migrate *m) { struct net *net = xs_net(orig); struct xfrm_state *x = xfrm_state_alloc(net); @@ -2058,6 +2065,11 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, goto error; } + + x->props.family = m->new_family; + memcpy(&x->id.daddr, &m->new_daddr, sizeof(x->id.daddr)); + memcpy(&x->props.saddr, &m->new_saddr, sizeof(x->props.saddr)); + return x; error: @@ -2120,21 +2132,23 @@ EXPORT_SYMBOL(xfrm_migrate_state_find); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap) + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack) { struct xfrm_state *xc; - xc = xfrm_state_clone(x, encap); + xc = xfrm_state_clone_and_setup(x, encap, m); if (!xc) return NULL; - xc->props.family = m->new_family; - if (xfrm_init_state(xc) < 0) goto error; - memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); - memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); + /* configure the hardware if offload is requested */ + if (xuo && xfrm_dev_state_add(net, xc, xuo, extack)) + goto error; /* add state */ if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) { @@ -2254,7 +2268,12 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { - xfrm_dev_state_update_stats(x); + /* All counters which are needed to decide if state is expired + * are handled by SW for non-packet offload modes. Simply skip + * the following update and save extra boilerplate in drivers. + */ + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + xfrm_dev_state_update_stats(x); if (!READ_ONCE(x->curlft.use_time)) WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); @@ -2547,10 +2566,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, unsigned int h; struct xfrm_state *x0; int err = -ENOENT; - __be32 minspi = htonl(low); - __be32 maxspi = htonl(high); + u32 range = high - low + 1; __be32 newspi = 0; - u32 mark = x->mark.v & x->mark.m; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_DEAD) { @@ -2564,38 +2581,34 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, err = -ENOENT; - if (minspi == maxspi) { - x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family); - if (x0) { - NL_SET_ERR_MSG(extack, "Requested SPI is already in use"); - xfrm_state_put(x0); + for (h = 0; h < range; h++) { + u32 spi = (low == high) ? 
low : get_random_u32_inclusive(low, high); + newspi = htonl(spi); + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + x0 = xfrm_state_lookup_spi_proto(net, newspi, x->id.proto); + if (!x0) { + x->id.spi = newspi; + h = xfrm_spi_hash(net, &x->id.daddr, newspi, x->id.proto, x->props.family); + XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + err = 0; goto unlock; } - newspi = minspi; - } else { - u32 spi = 0; - for (h = 0; h < high-low+1; h++) { - spi = get_random_u32_inclusive(low, high); - x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); - if (x0 == NULL) { - newspi = htonl(spi); - break; - } - xfrm_state_put(x0); + xfrm_state_put(x0); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + + if (signal_pending(current)) { + err = -ERESTARTSYS; + goto unlock; } + + if (low == high) + break; } - if (newspi) { - spin_lock_bh(&net->xfrm.xfrm_state_lock); - x->id.spi = newspi; - h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, - x->xso.type); - spin_unlock_bh(&net->xfrm.xfrm_state_lock); - err = 0; - } else { + if (err) NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); - } unlock: spin_unlock_bh(&x->lock); @@ -2689,7 +2702,7 @@ EXPORT_SYMBOL(xfrm_state_walk_done); static void xfrm_replay_timer_handler(struct timer_list *t) { - struct xfrm_state *x = from_timer(x, t, rtimer); + struct xfrm_state *x = timer_container_of(x, t, rtimer); spin_lock(&x->lock); @@ -3069,20 +3082,17 @@ void xfrm_flush_gc(void) } EXPORT_SYMBOL(xfrm_flush_gc); -/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ -void xfrm_state_delete_tunnel(struct xfrm_state *x) +static void xfrm_state_delete_tunnel(struct xfrm_state *x) { if (x->tunnel) { struct xfrm_state *t = x->tunnel; - if (atomic_read(&t->tunnel_users) == 2) + if (atomic_dec_return(&t->tunnel_users) == 1) xfrm_state_delete(t); - atomic_dec(&t->tunnel_users); - xfrm_state_put_sync(t); + xfrm_state_put(t); x->tunnel = NULL; } } -EXPORT_SYMBOL(xfrm_state_delete_tunnel); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { @@ -3287,8 +3297,8 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); flush_work(&xfrm_state_gc_work); - xfrm_state_flush(net, 0, false, true); WARN_ON(!list_empty(&net->xfrm.state_all)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 784a2d124749f..684239018bec4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -178,11 +178,27 @@ static inline int verify_replay(struct xfrm_usersa_info *p, "Replay seq and seq_hi should be 0 for output SA"); return -EINVAL; } - if (rs->oseq_hi && !(p->flags & XFRM_STATE_ESN)) { - NL_SET_ERR_MSG( - extack, - "Replay oseq_hi should be 0 in non-ESN mode for output SA"); - return -EINVAL; + + if (!(p->flags & XFRM_STATE_ESN)) { + if (rs->oseq_hi) { + NL_SET_ERR_MSG( + extack, + "Replay oseq_hi should be 0 in non-ESN mode for output SA"); + return -EINVAL; + } + if (rs->oseq == U32_MAX) { + NL_SET_ERR_MSG( + extack, + "Replay oseq should be less than 0xFFFFFFFF in non-ESN mode for output SA"); + return -EINVAL; + } + } else { + if (rs->oseq == U32_MAX && rs->oseq_hi == U32_MAX) { + NL_SET_ERR_MSG( + extack, + "Replay oseq and oseq_hi should be less than 0xFFFFFFFF for output SA"); + return -EINVAL; + } } if (rs->bmp_len) { NL_SET_ERR_MSG(extack, "Replay bmp_len 
should 0 for output SA"); @@ -196,11 +212,27 @@ static inline int verify_replay(struct xfrm_usersa_info *p, "Replay oseq and oseq_hi should be 0 for input SA"); return -EINVAL; } - if (rs->seq_hi && !(p->flags & XFRM_STATE_ESN)) { - NL_SET_ERR_MSG( - extack, - "Replay seq_hi should be 0 in non-ESN mode for input SA"); - return -EINVAL; + if (!(p->flags & XFRM_STATE_ESN)) { + if (rs->seq_hi) { + NL_SET_ERR_MSG( + extack, + "Replay seq_hi should be 0 in non-ESN mode for input SA"); + return -EINVAL; + } + + if (rs->seq == U32_MAX) { + NL_SET_ERR_MSG( + extack, + "Replay seq should be less than 0xFFFFFFFF in non-ESN mode for input SA"); + return -EINVAL; + } + } else { + if (rs->seq == U32_MAX && rs->seq_hi == U32_MAX) { + NL_SET_ERR_MSG( + extack, + "Replay seq and seq_hi should be less than 0xFFFFFFFF for input SA"); + return -EINVAL; + } } } @@ -945,6 +977,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* override default values from above */ xfrm_update_ae_params(x, attrs, 0); + xfrm_set_type_offload(x, attrs[XFRMA_OFFLOAD_DEV]); /* configure the hardware if offload is requested */ if (attrs[XFRMA_OFFLOAD_DEV]) { err = xfrm_dev_state_add(net, x, @@ -1173,7 +1206,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) if (!nla) return -EMSGSIZE; algo = nla_data(nla); - strscpy_pad(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); + strscpy_pad(algo->alg_name, auth->alg_name); if (redact_secret && auth->alg_key_len) memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8); @@ -1186,7 +1219,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) if (!nla) return -EMSGSIZE; ap = nla_data(nla); - strscpy_pad(ap->alg_name, auth->alg_name, sizeof(ap->alg_name)); + strscpy_pad(ap->alg_name, auth->alg_name); ap->alg_key_len = auth->alg_key_len; ap->alg_trunc_len = auth->alg_trunc_len; if (redact_secret && auth->alg_key_len) @@ -1207,7 +1240,7 @@ static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - strscpy_pad(ap->alg_name, aead->alg_name, sizeof(ap->alg_name)); + strscpy_pad(ap->alg_name, aead->alg_name); ap->alg_key_len = aead->alg_key_len; ap->alg_icv_len = aead->alg_icv_len; @@ -1229,7 +1262,7 @@ static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - strscpy_pad(ap->alg_name, ealg->alg_name, sizeof(ap->alg_name)); + strscpy_pad(ap->alg_name, ealg->alg_name); ap->alg_key_len = ealg->alg_key_len; if (redact_secret && ealg->alg_key_len) @@ -1250,7 +1283,7 @@ static int copy_to_user_calg(struct xfrm_algo *calg, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - strscpy_pad(ap->alg_name, calg->alg_name, sizeof(ap->alg_name)); + strscpy_pad(ap->alg_name, calg->alg_name); ap->alg_key_len = 0; return 0; @@ -2602,7 +2635,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_usersa_flush *p = nlmsg_data(nlh); int err; - err = xfrm_state_flush(net, p->proto, true, false); + err = xfrm_state_flush(net, p->proto, true); if (err) { if (err == -ESRCH) /* empty table */ return 0; @@ -3069,6 +3102,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; + struct xfrm_user_offload *xuo = NULL; u32 if_id = 0; if (!attrs[XFRMA_MIGRATE]) { @@ -3099,11 +3133,19 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_IF_ID]) if_id = 
nla_get_u32(attrs[XFRMA_IF_ID]); + if (attrs[XFRMA_OFFLOAD_DEV]) { + xuo = kmemdup(nla_data(attrs[XFRMA_OFFLOAD_DEV]), + sizeof(*xuo), GFP_KERNEL); + if (!xuo) { + err = -ENOMEM; + goto error; + } + } err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, - if_id, extack); - + if_id, extack, xuo); +error: kfree(encap); - + kfree(xuo); return err; } #else |
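The AF_XDP changes earlier in this series add a per-socket transmit budget (XDP_MAX_TX_SKB_BUDGET) that replaces the fixed TX_BATCH_SIZE cap in the generic xmit path. A minimal userspace sketch of how the new socket option might be driven is below; it assumes the XDP_MAX_TX_SKB_BUDGET define is exported by the updated <linux/if_xdp.h> UAPI header, and set_xsk_tx_budget() is a hypothetical helper for illustration, not part of the patch.

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>   /* assumed to carry the new XDP_MAX_TX_SKB_BUDGET define */

#ifndef SOL_XDP
#define SOL_XDP 283          /* socket level for AF_XDP options */
#endif

/* Hypothetical helper: adjust the per-socket TX budget on an already-created
 * AF_XDP socket. Per the patch, the kernel rejects values below 32
 * (TX_BATCH_SIZE) or above the TX ring's entry count with -EACCES, and the
 * option must be passed with exactly sizeof(unsigned int) bytes.
 */
static int set_xsk_tx_budget(int xsk_fd, unsigned int budget)
{
	if (setsockopt(xsk_fd, SOL_XDP, XDP_MAX_TX_SKB_BUDGET,
		       &budget, sizeof(budget)) < 0) {
		perror("setsockopt(XDP_MAX_TX_SKB_BUDGET)");
		return -1;
	}
	return 0;
}

Note that the TX ring (XDP_TX_RING) must already be configured before the call, since the kernel validates the requested budget against the ring size; sockets that never set the option keep the previous default of 32.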