summaryrefslogtreecommitdiff
path: root/net/mctp/route.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 08:58:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-07-30 08:58:55 -0700
commit8be4d31cb8aaeea27bde4b7ddb26e28a89062ebf (patch)
treefec3039a08284cd87f4ec9c3bea5b5a439f1859f /net/mctp/route.c
parent4b290aae788e06561754b28c6842e4080957d3f7 (diff)
parentfa582ca7e187a15e772e6a72fe035f649b387a60 (diff)
Merge tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Wrap datapath globals into net_aligned_data, to avoid false sharing - Preserve MSG_ZEROCOPY in forwarding (e.g. out of a container) - Add SO_INQ and SCM_INQ support to AF_UNIX - Add SIOCINQ support to AF_VSOCK - Add TCP_MAXSEG sockopt to MPTCP - Add IPv6 force_forwarding sysctl to enable forwarding per interface - Make TCP validation of whether packet fully fits in the receive window and the rcv_buf more strict. With increased use of HW aggregation a single "packet" can be multiple 100s of kB - Add MSG_MORE flag to optimize large TCP transmissions via sockmap, improves latency up to 33% for sockmap users - Convert TCP send queue handling from tasklet to BH workque - Improve BPF iteration over TCP sockets to see each socket exactly once - Remove obsolete and unused TCP RFC3517/RFC6675 loss recovery code - Support enabling kernel threads for NAPI processing on per-NAPI instance basis rather than a whole device. Fully stop the kernel NAPI thread when threaded NAPI gets disabled. Previously thread would stick around until ifdown due to tricky synchronization - Allow multicast routing to take effect on locally-generated packets - Add output interface argument for End.X in segment routing - MCTP: add support for gateway routing, improve bind() handling - Don't require rtnl_lock when fetching an IPv6 neighbor over Netlink - Add a new neighbor flag ("extern_valid"), which cedes refresh responsibilities to userspace. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed - Support NUD_PERMANENT for proxy neighbor entries - Add a new queuing discipline for IETF RFC9332 DualQ Coupled AQM - Add sequence numbers to netconsole messages. Unregister netconsole's console when all net targets are removed. Code refactoring. Add a number of selftests - Align IPSec inbound SA lookup to RFC 4301. Only SPI and protocol should be used for an inbound SA lookup - Support inspecting ref_tracker state via DebugFS - Don't force bonding advertisement frames tx to ~333 ms boundaries. Add broadcast_neighbor option to send ARP/ND on all bonded links - Allow providing upcall pid for the 'execute' command in openvswitch - Remove DCCP support from Netfilter's conntrack - Disallow multiple packet duplications in the queuing layer - Prevent use of deprecated iptables code on PREEMPT_RT Driver API: - Support RSS and hashing configuration over ethtool Netlink - Add dedicated ethtool callbacks for getting and setting hashing fields - Add support for power budget evaluation strategy in PSE / Power-over-Ethernet. Generate Netlink events for overcurrent etc - Support DPLL phase offset monitoring across all device inputs. Support providing clock reference and SYNC over separate DPLL inputs - Support traffic classes in devlink rate API for bandwidth management - Remove rtnl_lock dependency from UDP tunnel port configuration Device drivers: - Add a new Broadcom driver for 800G Ethernet (bnge) - Add a standalone driver for Microchip ZL3073x DPLL - Remove IBM's NETIUCV device driver - Ethernet high-speed NICs: - Broadcom (bnxt): - support zero-copy Tx of DMABUF memory - take page size into account for page pool recycling rings - Intel (100G, ice, idpf): - idpf: XDP and AF_XDP support preparations - idpf: add flow steering - add link_down_events statistic - clean up the TSPLL code - preparations for live VM migration - nVidia/Mellanox: - support zero-copy Rx/Tx interfaces (DMABUF and io_uring) - optimize context memory usage for matchers - expose serial numbers in devlink info - support PCIe congestion metrics - Meta (fbnic): - add 25G, 50G, and 100G link modes to phylink - support dumping FW logs - Marvell/Cavium: - support for CN20K generation of the Octeon chips - Amazon: - add HW clock (without timestamping, just hypervisor time access) - Ethernet virtual: - VirtIO net: - support segmentation of UDP-tunnel-encapsulated packets - Google (gve): - support packet timestamping and clock synchronization - Microsoft vNIC: - add handler for device-originated servicing events - allow dynamic MSI-X vector allocation - support Tx bandwidth clamping - Ethernet NICs consumer, and embedded: - AMD: - amd-xgbe: hardware timestamping and PTP clock support - Broadcom integrated MACs (bcmgenet, bcmasp): - use napi_complete_done() return value to support NAPI polling - add support for re-starting auto-negotiation - Broadcom switches (b53): - support BCM5325 switches - add bcm63xx EPHY power control - Synopsys (stmmac): - lots of code refactoring and cleanups - TI: - icssg-prueth: read firmware-names from device tree - icssg: PRP offload support - Microchip: - lan78xx: convert to PHYLINK for improved PHY and MAC management - ksz: add KSZ8463 switch support - Intel: - support similar queue priority scheme in multi-queue and time-sensitive networking (taprio) - support packet pre-emption in both - RealTek (r8169): - enable EEE at 5Gbps on RTL8126 - Airoha: - add PPPoE offload support - MDIO bus controller for Airoha AN7583 - Ethernet PHYs: - support for the IPQ5018 internal GE PHY - micrel KSZ9477 switch-integrated PHYs: - add MDI/MDI-X control support - add RX error counters - add cable test support - add Signal Quality Indicator (SQI) reporting - dp83tg720: improve reset handling and reduce link recovery time - support bcm54811 (and its MII-Lite interface type) - air_en8811h: support resume/suspend - support PHY counters for QCA807x and QCA808x - support WoL for QCA807x - CAN drivers: - rcar_canfd: support for Transceiver Delay Compensation - kvaser: report FW versions via devlink dev info - WiFi: - extended regulatory info support (6 GHz) - add statistics and beacon monitor for Multi-Link Operation (MLO) - support S1G aggregation, improve S1G support - add Radio Measurement action fields - support per-radio RTS threshold - some work around how FIPS affects wifi, which was wrong (RC4 is used by TKIP, not only WEP) - improvements for unsolicited probe response handling - WiFi drivers: - RealTek (rtw88): - IBSS mode for SDIO devices - RealTek (rtw89): - BT coexistence for MLO/WiFi7 - concurrent station + P2P support - support for USB devices RTL8851BU/RTL8852BU - Intel (iwlwifi): - use embedded PNVM in (to be released) FW images to fix compatibility issues - many cleanups (unused FW APIs, PCIe code, WoWLAN) - some FIPS interoperability - MediaTek (mt76): - firmware recovery improvements - more MLO work - Qualcomm/Atheros (ath12k): - fix scan on multi-radio devices - more EHT/Wi-Fi 7 features - encapsulation/decapsulation offload - Broadcom (brcm80211): - support SDIO 43751 device - Bluetooth: - hci_event: add support for handling LE BIG Sync Lost event - ISO: add socket option to report packet seqnum via CMSG - ISO: support SCM_TIMESTAMPING for ISO TS - Bluetooth drivers: - intel_pcie: support Function Level Reset - nxpuart: add support for 4M baudrate - nxpuart: implement powerup sequence, reset, FW dump, and FW loading" * tag 'net-next-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1742 commits) dpll: zl3073x: Fix build failure selftests: bpf: fix legacy netfilter options ipv6: annotate data-races around rt->fib6_nsiblings ipv6: fix possible infinite loop in fib6_info_uses_dev() ipv6: prevent infinite loop in rt6_nlmsg_size() ipv6: add a retry logic in net6_rt_notify() vrf: Drop existing dst reference in vrf_ip6_input_dst net/sched: taprio: align entry index attr validation with mqprio net: fsl_pq_mdio: use dev_err_probe selftests: rtnetlink.sh: remove esp4_offload after test vsock: remove unnecessary null check in vsock_getname() igb: xsk: solve negative overflow of nb_pkts in zerocopy mode stmmac: xsk: fix negative overflow of budget in zerocopy mode dt-bindings: ieee802154: Convert at86rf230.txt yaml format net: dsa: microchip: Disable PTP function of KSZ8463 net: dsa: microchip: Setup fiber ports for KSZ8463 net: dsa: microchip: Write switch MAC address differently for KSZ8463 net: dsa: microchip: Use different registers for KSZ8463 net: dsa: microchip: Add KSZ8463 switch support to KSZ DSA driver dt-bindings: net: dsa: microchip: Add KSZ8463 switch support ...
Diffstat (limited to 'net/mctp/route.c')
-rw-r--r--net/mctp/route.c653
1 files changed, 450 insertions, 203 deletions
diff --git a/net/mctp/route.c b/net/mctp/route.c
index d9c8e5a5f9ce..2b2b958ef6a3 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -17,6 +17,8 @@
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
+#include <kunit/static_stub.h>
+
#include <uapi/linux/if_arp.h>
#include <net/mctp.h>
@@ -32,39 +34,42 @@ static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ;
static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev);
/* route output callbacks */
-static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
+static int mctp_dst_discard(struct mctp_dst *dst, struct sk_buff *skb)
{
kfree_skb(skb);
return 0;
}
-static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
+static struct mctp_sock *mctp_lookup_bind_details(struct net *net,
+ struct sk_buff *skb,
+ u8 type, u8 dest,
+ u8 src, bool allow_net_any)
{
struct mctp_skb_cb *cb = mctp_cb(skb);
- struct mctp_hdr *mh;
struct sock *sk;
- u8 type;
+ u8 hash;
- WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
- /* TODO: look up in skb->cb? */
- mh = mctp_hdr(skb);
-
- if (!skb_headlen(skb))
- return NULL;
+ hash = mctp_bind_hash(type, dest, src);
- type = (*(u8 *)skb->data) & 0x7f;
-
- sk_for_each_rcu(sk, &net->mctp.binds) {
+ sk_for_each_rcu(sk, &net->mctp.binds[hash]) {
struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+ if (!allow_net_any && msk->bind_net == MCTP_NET_ANY)
+ continue;
+
if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net)
continue;
if (msk->bind_type != type)
continue;
- if (!mctp_address_matches(msk->bind_addr, mh->dest))
+ if (msk->bind_peer_set &&
+ !mctp_address_matches(msk->bind_peer_addr, src))
+ continue;
+
+ if (!mctp_address_matches(msk->bind_local_addr, dest))
continue;
return msk;
@@ -73,6 +78,54 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
return NULL;
}
+static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
+{
+ struct mctp_sock *msk;
+ struct mctp_hdr *mh;
+ u8 type;
+
+ /* TODO: look up in skb->cb? */
+ mh = mctp_hdr(skb);
+
+ if (!skb_headlen(skb))
+ return NULL;
+
+ type = (*(u8 *)skb->data) & 0x7f;
+
+ /* Look for binds in order of widening scope. A given destination or
+ * source address also implies matching on a particular network.
+ *
+ * - Matching destination and source
+ * - Matching destination
+ * - Matching source
+ * - Matching network, any address
+ * - Any network or address
+ */
+
+ msk = mctp_lookup_bind_details(net, skb, type, mh->dest, mh->src,
+ false);
+ if (msk)
+ return msk;
+ msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY, mh->src,
+ false);
+ if (msk)
+ return msk;
+ msk = mctp_lookup_bind_details(net, skb, type, mh->dest, MCTP_ADDR_ANY,
+ false);
+ if (msk)
+ return msk;
+ msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY,
+ MCTP_ADDR_ANY, false);
+ if (msk)
+ return msk;
+ msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY,
+ MCTP_ADDR_ANY, true);
+ if (msk)
+ return msk;
+
+ return NULL;
+}
+
/* A note on the key allocations.
*
* struct net->mctp.keys contains our set of currently-allocated keys for
@@ -368,7 +421,7 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
return 0;
}
-static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
+static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb)
{
struct mctp_sk_key *key, *any_key = NULL;
struct net *net = dev_net(skb->dev);
@@ -392,6 +445,9 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
*/
skb_orphan(skb);
+ if (skb->pkt_type == PACKET_OUTGOING)
+ skb->pkt_type = PACKET_LOOPBACK;
+
/* ensure we have enough data for a header and a type */
if (skb->len < sizeof(struct mctp_hdr) + 1)
goto out;
@@ -556,39 +612,31 @@ out:
return rc;
}
-static unsigned int mctp_route_mtu(struct mctp_route *rt)
-{
- return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu);
-}
-
-static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
+static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
{
- struct mctp_skb_cb *cb = mctp_cb(skb);
- struct mctp_hdr *hdr = mctp_hdr(skb);
char daddr_buf[MAX_ADDR_LEN];
char *daddr = NULL;
- unsigned int mtu;
int rc;
skb->protocol = htons(ETH_P_MCTP);
+ skb->pkt_type = PACKET_OUTGOING;
- mtu = READ_ONCE(skb->dev->mtu);
- if (skb->len > mtu) {
+ if (skb->len > dst->mtu) {
kfree_skb(skb);
return -EMSGSIZE;
}
- if (cb->ifindex) {
- /* direct route; use the hwaddr we stashed in sendmsg */
- if (cb->halen != skb->dev->addr_len) {
+ /* direct route; use the hwaddr we stashed in sendmsg */
+ if (dst->halen) {
+ if (dst->halen != skb->dev->addr_len) {
/* sanity check, sendmsg should have already caught this */
kfree_skb(skb);
return -EMSGSIZE;
}
- daddr = cb->haddr;
+ daddr = dst->haddr;
} else {
/* If lookup fails let the device handle daddr==NULL */
- if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0)
+ if (mctp_neigh_lookup(dst->dev, dst->nexthop, daddr_buf) == 0)
daddr = daddr_buf;
}
@@ -599,7 +647,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
return -EHOSTUNREACH;
}
- mctp_flow_prepare_output(skb, route->dev);
+ mctp_flow_prepare_output(skb, dst->dev);
rc = dev_queue_xmit(skb);
if (rc)
@@ -612,7 +660,8 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
static void mctp_route_release(struct mctp_route *rt)
{
if (refcount_dec_and_test(&rt->refs)) {
- mctp_dev_put(rt->dev);
+ if (rt->dst_type == MCTP_ROUTE_DIRECT)
+ mctp_dev_put(rt->dev);
kfree_rcu(rt, rcu);
}
}
@@ -628,7 +677,7 @@ static struct mctp_route *mctp_route_alloc(void)
INIT_LIST_HEAD(&rt->list);
refcount_set(&rt->refs, 1);
- rt->output = mctp_route_discard;
+ rt->output = mctp_dst_discard;
return rt;
}
@@ -801,10 +850,16 @@ static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk,
}
/* routing lookups */
+static unsigned int mctp_route_netid(struct mctp_route *rt)
+{
+ return rt->dst_type == MCTP_ROUTE_DIRECT ?
+ READ_ONCE(rt->dev->net) : rt->gateway.net;
+}
+
static bool mctp_rt_match_eid(struct mctp_route *rt,
unsigned int net, mctp_eid_t eid)
{
- return READ_ONCE(rt->dev->net) == net &&
+ return mctp_route_netid(rt) == net &&
rt->min <= eid && rt->max >= eid;
}
@@ -813,54 +868,150 @@ static bool mctp_rt_compare_exact(struct mctp_route *rt1,
struct mctp_route *rt2)
{
ASSERT_RTNL();
- return rt1->dev->net == rt2->dev->net &&
+ return mctp_route_netid(rt1) == mctp_route_netid(rt2) &&
rt1->min == rt2->min &&
rt1->max == rt2->max;
}
-struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
- mctp_eid_t daddr)
+/* must only be called on a direct route, as the final output hop */
+static void mctp_dst_from_route(struct mctp_dst *dst, mctp_eid_t eid,
+ unsigned int mtu, struct mctp_route *route)
+{
+ mctp_dev_hold(route->dev);
+ dst->nexthop = eid;
+ dst->dev = route->dev;
+ dst->mtu = READ_ONCE(dst->dev->dev->mtu);
+ if (mtu)
+ dst->mtu = min(dst->mtu, mtu);
+ dst->halen = 0;
+ dst->output = route->output;
+}
+
+int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex,
+ unsigned char halen, const unsigned char *haddr)
{
- struct mctp_route *tmp, *rt = NULL;
+ struct net_device *netdev;
+ struct mctp_dev *dev;
+ int rc = -ENOENT;
+
+ if (halen > sizeof(dst->haddr))
+ return -EINVAL;
rcu_read_lock();
- list_for_each_entry_rcu(tmp, &net->mctp.routes, list) {
- /* TODO: add metrics */
- if (mctp_rt_match_eid(tmp, dnet, daddr)) {
- if (refcount_inc_not_zero(&tmp->refs)) {
- rt = tmp;
- break;
- }
- }
+ netdev = dev_get_by_index_rcu(net, ifindex);
+ if (!netdev)
+ goto out_unlock;
+
+ if (netdev->addr_len != halen) {
+ rc = -EINVAL;
+ goto out_unlock;
}
+ dev = __mctp_dev_get(netdev);
+ if (!dev)
+ goto out_unlock;
+
+ dst->dev = dev;
+ dst->mtu = READ_ONCE(netdev->mtu);
+ dst->halen = halen;
+ dst->output = mctp_dst_output;
+ dst->nexthop = 0;
+ memcpy(dst->haddr, haddr, halen);
+
+ rc = 0;
+
+out_unlock:
rcu_read_unlock();
+ return rc;
+}
- return rt;
+void mctp_dst_release(struct mctp_dst *dst)
+{
+ mctp_dev_put(dst->dev);
+}
+
+static struct mctp_route *mctp_route_lookup_single(struct net *net,
+ unsigned int dnet,
+ mctp_eid_t daddr)
+{
+ struct mctp_route *rt;
+
+ list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
+ if (mctp_rt_match_eid(rt, dnet, daddr))
+ return rt;
+ }
+
+ return NULL;
}
-static struct mctp_route *mctp_route_lookup_null(struct net *net,
- struct net_device *dev)
+/* populates *dst on successful lookup, if set */
+int mctp_route_lookup(struct net *net, unsigned int dnet,
+ mctp_eid_t daddr, struct mctp_dst *dst)
{
- struct mctp_route *tmp, *rt = NULL;
+ const unsigned int max_depth = 32;
+ unsigned int depth, mtu = 0;
+ int rc = -EHOSTUNREACH;
rcu_read_lock();
- list_for_each_entry_rcu(tmp, &net->mctp.routes, list) {
- if (tmp->dev->dev == dev && tmp->type == RTN_LOCAL &&
- refcount_inc_not_zero(&tmp->refs)) {
- rt = tmp;
+ for (depth = 0; depth < max_depth; depth++) {
+ struct mctp_route *rt;
+
+ rt = mctp_route_lookup_single(net, dnet, daddr);
+ if (!rt)
+ break;
+
+ /* clamp mtu to the smallest in the path, allowing 0
+ * to specify no restrictions
+ */
+ if (mtu && rt->mtu)
+ mtu = min(mtu, rt->mtu);
+ else
+ mtu = mtu ?: rt->mtu;
+
+ if (rt->dst_type == MCTP_ROUTE_DIRECT) {
+ if (dst)
+ mctp_dst_from_route(dst, daddr, mtu, rt);
+ rc = 0;
break;
+
+ } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) {
+ daddr = rt->gateway.eid;
}
}
rcu_read_unlock();
- return rt;
+ return rc;
+}
+
+static int mctp_route_lookup_null(struct net *net, struct net_device *dev,
+ struct mctp_dst *dst)
+{
+ int rc = -EHOSTUNREACH;
+ struct mctp_route *rt;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
+ if (rt->dst_type != MCTP_ROUTE_DIRECT || rt->type != RTN_LOCAL)
+ continue;
+
+ if (rt->dev->dev != dev)
+ continue;
+
+ mctp_dst_from_route(dst, 0, 0, rt);
+ rc = 0;
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return rc;
}
-static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
+static int mctp_do_fragment_route(struct mctp_dst *dst, struct sk_buff *skb,
unsigned int mtu, u8 tag)
{
const unsigned int hlen = sizeof(struct mctp_hdr);
@@ -933,7 +1084,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
skb_ext_copy(skb2, skb);
/* do route */
- rc = rt->output(rt, skb2);
+ rc = dst->output(dst, skb2);
if (rc)
break;
@@ -945,68 +1096,34 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
return rc;
}
-int mctp_local_output(struct sock *sk, struct mctp_route *rt,
+int mctp_local_output(struct sock *sk, struct mctp_dst *dst,
struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
{
struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
- struct mctp_skb_cb *cb = mctp_cb(skb);
- struct mctp_route tmp_rt = {0};
struct mctp_sk_key *key;
struct mctp_hdr *hdr;
unsigned long flags;
unsigned int netid;
unsigned int mtu;
mctp_eid_t saddr;
- bool ext_rt;
int rc;
u8 tag;
- rc = -ENODEV;
-
- if (rt) {
- ext_rt = false;
- if (WARN_ON(!rt->dev))
- goto out_release;
-
- } else if (cb->ifindex) {
- struct net_device *dev;
-
- ext_rt = true;
- rt = &tmp_rt;
-
- rcu_read_lock();
- dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex);
- if (!dev) {
- rcu_read_unlock();
- goto out_free;
- }
- rt->dev = __mctp_dev_get(dev);
- rcu_read_unlock();
-
- if (!rt->dev)
- goto out_release;
+ KUNIT_STATIC_STUB_REDIRECT(mctp_local_output, sk, dst, skb, daddr,
+ req_tag);
- /* establish temporary route - we set up enough to keep
- * mctp_route_output happy
- */
- rt->output = mctp_route_output;
- rt->mtu = 0;
-
- } else {
- rc = -EINVAL;
- goto out_free;
- }
+ rc = -ENODEV;
- spin_lock_irqsave(&rt->dev->addrs_lock, flags);
- if (rt->dev->num_addrs == 0) {
+ spin_lock_irqsave(&dst->dev->addrs_lock, flags);
+ if (dst->dev->num_addrs == 0) {
rc = -EHOSTUNREACH;
} else {
/* use the outbound interface's first address as our source */
- saddr = rt->dev->addrs[0];
+ saddr = dst->dev->addrs[0];
rc = 0;
}
- spin_unlock_irqrestore(&rt->dev->addrs_lock, flags);
- netid = READ_ONCE(rt->dev->net);
+ spin_unlock_irqrestore(&dst->dev->addrs_lock, flags);
+ netid = READ_ONCE(dst->dev->net);
if (rc)
goto out_release;
@@ -1032,15 +1149,13 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
tag = req_tag & MCTP_TAG_MASK;
}
+ skb->pkt_type = PACKET_OUTGOING;
skb->protocol = htons(ETH_P_MCTP);
skb->priority = 0;
skb_reset_transport_header(skb);
skb_push(skb, sizeof(struct mctp_hdr));
skb_reset_network_header(skb);
- skb->dev = rt->dev->dev;
-
- /* cb->net will have been set on initial ingress */
- cb->src = saddr;
+ skb->dev = dst->dev->dev;
/* set up common header fields */
hdr = mctp_hdr(skb);
@@ -1048,73 +1163,64 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
hdr->dest = daddr;
hdr->src = saddr;
- mtu = mctp_route_mtu(rt);
+ mtu = dst->mtu;
if (skb->len + sizeof(struct mctp_hdr) <= mtu) {
hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM |
MCTP_HDR_FLAG_EOM | tag;
- rc = rt->output(rt, skb);
+ rc = dst->output(dst, skb);
} else {
- rc = mctp_do_fragment_route(rt, skb, mtu, tag);
+ rc = mctp_do_fragment_route(dst, skb, mtu, tag);
}
/* route output functions consume the skb, even on error */
skb = NULL;
out_release:
- if (!ext_rt)
- mctp_route_release(rt);
-
- mctp_dev_put(tmp_rt.dev);
-
-out_free:
kfree_skb(skb);
return rc;
}
/* route management */
-static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
- unsigned int daddr_extent, unsigned int mtu,
- unsigned char type)
+
+/* mctp_route_add(): Add the provided route, previously allocated via
+ * mctp_route_alloc(). On success, takes ownership of @rt, which includes a
+ * hold on rt->dev for usage in the route table. On failure a caller will want
+ * to mctp_route_release().
+ *
+ * We expect that the caller has set rt->type, rt->dst_type, rt->min, rt->max,
+ * rt->mtu and either rt->dev (with a reference held appropriately) or
+ * rt->gateway. Other fields will be populated.
+ */
+static int mctp_route_add(struct net *net, struct mctp_route *rt)
{
- int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb);
- struct net *net = dev_net(mdev->dev);
- struct mctp_route *rt, *ert;
+ struct mctp_route *ert;
- if (!mctp_address_unicast(daddr_start))
+ if (!mctp_address_unicast(rt->min) || !mctp_address_unicast(rt->max))
return -EINVAL;
- if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255)
+ if (rt->dst_type == MCTP_ROUTE_DIRECT && !rt->dev)
+ return -EINVAL;
+
+ if (rt->dst_type == MCTP_ROUTE_GATEWAY && !rt->gateway.eid)
return -EINVAL;
- switch (type) {
+ switch (rt->type) {
case RTN_LOCAL:
- rtfn = mctp_route_input;
+ rt->output = mctp_dst_input;
break;
case RTN_UNICAST:
- rtfn = mctp_route_output;
+ rt->output = mctp_dst_output;
break;
default:
return -EINVAL;
}
- rt = mctp_route_alloc();
- if (!rt)
- return -ENOMEM;
-
- rt->min = daddr_start;
- rt->max = daddr_start + daddr_extent;
- rt->mtu = mtu;
- rt->dev = mdev;
- mctp_dev_hold(rt->dev);
- rt->type = type;
- rt->output = rtfn;
-
ASSERT_RTNL();
+
/* Prevent duplicate identical routes. */
list_for_each_entry(ert, &net->mctp.routes, list) {
if (mctp_rt_compare_exact(rt, ert)) {
- mctp_route_release(rt);
return -EEXIST;
}
}
@@ -1124,10 +1230,10 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
return 0;
}
-static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start,
- unsigned int daddr_extent, unsigned char type)
+static int mctp_route_remove(struct net *net, unsigned int netid,
+ mctp_eid_t daddr_start, unsigned int daddr_extent,
+ unsigned char type)
{
- struct net *net = dev_net(mdev->dev);
struct mctp_route *rt, *tmp;
mctp_eid_t daddr_end;
bool dropped;
@@ -1141,7 +1247,7 @@ static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start,
ASSERT_RTNL();
list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
- if (rt->dev == mdev &&
+ if (mctp_route_netid(rt) == netid &&
rt->min == daddr_start && rt->max == daddr_end &&
rt->type == type) {
list_del_rcu(&rt->list);
@@ -1156,12 +1262,32 @@ static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start,
int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
{
- return mctp_route_add(mdev, addr, 0, 0, RTN_LOCAL);
+ struct mctp_route *rt;
+ int rc;
+
+ rt = mctp_route_alloc();
+ if (!rt)
+ return -ENOMEM;
+
+ rt->min = addr;
+ rt->max = addr;
+ rt->dst_type = MCTP_ROUTE_DIRECT;
+ rt->dev = mdev;
+ rt->type = RTN_LOCAL;
+
+ mctp_dev_hold(rt->dev);
+
+ rc = mctp_route_add(dev_net(mdev->dev), rt);
+ if (rc)
+ mctp_route_release(rt);
+
+ return rc;
}
int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr)
{
- return mctp_route_remove(mdev, addr, 0, RTN_LOCAL);
+ return mctp_route_remove(dev_net(mdev->dev), mdev->net,
+ addr, 0, RTN_LOCAL);
}
/* removes all entries for a given device */
@@ -1172,7 +1298,7 @@ void mctp_route_remove_dev(struct mctp_dev *mdev)
ASSERT_RTNL();
list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
- if (rt->dev == mdev) {
+ if (rt->dst_type == MCTP_ROUTE_DIRECT && rt->dev == mdev) {
list_del_rcu(&rt->list);
/* TODO: immediate RTM_DELROUTE */
mctp_route_release(rt);
@@ -1189,8 +1315,9 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
struct net *net = dev_net(dev);
struct mctp_dev *mdev;
struct mctp_skb_cb *cb;
- struct mctp_route *rt;
+ struct mctp_dst dst;
struct mctp_hdr *mh;
+ int rc;
rcu_read_lock();
mdev = __mctp_dev_get(dev);
@@ -1232,17 +1359,17 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
cb->net = READ_ONCE(mdev->net);
cb->ifindex = dev->ifindex;
- rt = mctp_route_lookup(net, cb->net, mh->dest);
+ rc = mctp_route_lookup(net, cb->net, mh->dest, &dst);
/* NULL EID, but addressed to our physical address */
- if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST)
- rt = mctp_route_lookup_null(net, dev);
+ if (rc && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST)
+ rc = mctp_route_lookup_null(net, dev, &dst);
- if (!rt)
+ if (rc)
goto err_drop;
- rt->output(rt, skb);
- mctp_route_release(rt);
+ dst.output(&dst, skb);
+ mctp_dst_release(&dst);
mctp_dev_put(mdev);
return NET_RX_SUCCESS;
@@ -1264,19 +1391,28 @@ static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = {
[RTA_DST] = { .type = NLA_U8 },
[RTA_METRICS] = { .type = NLA_NESTED },
[RTA_OIF] = { .type = NLA_U32 },
+ [RTA_GATEWAY] = NLA_POLICY_EXACT_LEN(sizeof(struct mctp_fq_addr)),
+};
+
+static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = {
+ [RTAX_MTU] = { .type = NLA_U32 },
};
-/* Common part for RTM_NEWROUTE and RTM_DELROUTE parsing.
- * tb must hold RTA_MAX+1 elements.
+/* base parsing; common to both _lookup and _populate variants.
+ *
+ * For gateway routes (which have a RTA_GATEWAY, and no RTA_OIF), we populate
+ * *gatweayp. for direct routes (RTA_OIF, no RTA_GATEWAY), we populate *mdev.
*/
-static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack,
- struct nlattr **tb, struct rtmsg **rtm,
- struct mctp_dev **mdev, mctp_eid_t *daddr_start)
+static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct nlattr **tb, struct rtmsg **rtm,
+ struct mctp_dev **mdev,
+ struct mctp_fq_addr *gatewayp,
+ mctp_eid_t *daddr_start)
{
- struct net *net = sock_net(skb->sk);
+ struct mctp_fq_addr *gateway = NULL;
+ unsigned int ifindex = 0;
struct net_device *dev;
- unsigned int ifindex;
int rc;
rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX,
@@ -1292,11 +1428,44 @@ static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh,
}
*daddr_start = nla_get_u8(tb[RTA_DST]);
- if (!tb[RTA_OIF]) {
- NL_SET_ERR_MSG(extack, "ifindex missing");
+ if (tb[RTA_OIF])
+ ifindex = nla_get_u32(tb[RTA_OIF]);
+
+ if (tb[RTA_GATEWAY])
+ gateway = nla_data(tb[RTA_GATEWAY]);
+
+ if (ifindex && gateway) {
+ NL_SET_ERR_MSG(extack,
+ "cannot specify both ifindex and gateway");
+ return -EINVAL;
+
+ } else if (ifindex) {
+ dev = __dev_get_by_index(net, ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "bad ifindex");
+ return -ENODEV;
+ }
+ *mdev = mctp_dev_get_rtnl(dev);
+ if (!*mdev)
+ return -ENODEV;
+ gatewayp->eid = 0;
+
+ } else if (gateway) {
+ if (!mctp_address_unicast(gateway->eid)) {
+ NL_SET_ERR_MSG(extack, "bad gateway");
+ return -EINVAL;
+ }
+
+ gatewayp->eid = gateway->eid;
+ gatewayp->net = gateway->net != MCTP_NET_ANY ?
+ gateway->net :
+ READ_ONCE(net->mctp.default_net);
+ *mdev = NULL;
+
+ } else {
+ NL_SET_ERR_MSG(extack, "no route output provided");
return -EINVAL;
}
- ifindex = nla_get_u32(tb[RTA_OIF]);
*rtm = nlmsg_data(nlh);
if ((*rtm)->rtm_family != AF_MCTP) {
@@ -1304,82 +1473,157 @@ static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
- dev = __dev_get_by_index(net, ifindex);
- if (!dev) {
- NL_SET_ERR_MSG(extack, "bad ifindex");
- return -ENODEV;
- }
- *mdev = mctp_dev_get_rtnl(dev);
- if (!*mdev)
- return -ENODEV;
-
- if (dev->flags & IFF_LOOPBACK) {
- NL_SET_ERR_MSG(extack, "no routes to loopback");
+ if ((*rtm)->rtm_type != RTN_UNICAST) {
+ NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST");
return -EINVAL;
}
return 0;
}
-static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = {
- [RTAX_MTU] = { .type = NLA_U32 },
-};
-
-static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+/* Route parsing for lookup operations; we only need the "route target"
+ * components (ie., network and dest-EID range).
+ */
+static int mctp_route_nlparse_lookup(struct net *net, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ unsigned char *type, unsigned int *netid,
+ mctp_eid_t *daddr_start,
+ unsigned int *daddr_extent)
{
struct nlattr *tb[RTA_MAX + 1];
+ struct mctp_fq_addr gw;
+ struct mctp_dev *mdev;
+ struct rtmsg *rtm;
+ int rc;
+
+ rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm,
+ &mdev, &gw, daddr_start);
+ if (rc)
+ return rc;
+
+ if (mdev) {
+ *netid = mdev->net;
+ } else if (gw.eid) {
+ *netid = gw.net;
+ } else {
+ /* bug: _nlparse_common should not allow this */
+ return -1;
+ }
+
+ *type = rtm->rtm_type;
+ *daddr_extent = rtm->rtm_dst_len;
+
+ return 0;
+}
+
+/* Full route parse for RTM_NEWROUTE: populate @rt. On success,
+ * MCTP_ROUTE_DIRECT routes (ie, those with a direct dev) will hold a reference
+ * to that dev.
+ */
+static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct mctp_route *rt)
+{
struct nlattr *tbx[RTAX_MAX + 1];
+ struct nlattr *tb[RTA_MAX + 1];
+ unsigned int daddr_extent;
+ struct mctp_fq_addr gw;
mctp_eid_t daddr_start;
- struct mctp_dev *mdev;
+ struct mctp_dev *dev;
struct rtmsg *rtm;
- unsigned int mtu;
+ u32 mtu = 0;
int rc;
- rc = mctp_route_nlparse(skb, nlh, extack, tb,
- &rtm, &mdev, &daddr_start);
- if (rc < 0)
+ rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm,
+ &dev, &gw, &daddr_start);
+ if (rc)
return rc;
- if (rtm->rtm_type != RTN_UNICAST) {
- NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST");
+ daddr_extent = rtm->rtm_dst_len;
+
+ if (daddr_extent > 0xff || daddr_extent + daddr_start >= 255) {
+ NL_SET_ERR_MSG(extack, "invalid eid range");
return -EINVAL;
}
- mtu = 0;
if (tb[RTA_METRICS]) {
rc = nla_parse_nested(tbx, RTAX_MAX, tb[RTA_METRICS],
rta_metrics_policy, NULL);
- if (rc < 0)
+ if (rc < 0) {
+ NL_SET_ERR_MSG(extack, "incorrect RTA_METRICS format");
return rc;
+ }
if (tbx[RTAX_MTU])
mtu = nla_get_u32(tbx[RTAX_MTU]);
}
- rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu,
- rtm->rtm_type);
+ rt->type = rtm->rtm_type;
+ rt->min = daddr_start;
+ rt->max = daddr_start + daddr_extent;
+ rt->mtu = mtu;
+ if (gw.eid) {
+ rt->dst_type = MCTP_ROUTE_GATEWAY;
+ rt->gateway.eid = gw.eid;
+ rt->gateway.net = gw.net;
+ } else {
+ rt->dst_type = MCTP_ROUTE_DIRECT;
+ rt->dev = dev;
+ mctp_dev_hold(rt->dev);
+ }
+
+ return 0;
+}
+
+static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct mctp_route *rt;
+ int rc;
+
+ rt = mctp_route_alloc();
+ if (!rt)
+ return -ENOMEM;
+
+ rc = mctp_route_nlparse_populate(net, nlh, extack, rt);
+ if (rc < 0)
+ goto err_free;
+
+ if (rt->dst_type == MCTP_ROUTE_DIRECT &&
+ rt->dev->dev->flags & IFF_LOOPBACK) {
+ NL_SET_ERR_MSG(extack, "no routes to loopback");
+ rc = -EINVAL;
+ goto err_free;
+ }
+
+ rc = mctp_route_add(net, rt);
+ if (!rc)
+ return 0;
+
+err_free:
+ mctp_route_release(rt);
return rc;
}
static int mctp_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
- struct nlattr *tb[RTA_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ unsigned int netid, daddr_extent;
+ unsigned char type = RTN_UNSPEC;
mctp_eid_t daddr_start;
- struct mctp_dev *mdev;
- struct rtmsg *rtm;
int rc;
- rc = mctp_route_nlparse(skb, nlh, extack, tb,
- &rtm, &mdev, &daddr_start);
+ rc = mctp_route_nlparse_lookup(net, nlh, extack, &type, &netid,
+ &daddr_start, &daddr_extent);
if (rc < 0)
return rc;
/* we only have unicast routes */
- if (rtm->rtm_type != RTN_UNICAST)
+ if (type != RTN_UNICAST)
return -EINVAL;
- rc = mctp_route_remove(mdev, daddr_start, rtm->rtm_dst_len, RTN_UNICAST);
+ rc = mctp_route_remove(net, netid, daddr_start, daddr_extent, type);
return rc;
}
@@ -1405,7 +1649,6 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt,
hdr->rtm_tos = 0;
hdr->rtm_table = RT_TABLE_DEFAULT;
hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */
- hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */
hdr->rtm_type = rt->type;
if (nla_put_u8(skb, RTA_DST, rt->min))
@@ -1422,13 +1665,17 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt,
nla_nest_end(skb, metrics);
- if (rt->dev) {
+ if (rt->dst_type == MCTP_ROUTE_DIRECT) {
+ hdr->rtm_scope = RT_SCOPE_LINK;
if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex))
goto cancel;
+ } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) {
+ hdr->rtm_scope = RT_SCOPE_UNIVERSE;
+ if (nla_put(skb, RTA_GATEWAY,
+ sizeof(rt->gateway), &rt->gateway))
+ goto cancel;
}
- /* TODO: conditional neighbour physaddr? */
-
nlmsg_end(skb, nlh);
return 0;
@@ -1475,7 +1722,7 @@ static int __net_init mctp_routes_net_init(struct net *net)
struct netns_mctp *ns = &net->mctp;
INIT_LIST_HEAD(&ns->routes);
- INIT_HLIST_HEAD(&ns->binds);
+ hash_init(ns->binds);
mutex_init(&ns->bind_lock);
INIT_HLIST_HEAD(&ns->keys);
spin_lock_init(&ns->keys_lock);