summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/bpf.h20
-rw-r--r--include/linux/filter.h19
-rw-r--r--include/net/xdp.h1
-rw-r--r--include/trace/events/xdp.h6
-rw-r--r--include/uapi/linux/bpf.h14
-rw-r--r--kernel/bpf/cpumap.c3
-rw-r--r--kernel/bpf/devmap.c306
-rw-r--r--net/core/filter.c37
-rw-r--r--net/core/xdp.c28
-rw-r--r--net/xdp/xskmap.c3
-rw-r--r--samples/bpf/Makefile3
-rw-r--r--samples/bpf/xdp_redirect_map_multi_kern.c88
-rw-r--r--samples/bpf/xdp_redirect_map_multi_user.c302
-rw-r--r--tools/include/uapi/linux/bpf.h14
-rw-r--r--tools/testing/selftests/bpf/Makefile3
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c94
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_redirect_multi.sh204
-rw-r--r--tools/testing/selftests/bpf/xdp_redirect_multi.c226
18 files changed, 1306 insertions, 65 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1e9a0ff3217b..86dec5001ae2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1501,8 +1501,13 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx);
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
struct bpf_prog *xdp_prog);
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ bool exclude_ingress);
bool dev_map_can_have_prog(struct bpf_map *map);
void __cpu_map_flush(void);
@@ -1670,6 +1675,13 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
return 0;
}
+static inline
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress)
+{
+ return 0;
+}
+
struct sk_buff;
static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
@@ -1679,6 +1691,14 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
return 0;
}
+static inline
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ bool exclude_ingress)
+{
+ return 0;
+}
+
static inline void __cpu_map_flush(void)
{
}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9a09547bc7ba..c5ad7df029ed 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -646,6 +646,7 @@ struct bpf_redirect_info {
u32 flags;
u32 tgt_index;
void *tgt_value;
+ struct bpf_map *map;
u32 map_id;
enum bpf_map_type map_type;
u32 kern_flags;
@@ -1464,17 +1465,19 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
}
#endif /* IS_ENABLED(CONFIG_IPV6) */
-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags,
+static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex,
+ u64 flags, const u64 flag_mask,
void *lookup_elem(struct bpf_map *map, u32 key))
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
/* Lower bits of the flags are used as return code on lookup failure */
- if (unlikely(flags > XDP_TX))
+ if (unlikely(flags & ~(action_mask | flag_mask)))
return XDP_ABORTED;
ri->tgt_value = lookup_elem(map, ifindex);
- if (unlikely(!ri->tgt_value)) {
+ if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) {
/* If the lookup fails we want to clear out the state in the
* redirect_info struct completely, so that if an eBPF program
* performs multiple lookups, the last one always takes
@@ -1482,13 +1485,21 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind
*/
ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
ri->map_type = BPF_MAP_TYPE_UNSPEC;
- return flags;
+ return flags & action_mask;
}
ri->tgt_index = ifindex;
ri->map_id = map->id;
ri->map_type = map->map_type;
+ if (flags & BPF_F_BROADCAST) {
+ WRITE_ONCE(ri->map, map);
+ ri->flags = flags;
+ } else {
+ WRITE_ONCE(ri->map, NULL);
+ ri->flags = 0;
+ }
+
return XDP_REDIRECT;
}
diff --git a/include/net/xdp.h b/include/net/xdp.h
index a5bc214a49d9..5533f0ab2afc 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -170,6 +170,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct net_device *dev);
int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
static inline
void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index fcad3645a70b..c40fc97f9417 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -110,7 +110,11 @@ DECLARE_EVENT_CLASS(xdp_redirect_template,
u32 ifindex = 0, map_index = index;
if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
+ /* Just leave to_ifindex to 0 if do broadcast redirect,
+ * as tgt will be NULL.
+ */
+ if (tgt)
+ ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex;
} else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
ifindex = index;
map_index = 0;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 562adeac1d67..2c1ba70abbf1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2555,8 +2555,12 @@ union bpf_attr {
* The lower two bits of *flags* are used as the return code if
* the map lookup fails. This is so that the return value can be
* one of the XDP program return codes up to **XDP_TX**, as chosen
- * by the caller. Any higher bits in the *flags* argument must be
- * unset.
+ * by the caller. The higher bits of *flags* can be set to
+ * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+ * interface will be excluded when do broadcasting.
*
* See also **bpf_redirect**\ (), which only supports redirecting
* to an ifindex, but doesn't require a map to do so.
@@ -5122,6 +5126,12 @@ enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
+/* Flags for bpf_redirect_map helper */
+enum {
+ BPF_F_BROADCAST = (1ULL << 3),
+ BPF_F_EXCLUDE_INGRESS = (1ULL << 4),
+};
+
#define __bpf_md_ptr(type, name) \
union { \
type name; \
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 5dd3e866599a..a1a0c4e791c6 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -601,7 +601,8 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ __cpu_map_lookup_elem);
}
static int cpu_map_btf_id;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index d60d617ec0d7..f9148daab0e3 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -57,6 +57,7 @@ struct xdp_dev_bulk_queue {
struct list_head flush_node;
struct net_device *dev;
struct net_device *dev_rx;
+ struct bpf_prog *xdp_prog;
unsigned int count;
};
@@ -197,6 +198,7 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
+ bpf_clear_redirect_map(map);
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -326,22 +328,71 @@ bool dev_map_can_have_prog(struct bpf_map *map)
return false;
}
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+ struct xdp_frame **frames, int n,
+ struct net_device *dev)
+{
+ struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (unlikely(err < 0))
+ xdp_return_frame_rx_napi(xdpf);
+ else
+ frames[nframes++] = xdpf;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame_rx_napi(xdpf);
+ break;
+ }
+ }
+ return nframes; /* sent frames count */
+}
+
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
- int sent = 0, err = 0;
+ int sent = 0, drops = 0, err = 0;
+ unsigned int cnt = bq->count;
+ int to_send = cnt;
int i;
- if (unlikely(!bq->count))
+ if (unlikely(!cnt))
return;
- for (i = 0; i < bq->count; i++) {
+ for (i = 0; i < cnt; i++) {
struct xdp_frame *xdpf = bq->q[i];
prefetch(xdpf);
}
- sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+ if (bq->xdp_prog) {
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ if (!to_send)
+ goto out;
+
+ drops = cnt - to_send;
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
if (sent < 0) {
/* If ndo_xdp_xmit fails with an errno, no frames have
* been xmit'ed.
@@ -353,13 +404,13 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
/* If not all frames have been transmitted, it is our
* responsibility to free them
*/
- for (i = sent; unlikely(i < bq->count); i++)
+ for (i = sent; unlikely(i < to_send); i++)
xdp_return_frame_rx_napi(bq->q[i]);
- trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, bq->count - sent, err);
- bq->dev_rx = NULL;
+out:
+ drops = cnt - sent;
bq->count = 0;
- __list_del_clearprev(&bq->flush_node);
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
}
/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
@@ -377,8 +428,12 @@ void __dev_flush(void)
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
- list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
bq_xmit_all(bq, XDP_XMIT_FLUSH);
+ bq->dev_rx = NULL;
+ bq->xdp_prog = NULL;
+ __list_del_clearprev(&bq->flush_node);
+ }
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -401,7 +456,7 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
* Thus, safe percpu variable access.
*/
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
- struct net_device *dev_rx)
+ struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -412,18 +467,22 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
* from net_device drivers NAPI func end.
+ *
+ * Do the same with xdp_prog and flush_list since these fields
+ * are only ever modified together.
*/
- if (!bq->dev_rx)
+ if (!bq->dev_rx) {
bq->dev_rx = dev_rx;
+ bq->xdp_prog = xdp_prog;
+ list_add(&bq->flush_node, flush_list);
+ }
bq->q[bq->count++] = xdpf;
-
- if (!bq->flush_node.prev)
- list_add(&bq->flush_node, flush_list);
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+ struct net_device *dev_rx,
+ struct bpf_prog *xdp_prog)
{
struct xdp_frame *xdpf;
int err;
@@ -439,55 +498,115 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
if (unlikely(!xdpf))
return -EOVERFLOW;
- bq_enqueue(dev, xdpf, dev_rx);
+ bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
-static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
- struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
- struct xdp_txq_info txq = { .dev = dev };
- u32 act;
+ return __xdp_enqueue(dev, xdp, dev_rx, NULL);
+}
- xdp_set_data_meta_invalid(xdp);
- xdp->txq = &txq;
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
+{
+ struct net_device *dev = dst->dev;
- act = bpf_prog_run_xdp(xdp_prog, xdp);
- switch (act) {
- case XDP_PASS:
- return xdp;
- case XDP_DROP:
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- fallthrough;
- case XDP_ABORTED:
- trace_xdp_exception(dev, xdp_prog, act);
- break;
- }
+ return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
+}
- xdp_return_buff(xdp);
- return NULL;
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp,
+ int exclude_ifindex)
+{
+ if (!obj || obj->dev->ifindex == exclude_ifindex ||
+ !obj->dev->netdev_ops->ndo_xdp_xmit)
+ return false;
+
+ if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+ return false;
+
+ return true;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+ struct net_device *dev_rx,
+ struct xdp_frame *xdpf)
{
- return __xdp_enqueue(dev, xdp, dev_rx);
+ struct xdp_frame *nxdpf;
+
+ nxdpf = xdpf_clone(xdpf);
+ if (!nxdpf)
+ return -ENOMEM;
+
+ bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+ return 0;
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress)
{
- struct net_device *dev = dst->dev;
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int exclude_ifindex = exclude_ingress ? dev_rx->ifindex : 0;
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ struct hlist_head *head;
+ struct xdp_frame *xdpf;
+ unsigned int i;
+ int err;
- if (dst->xdp_prog) {
- xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog);
- if (!xdp)
- return 0;
+ xdpf = xdp_convert_buff_to_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = READ_ONCE(dtab->netdev_map[i]);
+ if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_rcu(dst, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock)) {
+ if (!is_valid_dst(dst, xdp, exclude_ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
}
- return __xdp_enqueue(dev, xdp, dev_rx);
+
+ /* consume the last copy of the frame */
+ if (last_dst)
+ bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+ else
+ xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+ return 0;
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -504,6 +623,87 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
return 0;
}
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+ struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *nskb;
+ int err;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+ if (unlikely(err)) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ int exclude_ifindex = exclude_ingress ? dev->ifindex : 0;
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ struct hlist_head *head;
+ struct hlist_node *next;
+ unsigned int i;
+ int err;
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = READ_ONCE(dtab->netdev_map[i]);
+ if (!dst || dst->dev->ifindex == exclude_ifindex)
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ if (!dst || dst->dev->ifindex == exclude_ifindex)
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the first skb and return */
+ if (last_dst)
+ return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+ /* dtab is empty */
+ consume_skb(skb);
+ return 0;
+}
+
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
@@ -730,12 +930,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_lookup_elem);
}
static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_hash_lookup_elem);
}
static int dev_map_btf_id;
diff --git a/net/core/filter.c b/net/core/filter.c
index 582ac196fd94..caa88955562e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3930,6 +3930,23 @@ void xdp_do_flush(void)
}
EXPORT_SYMBOL_GPL(xdp_do_flush);
+void bpf_clear_redirect_map(struct bpf_map *map)
+{
+ struct bpf_redirect_info *ri;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+ /* Avoid polluting remote cacheline due to writes if
+ * not needed. Once we pass this test, we need the
+ * cmpxchg() to make sure it hasn't been changed in
+ * the meantime by remote CPU.
+ */
+ if (unlikely(READ_ONCE(ri->map) == map))
+ cmpxchg(&ri->map, map, NULL);
+ }
+}
+
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
@@ -3937,6 +3954,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
u32 map_id = ri->map_id;
+ struct bpf_map *map;
int err;
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
@@ -3946,7 +3964,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
- err = dev_map_enqueue(fwd, xdp, dev);
+ map = READ_ONCE(ri->map);
+ if (unlikely(map)) {
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_enqueue_multi(xdp, dev, map,
+ ri->flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_enqueue(fwd, xdp, dev);
+ }
break;
case BPF_MAP_TYPE_CPUMAP:
err = cpu_map_enqueue(fwd, xdp, dev);
@@ -3988,13 +4013,21 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
enum bpf_map_type map_type, u32 map_id)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_map *map;
int err;
switch (map_type) {
case BPF_MAP_TYPE_DEVMAP:
fallthrough;
case BPF_MAP_TYPE_DEVMAP_HASH:
- err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+ map = READ_ONCE(ri->map);
+ if (unlikely(map)) {
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+ ri->flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+ }
if (unlikely(err))
goto err;
break;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 858276e72c68..725d20f1b100 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -584,3 +584,31 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
return __xdp_build_skb_from_frame(xdpf, skb, dev);
}
EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+ unsigned int headroom, totalsize;
+ struct xdp_frame *nxdpf;
+ struct page *page;
+ void *addr;
+
+ headroom = xdpf->headroom + sizeof(*xdpf);
+ totalsize = headroom + xdpf->len;
+
+ if (unlikely(totalsize > PAGE_SIZE))
+ return NULL;
+ page = dev_alloc_page();
+ if (!page)
+ return NULL;
+ addr = page_to_virt(page);
+
+ memcpy(addr, xdpf, totalsize);
+
+ nxdpf = addr;
+ nxdpf->data = addr + headroom;
+ nxdpf->frame_sz = PAGE_SIZE;
+ nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+ nxdpf->mem.id = 0;
+
+ return nxdpf;
+}
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 67b4ce504852..9df75ea4a567 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -226,7 +226,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem);
+ return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ __xsk_map_lookup_elem);
}
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 45ceca4e2c70..520434ea966f 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -41,6 +41,7 @@ tprogs-y += test_map_in_map
tprogs-y += per_socket_stats_example
tprogs-y += xdp_redirect
tprogs-y += xdp_redirect_map
+tprogs-y += xdp_redirect_map_multi
tprogs-y += xdp_redirect_cpu
tprogs-y += xdp_monitor
tprogs-y += xdp_rxq_info
@@ -99,6 +100,7 @@ test_map_in_map-objs := test_map_in_map_user.o
per_socket_stats_example-objs := cookie_uid_helper_example.o
xdp_redirect-objs := xdp_redirect_user.o
xdp_redirect_map-objs := xdp_redirect_map_user.o
+xdp_redirect_map_multi-objs := xdp_redirect_map_multi_user.o
xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o
xdp_monitor-objs := xdp_monitor_user.o
xdp_rxq_info-objs := xdp_rxq_info_user.o
@@ -160,6 +162,7 @@ always-y += tcp_tos_reflect_kern.o
always-y += tcp_dumpstats_kern.o
always-y += xdp_redirect_kern.o
always-y += xdp_redirect_map_kern.o
+always-y += xdp_redirect_map_multi_kern.o
always-y += xdp_redirect_cpu_kern.o
always-y += xdp_monitor_kern.o
always-y += xdp_rxq_info_kern.o
diff --git a/samples/bpf/xdp_redirect_map_multi_kern.c b/samples/bpf/xdp_redirect_map_multi_kern.c
new file mode 100644
index 000000000000..71aa23d1cb2b
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_multi_kern.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __uint(max_entries, 32);
+} forward_map_general SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(struct bpf_devmap_val));
+ __uint(max_entries, 32);
+} forward_map_native SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, u32);
+ __type(value, long);
+ __uint(max_entries, 1);
+} rxcnt SEC(".maps");
+
+/* map to store egress interfaces mac addresses, set the
+ * max_entries to 1 and extend it in user sapce prog.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, u32);
+ __type(value, __be64);
+ __uint(max_entries, 1);
+} mac_map SEC(".maps");
+
+static int xdp_redirect_map(struct xdp_md *ctx, void *forward_map)
+{
+ long *value;
+ u32 key = 0;
+
+ /* count packet in global counter */
+ value = bpf_map_lookup_elem(&rxcnt, &key);
+ if (value)
+ *value += 1;
+
+ return bpf_redirect_map(forward_map, key,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
+}
+
+SEC("xdp_redirect_general")
+int xdp_redirect_map_general(struct xdp_md *ctx)
+{
+ return xdp_redirect_map(ctx, &forward_map_general);
+}
+
+SEC("xdp_redirect_native")
+int xdp_redirect_map_native(struct xdp_md *ctx)
+{
+ return xdp_redirect_map(ctx, &forward_map_native);
+}
+
+SEC("xdp_devmap/map_prog")
+int xdp_devmap_prog(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ u32 key = ctx->egress_ifindex;
+ struct ethhdr *eth = data;
+ __be64 *mac;
+ u64 nh_off;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return XDP_DROP;
+
+ mac = bpf_map_lookup_elem(&mac_map, &key);
+ if (mac)
+ __builtin_memcpy(eth->h_source, mac, ETH_ALEN);
+
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_redirect_map_multi_user.c b/samples/bpf/xdp_redirect_map_multi_user.c
new file mode 100644
index 000000000000..84cdbbed20b7
--- /dev/null
+++ b/samples/bpf/xdp_redirect_map_multi_user.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/if.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/resource.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#define MAX_IFACE_NUM 32
+
+static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+static int ifaces[MAX_IFACE_NUM] = {};
+static int rxcnt_map_fd;
+
+static void int_exit(int sig)
+{
+ __u32 prog_id = 0;
+ int i;
+
+ for (i = 0; ifaces[i] > 0; i++) {
+ if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) {
+ printf("bpf_get_link_xdp_id failed\n");
+ exit(1);
+ }
+ if (prog_id)
+ bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags);
+ }
+
+ exit(0);
+}
+
+static void poll_stats(int interval)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ __u64 values[nr_cpus], prev[nr_cpus];
+
+ memset(prev, 0, sizeof(prev));
+
+ while (1) {
+ __u64 sum = 0;
+ __u32 key = 0;
+ int i;
+
+ sleep(interval);
+ assert(bpf_map_lookup_elem(rxcnt_map_fd, &key, values) == 0);
+ for (i = 0; i < nr_cpus; i++)
+ sum += (values[i] - prev[i]);
+ if (sum)
+ printf("Forwarding %10llu pkt/s\n", sum / interval);
+ memcpy(prev, values, sizeof(values));
+ }
+}
+
+static int get_mac_addr(unsigned int ifindex, void *mac_addr)
+{
+ char ifname[IF_NAMESIZE];
+ struct ifreq ifr;
+ int fd, ret = -1;
+
+ fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (fd < 0)
+ return ret;
+
+ if (!if_indextoname(ifindex, ifname))
+ goto err_out;
+
+ strcpy(ifr.ifr_name, ifname);
+
+ if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
+ goto err_out;
+
+ memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
+ ret = 0;
+
+err_out:
+ close(fd);
+ return ret;
+}
+
+static int update_mac_map(struct bpf_object *obj)
+{
+ int i, ret = -1, mac_map_fd;
+ unsigned char mac_addr[6];
+ unsigned int ifindex;
+
+ mac_map_fd = bpf_object__find_map_fd_by_name(obj, "mac_map");
+ if (mac_map_fd < 0) {
+ printf("find mac map fd failed\n");
+ return ret;
+ }
+
+ for (i = 0; ifaces[i] > 0; i++) {
+ ifindex = ifaces[i];
+
+ ret = get_mac_addr(ifindex, mac_addr);
+ if (ret < 0) {
+ printf("get interface %d mac failed\n", ifindex);
+ return ret;
+ }
+
+ ret = bpf_map_update_elem(mac_map_fd, &ifindex, mac_addr, 0);
+ if (ret) {
+ perror("bpf_update_elem mac_map_fd");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void usage(const char *prog)
+{
+ fprintf(stderr,
+ "usage: %s [OPTS] <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n"
+ "OPTS:\n"
+ " -S use skb-mode\n"
+ " -N enforce native mode\n"
+ " -F force loading prog\n"
+ " -X load xdp program on egress\n",
+ prog);
+}
+
+int main(int argc, char **argv)
+{
+ int i, ret, opt, forward_map_fd, max_ifindex = 0;
+ struct bpf_program *ingress_prog, *egress_prog;
+ int ingress_prog_fd, egress_prog_fd = 0;
+ struct bpf_devmap_val devmap_val;
+ bool attach_egress_prog = false;
+ char ifname[IF_NAMESIZE];
+ struct bpf_map *mac_map;
+ struct bpf_object *obj;
+ unsigned int ifindex;
+ char filename[256];
+
+ while ((opt = getopt(argc, argv, "SNFX")) != -1) {
+ switch (opt) {
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'N':
+ /* default, set below */
+ break;
+ case 'F':
+ xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+ break;
+ case 'X':
+ attach_egress_prog = true;
+ break;
+ default:
+ usage(basename(argv[0]));
+ return 1;
+ }
+ }
+
+ if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) {
+ xdp_flags |= XDP_FLAGS_DRV_MODE;
+ } else if (attach_egress_prog) {
+ printf("Load xdp program on egress with SKB mode not supported yet\n");
+ return 1;
+ }
+
+ if (optind == argc) {
+ printf("usage: %s <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n", argv[0]);
+ return 1;
+ }
+
+ printf("Get interfaces");
+ for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) {
+ ifaces[i] = if_nametoindex(argv[optind + i]);
+ if (!ifaces[i])
+ ifaces[i] = strtoul(argv[optind + i], NULL, 0);
+ if (!if_indextoname(ifaces[i], ifname)) {
+ perror("Invalid interface name or i");
+ return 1;
+ }
+
+ /* Find the largest index number */
+ if (ifaces[i] > max_ifindex)
+ max_ifindex = ifaces[i];
+
+ printf(" %d", ifaces[i]);
+ }
+ printf("\n");
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ obj = bpf_object__open(filename);
+ if (libbpf_get_error(obj)) {
+ printf("ERROR: opening BPF object file failed\n");
+ obj = NULL;
+ goto err_out;
+ }
+
+ /* Reset the map size to max ifindex + 1 */
+ if (attach_egress_prog) {
+ mac_map = bpf_object__find_map_by_name(obj, "mac_map");
+ ret = bpf_map__resize(mac_map, max_ifindex + 1);
+ if (ret < 0) {
+ printf("ERROR: reset mac map size failed\n");
+ goto err_out;
+ }
+ }
+
+ /* load BPF program */
+ if (bpf_object__load(obj)) {
+ printf("ERROR: loading BPF object file failed\n");
+ goto err_out;
+ }
+
+ if (xdp_flags & XDP_FLAGS_SKB_MODE) {
+ ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_general");
+ forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_general");
+ } else {
+ ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_native");
+ forward_map_fd = bpf_object__find_map_fd_by_name(obj, "forward_map_native");
+ }
+ if (!ingress_prog || forward_map_fd < 0) {
+ printf("finding ingress_prog/forward_map in obj file failed\n");
+ goto err_out;
+ }
+
+ ingress_prog_fd = bpf_program__fd(ingress_prog);
+ if (ingress_prog_fd < 0) {
+ printf("find ingress_prog fd failed\n");
+ goto err_out;
+ }
+
+ rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
+ if (rxcnt_map_fd < 0) {
+ printf("bpf_object__find_map_fd_by_name failed\n");
+ goto err_out;
+ }
+
+ if (attach_egress_prog) {
+ /* Update mac_map with all egress interfaces' mac addr */
+ if (update_mac_map(obj) < 0) {
+ printf("Error: update mac map failed");
+ goto err_out;
+ }
+
+ /* Find egress prog fd */
+ egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog");
+ if (!egress_prog) {
+ printf("finding egress_prog in obj file failed\n");
+ goto err_out;
+ }
+ egress_prog_fd = bpf_program__fd(egress_prog);
+ if (egress_prog_fd < 0) {
+ printf("find egress_prog fd failed\n");
+ goto err_out;
+ }
+ }
+
+ /* Remove attached program when program is interrupted or killed */
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ /* Init forward multicast groups */
+ for (i = 0; ifaces[i] > 0; i++) {
+ ifindex = ifaces[i];
+
+ /* bind prog_fd to each interface */
+ ret = bpf_set_link_xdp_fd(ifindex, ingress_prog_fd, xdp_flags);
+ if (ret) {
+ printf("Set xdp fd failed on %d\n", ifindex);
+ goto err_out;
+ }
+
+ /* Add all the interfaces to forward group and attach
+ * egress devmap programe if exist
+ */
+ devmap_val.ifindex = ifindex;
+ devmap_val.bpf_prog.fd = egress_prog_fd;
+ ret = bpf_map_update_elem(forward_map_fd, &ifindex, &devmap_val, 0);
+ if (ret) {
+ perror("bpf_map_update_elem forward_map");
+ goto err_out;
+ }
+ }
+
+ poll_stats(2);
+
+ return 0;
+
+err_out:
+ return 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 562adeac1d67..2c1ba70abbf1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2555,8 +2555,12 @@ union bpf_attr {
* The lower two bits of *flags* are used as the return code if
* the map lookup fails. This is so that the return value can be
* one of the XDP program return codes up to **XDP_TX**, as chosen
- * by the caller. Any higher bits in the *flags* argument must be
- * unset.
+ * by the caller. The higher bits of *flags* can be set to
+ * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+ * interface will be excluded when do broadcasting.
*
* See also **bpf_redirect**\ (), which only supports redirecting
* to an ifindex, but doesn't require a map to do so.
@@ -5122,6 +5126,12 @@ enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};
+/* Flags for bpf_redirect_map helper */
+enum {
+ BPF_F_BROADCAST = (1ULL << 3),
+ BPF_F_EXCLUDE_INGRESS = (1ULL << 4),
+};
+
#define __bpf_md_ptr(type, name) \
union { \
type name; \
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 525e4b3fb514..f405b20c1e6c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -54,6 +54,7 @@ TEST_FILES = xsk_prereqs.sh \
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
test_xdp_redirect.sh \
+ test_xdp_redirect_multi.sh \
test_xdp_meta.sh \
test_xdp_veth.sh \
test_offload.py \
@@ -84,7 +85,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
- xdpxceiver
+ xdpxceiver xdp_redirect_multi
TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c b/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c
new file mode 100644
index 000000000000..880debcbcd65
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_redirect_multi_kern.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+#define KBUILD_MODNAME "foo"
+#include <string.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* One map use devmap, another one use devmap_hash for testing */
+struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __uint(max_entries, 1024);
+} map_all SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(struct bpf_devmap_val));
+ __uint(max_entries, 128);
+} map_egress SEC(".maps");
+
+/* map to store egress interfaces mac addresses */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, __u32);
+ __type(value, __be64);
+ __uint(max_entries, 128);
+} mac_map SEC(".maps");
+
+SEC("xdp_redirect_map_multi")
+int xdp_redirect_map_multi_prog(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ int if_index = ctx->ingress_ifindex;
+ struct ethhdr *eth = data;
+ __u16 h_proto;
+ __u64 nh_off;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return XDP_DROP;
+
+ h_proto = eth->h_proto;
+
+ /* Using IPv4 for (BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) testing */
+ if (h_proto == bpf_htons(ETH_P_IP))
+ return bpf_redirect_map(&map_all, 0,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
+ /* Using IPv6 for none flag testing */
+ else if (h_proto == bpf_htons(ETH_P_IPV6))
+ return bpf_redirect_map(&map_all, if_index, 0);
+ /* All others for BPF_F_BROADCAST testing */
+ else
+ return bpf_redirect_map(&map_all, 0, BPF_F_BROADCAST);
+}
+
+/* The following 2 progs are for 2nd devmap prog testing */
+SEC("xdp_redirect_map_ingress")
+int xdp_redirect_map_all_prog(struct xdp_md *ctx)
+{
+ return bpf_redirect_map(&map_egress, 0,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
+}
+
+SEC("xdp_devmap/map_prog")
+int xdp_devmap_prog(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ __u32 key = ctx->egress_ifindex;
+ struct ethhdr *eth = data;
+ __u64 nh_off;
+ __be64 *mac;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return XDP_DROP;
+
+ mac = bpf_map_lookup_elem(&mac_map, &key);
+ if (mac)
+ __builtin_memcpy(eth->h_source, mac, ETH_ALEN);
+
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
new file mode 100755
index 000000000000..1538373157e3
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_redirect_multi.sh
@@ -0,0 +1,204 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test topology:
+# - - - - - - - - - - - - - - - - - - - - - - - - -
+# | veth1 veth2 veth3 | ... init net
+# - -| - - - - - - | - - - - - - | - -
+# --------- --------- ---------
+# | veth0 | | veth0 | | veth0 | ...
+# --------- --------- ---------
+# ns1 ns2 ns3
+#
+# Test modules:
+# XDP modes: generic, native, native + egress_prog
+#
+# Test cases:
+# ARP: Testing BPF_F_BROADCAST, the ingress interface also should receive
+# the redirects.
+# ns1 -> gw: ns1, ns2, ns3, should receive the arp request
+# IPv4: Testing BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, the ingress
+# interface should not receive the redirects.
+# ns1 -> gw: ns1 should not receive, ns2, ns3 should receive redirects.
+# IPv6: Testing none flag, all the pkts should be redirected back
+# ping test: ns1 -> ns2 (block), echo requests will be redirect back
+# egress_prog:
+# all src mac should be egress interface's mac
+
+# netns numbers
+NUM=3
+IFACES=""
+DRV_MODE="xdpgeneric xdpdrv xdpegress"
+PASS=0
+FAIL=0
+
+test_pass()
+{
+ echo "Pass: $@"
+ PASS=$((PASS + 1))
+}
+
+test_fail()
+{
+ echo "fail: $@"
+ FAIL=$((FAIL + 1))
+}
+
+clean_up()
+{
+ for i in $(seq $NUM); do
+ ip link del veth$i 2> /dev/null
+ ip netns del ns$i 2> /dev/null
+ done
+}
+
+# Kselftest framework requirement - SKIP code is 4.
+check_env()
+{
+ ip link set dev lo xdpgeneric off &>/dev/null
+ if [ $? -ne 0 ];then
+ echo "selftests: [SKIP] Could not run test without the ip xdpgeneric support"
+ exit 4
+ fi
+
+ which tcpdump &>/dev/null
+ if [ $? -ne 0 ];then
+ echo "selftests: [SKIP] Could not run test without tcpdump"
+ exit 4
+ fi
+}
+
+setup_ns()
+{
+ local mode=$1
+ IFACES=""
+
+ if [ "$mode" = "xdpegress" ]; then
+ mode="xdpdrv"
+ fi
+
+ for i in $(seq $NUM); do
+ ip netns add ns$i
+ ip link add veth$i type veth peer name veth0 netns ns$i
+ ip link set veth$i up
+ ip -n ns$i link set veth0 up
+
+ ip -n ns$i addr add 192.0.2.$i/24 dev veth0
+ ip -n ns$i addr add 2001:db8::$i/64 dev veth0
+ # Add a neigh entry for IPv4 ping test
+ ip -n ns$i neigh add 192.0.2.253 lladdr 00:00:00:00:00:01 dev veth0
+ ip -n ns$i link set veth0 $mode obj \
+ xdp_dummy.o sec xdp_dummy &> /dev/null || \
+ { test_fail "Unable to load dummy xdp" && exit 1; }
+ IFACES="$IFACES veth$i"
+ veth_mac[$i]=$(ip link show veth$i | awk '/link\/ether/ {print $2}')
+ done
+}
+
+do_egress_tests()
+{
+ local mode=$1
+
+ # mac test
+ ip netns exec ns2 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-2_${mode}.log &
+ ip netns exec ns3 tcpdump -e -i veth0 -nn -l -e &> mac_ns1-3_${mode}.log &
+ sleep 0.5
+ ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null
+ sleep 0.5
+ pkill -9 tcpdump
+
+ # mac check
+ grep -q "${veth_mac[2]} > ff:ff:ff:ff:ff:ff" mac_ns1-2_${mode}.log && \
+ test_pass "$mode mac ns1-2" || test_fail "$mode mac ns1-2"
+ grep -q "${veth_mac[3]} > ff:ff:ff:ff:ff:ff" mac_ns1-3_${mode}.log && \
+ test_pass "$mode mac ns1-3" || test_fail "$mode mac ns1-3"
+}
+
+do_ping_tests()
+{
+ local mode=$1
+
+ # ping6 test: echo request should be redirect back to itself, not others
+ ip netns exec ns1 ip neigh add 2001:db8::2 dev veth0 lladdr 00:00:00:00:00:02
+
+ ip netns exec ns1 tcpdump -i veth0 -nn -l -e &> ns1-1_${mode}.log &
+ ip netns exec ns2 tcpdump -i veth0 -nn -l -e &> ns1-2_${mode}.log &
+ ip netns exec ns3 tcpdump -i veth0 -nn -l -e &> ns1-3_${mode}.log &
+ sleep 0.5
+ # ARP test
+ ip netns exec ns1 ping 192.0.2.254 -i 0.1 -c 4 &> /dev/null
+ # IPv4 test
+ ip netns exec ns1 ping 192.0.2.253 -i 0.1 -c 4 &> /dev/null
+ # IPv6 test
+ ip netns exec ns1 ping6 2001:db8::2 -i 0.1 -c 2 &> /dev/null
+ sleep 0.5
+ pkill -9 tcpdump
+
+ # All netns should receive the redirect arp requests
+ [ $(grep -c "who-has 192.0.2.254" ns1-1_${mode}.log) -gt 4 ] && \
+ test_pass "$mode arp(F_BROADCAST) ns1-1" || \
+ test_fail "$mode arp(F_BROADCAST) ns1-1"
+ [ $(grep -c "who-has 192.0.2.254" ns1-2_${mode}.log) -le 4 ] && \
+ test_pass "$mode arp(F_BROADCAST) ns1-2" || \
+ test_fail "$mode arp(F_BROADCAST) ns1-2"
+ [ $(grep -c "who-has 192.0.2.254" ns1-3_${mode}.log) -le 4 ] && \
+ test_pass "$mode arp(F_BROADCAST) ns1-3" || \
+ test_fail "$mode arp(F_BROADCAST) ns1-3"
+
+ # ns1 should not receive the redirect echo request, others should
+ [ $(grep -c "ICMP echo request" ns1-1_${mode}.log) -eq 4 ] && \
+ test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1" || \
+ test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-1"
+ [ $(grep -c "ICMP echo request" ns1-2_${mode}.log) -eq 4 ] && \
+ test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2" || \
+ test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-2"
+ [ $(grep -c "ICMP echo request" ns1-3_${mode}.log) -eq 4 ] && \
+ test_pass "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3" || \
+ test_fail "$mode IPv4 (F_BROADCAST|F_EXCLUDE_INGRESS) ns1-3"
+
+ # ns1 should receive the echo request, ns2 should not
+ [ $(grep -c "ICMP6, echo request" ns1-1_${mode}.log) -eq 4 ] && \
+ test_pass "$mode IPv6 (no flags) ns1-1" || \
+ test_fail "$mode IPv6 (no flags) ns1-1"
+ [ $(grep -c "ICMP6, echo request" ns1-2_${mode}.log) -eq 0 ] && \
+ test_pass "$mode IPv6 (no flags) ns1-2" || \
+ test_fail "$mode IPv6 (no flags) ns1-2"
+}
+
+do_tests()
+{
+ local mode=$1
+ local drv_p
+
+ case ${mode} in
+ xdpdrv) drv_p="-N";;
+ xdpegress) drv_p="-X";;
+ xdpgeneric) drv_p="-S";;
+ esac
+
+ ./xdp_redirect_multi $drv_p $IFACES &> xdp_redirect_${mode}.log &
+ xdp_pid=$!
+ sleep 1
+
+ if [ "$mode" = "xdpegress" ]; then
+ do_egress_tests $mode
+ else
+ do_ping_tests $mode
+ fi
+
+ kill $xdp_pid
+}
+
+trap clean_up 0 2 3 6 9
+
+check_env
+rm -f xdp_redirect_*.log ns*.log mac_ns*.log
+
+for mode in ${DRV_MODE}; do
+ setup_ns $mode
+ do_tests $mode
+ clean_up
+done
+
+echo "Summary: PASS $PASS, FAIL $FAIL"
+[ $FAIL -eq 0 ] && exit 0 || exit 1
diff --git a/tools/testing/selftests/bpf/xdp_redirect_multi.c b/tools/testing/selftests/bpf/xdp_redirect_multi.c
new file mode 100644
index 000000000000..3696a8f32c23
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdp_redirect_multi.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <net/if.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/resource.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#define MAX_IFACE_NUM 32
+#define MAX_INDEX_NUM 1024
+
+static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+static int ifaces[MAX_IFACE_NUM] = {};
+
+static void int_exit(int sig)
+{
+ __u32 prog_id = 0;
+ int i;
+
+ for (i = 0; ifaces[i] > 0; i++) {
+ if (bpf_get_link_xdp_id(ifaces[i], &prog_id, xdp_flags)) {
+ printf("bpf_get_link_xdp_id failed\n");
+ exit(1);
+ }
+ if (prog_id)
+ bpf_set_link_xdp_fd(ifaces[i], -1, xdp_flags);
+ }
+
+ exit(0);
+}
+
+static int get_mac_addr(unsigned int ifindex, void *mac_addr)
+{
+ char ifname[IF_NAMESIZE];
+ struct ifreq ifr;
+ int fd, ret = -1;
+
+ fd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (fd < 0)
+ return ret;
+
+ if (!if_indextoname(ifindex, ifname))
+ goto err_out;
+
+ strcpy(ifr.ifr_name, ifname);
+
+ if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
+ goto err_out;
+
+ memcpy(mac_addr, ifr.ifr_hwaddr.sa_data, 6 * sizeof(char));
+ ret = 0;
+
+err_out:
+ close(fd);
+ return ret;
+}
+
+static void usage(const char *prog)
+{
+ fprintf(stderr,
+ "usage: %s [OPTS] <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n"
+ "OPTS:\n"
+ " -S use skb-mode\n"
+ " -N enforce native mode\n"
+ " -F force loading prog\n"
+ " -X load xdp program on egress\n",
+ prog);
+}
+
+int main(int argc, char **argv)
+{
+ int prog_fd, group_all, mac_map;
+ struct bpf_program *ingress_prog, *egress_prog;
+ struct bpf_prog_load_attr prog_load_attr = {
+ .prog_type = BPF_PROG_TYPE_UNSPEC,
+ };
+ int i, ret, opt, egress_prog_fd = 0;
+ struct bpf_devmap_val devmap_val;
+ bool attach_egress_prog = false;
+ unsigned char mac_addr[6];
+ char ifname[IF_NAMESIZE];
+ struct bpf_object *obj;
+ unsigned int ifindex;
+ char filename[256];
+
+ while ((opt = getopt(argc, argv, "SNFX")) != -1) {
+ switch (opt) {
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'N':
+ /* default, set below */
+ break;
+ case 'F':
+ xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
+ break;
+ case 'X':
+ attach_egress_prog = true;
+ break;
+ default:
+ usage(basename(argv[0]));
+ return 1;
+ }
+ }
+
+ if (!(xdp_flags & XDP_FLAGS_SKB_MODE)) {
+ xdp_flags |= XDP_FLAGS_DRV_MODE;
+ } else if (attach_egress_prog) {
+ printf("Load xdp program on egress with SKB mode not supported yet\n");
+ goto err_out;
+ }
+
+ if (optind == argc) {
+ printf("usage: %s <IFNAME|IFINDEX> <IFNAME|IFINDEX> ...\n", argv[0]);
+ goto err_out;
+ }
+
+ printf("Get interfaces");
+ for (i = 0; i < MAX_IFACE_NUM && argv[optind + i]; i++) {
+ ifaces[i] = if_nametoindex(argv[optind + i]);
+ if (!ifaces[i])
+ ifaces[i] = strtoul(argv[optind + i], NULL, 0);
+ if (!if_indextoname(ifaces[i], ifname)) {
+ perror("Invalid interface name or i");
+ goto err_out;
+ }
+ if (ifaces[i] > MAX_INDEX_NUM) {
+ printf("Interface index to large\n");
+ goto err_out;
+ }
+ printf(" %d", ifaces[i]);
+ }
+ printf("\n");
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ prog_load_attr.file = filename;
+
+ if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+ goto err_out;
+
+ if (attach_egress_prog)
+ group_all = bpf_object__find_map_fd_by_name(obj, "map_egress");
+ else
+ group_all = bpf_object__find_map_fd_by_name(obj, "map_all");
+ mac_map = bpf_object__find_map_fd_by_name(obj, "mac_map");
+
+ if (group_all < 0 || mac_map < 0) {
+ printf("bpf_object__find_map_fd_by_name failed\n");
+ goto err_out;
+ }
+
+ if (attach_egress_prog) {
+ /* Find ingress/egress prog for 2nd xdp prog */
+ ingress_prog = bpf_object__find_program_by_name(obj, "xdp_redirect_map_all_prog");
+ egress_prog = bpf_object__find_program_by_name(obj, "xdp_devmap_prog");
+ if (!ingress_prog || !egress_prog) {
+ printf("finding ingress/egress_prog in obj file failed\n");
+ goto err_out;
+ }
+ prog_fd = bpf_program__fd(ingress_prog);
+ egress_prog_fd = bpf_program__fd(egress_prog);
+ if (prog_fd < 0 || egress_prog_fd < 0) {
+ printf("find egress_prog fd failed\n");
+ goto err_out;
+ }
+ }
+
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ /* Init forward multicast groups and exclude group */
+ for (i = 0; ifaces[i] > 0; i++) {
+ ifindex = ifaces[i];
+
+ if (attach_egress_prog) {
+ ret = get_mac_addr(ifindex, mac_addr);
+ if (ret < 0) {
+ printf("get interface %d mac failed\n", ifindex);
+ goto err_out;
+ }
+ ret = bpf_map_update_elem(mac_map, &ifindex, mac_addr, 0);
+ if (ret) {
+ perror("bpf_update_elem mac_map failed\n");
+ goto err_out;
+ }
+ }
+
+ /* Add all the interfaces to group all */
+ devmap_val.ifindex = ifindex;
+ devmap_val.bpf_prog.fd = egress_prog_fd;
+ ret = bpf_map_update_elem(group_all, &ifindex, &devmap_val, 0);
+ if (ret) {
+ perror("bpf_map_update_elem");
+ goto err_out;
+ }
+
+ /* bind prog_fd to each interface */
+ ret = bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags);
+ if (ret) {
+ printf("Set xdp fd failed on %d\n", ifindex);
+ goto err_out;
+ }
+ }
+
+ /* sleep some time for testing */
+ sleep(999);
+
+ return 0;
+
+err_out:
+ return 1;
+}