Age | Commit message (Collapse) | Author |
|
Last major reorg happened in commit 9115e8cd2a0c ("net: reorganize
struct sock for better data locality")
Since then, many changes have been done.
Before SO_PEEK_OFF support is added to TCP, we need
to move sk_peek_off to a better location.
It is time to make another pass, and add six groups,
without explicit alignment.
- sock_write_rx (following sk_refcnt) read-write fields in rx path.
- sock_read_rx read-mostly fields in rx path.
- sock_read_rxtx read-mostly fields in both rx and tx paths.
- sock_write_rxtx read-write fields in both rx and tx paths.
- sock_write_tx read-write fields in tx paths.
- sock_read_tx read-mostly fields in tx paths.
Results on TCP_RR benchmarks seem to show a gain (4 to 5 %).
It is possible UDP needs a change, because sk_peek_off
shares a cache line with sk_receive_queue.
If this the case, we can exchange roles of sk->sk_receive
and up->reader_queue queues.
After this change, we have the following layout:
struct sock {
struct sock_common __sk_common; /* 0 0x88 */
/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
__u8 __cacheline_group_begin__sock_write_rx[0]; /* 0x88 0 */
atomic_t sk_drops; /* 0x88 0x4 */
__s32 sk_peek_off; /* 0x8c 0x4 */
struct sk_buff_head sk_error_queue; /* 0x90 0x18 */
struct sk_buff_head sk_receive_queue; /* 0xa8 0x18 */
/* --- cacheline 3 boundary (192 bytes) --- */
struct {
atomic_t rmem_alloc; /* 0xc0 0x4 */
int len; /* 0xc4 0x4 */
struct sk_buff * head; /* 0xc8 0x8 */
struct sk_buff * tail; /* 0xd0 0x8 */
} sk_backlog; /* 0xc0 0x18 */
struct {
atomic_t rmem_alloc; /* 0 0x4 */
int len; /* 0x4 0x4 */
struct sk_buff * head; /* 0x8 0x8 */
struct sk_buff * tail; /* 0x10 0x8 */
/* size: 24, cachelines: 1, members: 4 */
/* last cacheline: 24 bytes */
};
__u8 __cacheline_group_end__sock_write_rx[0]; /* 0xd8 0 */
__u8 __cacheline_group_begin__sock_read_rx[0]; /* 0xd8 0 */
rcu * sk_rx_dst; /* 0xd8 0x8 */
int sk_rx_dst_ifindex; /* 0xe0 0x4 */
u32 sk_rx_dst_cookie; /* 0xe4 0x4 */
unsigned int sk_ll_usec; /* 0xe8 0x4 */
unsigned int sk_napi_id; /* 0xec 0x4 */
u16 sk_busy_poll_budget; /* 0xf0 0x2 */
u8 sk_prefer_busy_poll; /* 0xf2 0x1 */
u8 sk_userlocks; /* 0xf3 0x1 */
int sk_rcvbuf; /* 0xf4 0x4 */
rcu * sk_filter; /* 0xf8 0x8 */
/* --- cacheline 4 boundary (256 bytes) --- */
union {
rcu * sk_wq; /* 0x100 0x8 */
struct socket_wq * sk_wq_raw; /* 0x100 0x8 */
}; /* 0x100 0x8 */
union {
rcu * sk_wq; /* 0 0x8 */
struct socket_wq * sk_wq_raw; /* 0 0x8 */
};
void (*sk_data_ready)(struct sock *); /* 0x108 0x8 */
long sk_rcvtimeo; /* 0x110 0x8 */
int sk_rcvlowat; /* 0x118 0x4 */
__u8 __cacheline_group_end__sock_read_rx[0]; /* 0x11c 0 */
__u8 __cacheline_group_begin__sock_read_rxtx[0]; /* 0x11c 0 */
int sk_err; /* 0x11c 0x4 */
struct socket * sk_socket; /* 0x120 0x8 */
struct mem_cgroup * sk_memcg; /* 0x128 0x8 */
rcu * sk_policy[2]; /* 0x130 0x10 */
/* --- cacheline 5 boundary (320 bytes) --- */
__u8 __cacheline_group_end__sock_read_rxtx[0]; /* 0x140 0 */
__u8 __cacheline_group_begin__sock_write_rxtx[0]; /* 0x140 0 */
socket_lock_t sk_lock; /* 0x140 0x20 */
u32 sk_reserved_mem; /* 0x160 0x4 */
int sk_forward_alloc; /* 0x164 0x4 */
u32 sk_tsflags; /* 0x168 0x4 */
__u8 __cacheline_group_end__sock_write_rxtx[0]; /* 0x16c 0 */
__u8 __cacheline_group_begin__sock_write_tx[0]; /* 0x16c 0 */
int sk_write_pending; /* 0x16c 0x4 */
atomic_t sk_omem_alloc; /* 0x170 0x4 */
int sk_sndbuf; /* 0x174 0x4 */
int sk_wmem_queued; /* 0x178 0x4 */
refcount_t sk_wmem_alloc; /* 0x17c 0x4 */
/* --- cacheline 6 boundary (384 bytes) --- */
unsigned long sk_tsq_flags; /* 0x180 0x8 */
union {
struct sk_buff * sk_send_head; /* 0x188 0x8 */
struct rb_root tcp_rtx_queue; /* 0x188 0x8 */
}; /* 0x188 0x8 */
union {
struct sk_buff * sk_send_head; /* 0 0x8 */
struct rb_root tcp_rtx_queue; /* 0 0x8 */
};
struct sk_buff_head sk_write_queue; /* 0x190 0x18 */
u32 sk_dst_pending_confirm; /* 0x1a8 0x4 */
u32 sk_pacing_status; /* 0x1ac 0x4 */
struct page_frag sk_frag; /* 0x1b0 0x10 */
/* --- cacheline 7 boundary (448 bytes) --- */
struct timer_list sk_timer; /* 0x1c0 0x28 */
/* XXX last struct has 4 bytes of padding */
unsigned long sk_pacing_rate; /* 0x1e8 0x8 */
atomic_t sk_zckey; /* 0x1f0 0x4 */
atomic_t sk_tskey; /* 0x1f4 0x4 */
__u8 __cacheline_group_end__sock_write_tx[0]; /* 0x1f8 0 */
__u8 __cacheline_group_begin__sock_read_tx[0]; /* 0x1f8 0 */
unsigned long sk_max_pacing_rate; /* 0x1f8 0x8 */
/* --- cacheline 8 boundary (512 bytes) --- */
long sk_sndtimeo; /* 0x200 0x8 */
u32 sk_priority; /* 0x208 0x4 */
u32 sk_mark; /* 0x20c 0x4 */
rcu * sk_dst_cache; /* 0x210 0x8 */
netdev_features_t sk_route_caps; /* 0x218 0x8 */
u16 sk_gso_type; /* 0x220 0x2 */
u16 sk_gso_max_segs; /* 0x222 0x2 */
unsigned int sk_gso_max_size; /* 0x224 0x4 */
gfp_t sk_allocation; /* 0x228 0x4 */
u32 sk_txhash; /* 0x22c 0x4 */
u8 sk_pacing_shift; /* 0x230 0x1 */
bool sk_use_task_frag; /* 0x231 0x1 */
__u8 __cacheline_group_end__sock_read_tx[0]; /* 0x232 0 */
u8 sk_gso_disabled:1; /* 0x232: 0 0x1 */
u8 sk_kern_sock:1; /* 0x232:0x1 0x1 */
u8 sk_no_check_tx:1; /* 0x232:0x2 0x1 */
u8 sk_no_check_rx:1; /* 0x232:0x3 0x1 */
/* XXX 4 bits hole, try to pack */
u8 sk_shutdown; /* 0x233 0x1 */
u16 sk_type; /* 0x234 0x2 */
u16 sk_protocol; /* 0x236 0x2 */
unsigned long sk_lingertime; /* 0x238 0x8 */
/* --- cacheline 9 boundary (576 bytes) --- */
struct proto * sk_prot_creator; /* 0x240 0x8 */
rwlock_t sk_callback_lock; /* 0x248 0x8 */
int sk_err_soft; /* 0x250 0x4 */
u32 sk_ack_backlog; /* 0x254 0x4 */
u32 sk_max_ack_backlog; /* 0x258 0x4 */
kuid_t sk_uid; /* 0x25c 0x4 */
spinlock_t sk_peer_lock; /* 0x260 0x4 */
int sk_bind_phc; /* 0x264 0x4 */
struct pid * sk_peer_pid; /* 0x268 0x8 */
const struct cred * sk_peer_cred; /* 0x270 0x8 */
ktime_t sk_stamp; /* 0x278 0x8 */
/* --- cacheline 10 boundary (640 bytes) --- */
int sk_disconnects; /* 0x280 0x4 */
u8 sk_txrehash; /* 0x284 0x1 */
u8 sk_clockid; /* 0x285 0x1 */
u8 sk_txtime_deadline_mode:1; /* 0x286: 0 0x1 */
u8 sk_txtime_report_errors:1; /* 0x286:0x1 0x1 */
u8 sk_txtime_unused:6; /* 0x286:0x2 0x1 */
/* XXX 1 byte hole, try to pack */
void * sk_user_data; /* 0x288 0x8 */
void * sk_security; /* 0x290 0x8 */
struct sock_cgroup_data sk_cgrp_data; /* 0x298 0x8 */
void (*sk_state_change)(struct sock *); /* 0x2a0 0x8 */
void (*sk_write_space)(struct sock *); /* 0x2a8 0x8 */
void (*sk_error_report)(struct sock *); /* 0x2b0 0x8 */
int (*sk_backlog_rcv)(struct sock *, struct sk_buff *); /* 0x2b8 0x8 */
/* --- cacheline 11 boundary (704 bytes) --- */
void (*sk_destruct)(struct sock *); /* 0x2c0 0x8 */
rcu * sk_reuseport_cb; /* 0x2c8 0x8 */
rcu * sk_bpf_storage; /* 0x2d0 0x8 */
struct callback_head sk_rcu __attribute__((__aligned__(8))); /* 0x2d8 0x10 */
netns_tracker ns_tracker; /* 0x2e8 0x8 */
/* size: 752, cachelines: 12, members: 105 */
/* sum members: 749, holes: 1, sum holes: 1 */
/* sum bitfield members: 12 bits, bit holes: 1, sum bit holes: 4 bits */
/* paddings: 1, sum paddings: 4 */
/* forced alignments: 1 */
/* last cacheline: 48 bytes */
};
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240216162006.2342759-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
Netronome graciously transferred the original NIPA repo
to our new netdev umbrella org. Link to that instead of
my private fork.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240216161945.2208842-1-kuba@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
The variable len being initialized with a value that is never read, an
if statement is initializing it in both paths of the if statement.
The initialization is redundant and can be removed.
Cleans up clang scan build warning:
net/ipv4/tcp_ao.c:512:11: warning: Value stored to 'len' during its
initialization is never read [deadcode.DeadStores]
Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com>
Link: https://lore.kernel.org/r/20240216125443.2107244-1-colin.i.king@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
syzkaller reported an overflown write in arp_req_get(). [0]
When ioctl(SIOCGARP) is issued, arp_req_get() looks up an neighbour
entry and copies neigh->ha to struct arpreq.arp_ha.sa_data.
The arp_ha here is struct sockaddr, not struct sockaddr_storage, so
the sa_data buffer is just 14 bytes.
In the splat below, 2 bytes are overflown to the next int field,
arp_flags. We initialise the field just after the memcpy(), so it's
not a problem.
However, when dev->addr_len is greater than 22 (e.g. MAX_ADDR_LEN),
arp_netmask is overwritten, which could be set as htonl(0xFFFFFFFFUL)
in arp_ioctl() before calling arp_req_get().
To avoid the overflow, let's limit the max length of memcpy().
Note that commit b5f0de6df6dc ("net: dev: Convert sa_data to flexible
array in struct sockaddr") just silenced syzkaller.
[0]:
memcpy: detected field-spanning write (size 16) of single field "r->arp_ha.sa_data" at net/ipv4/arp.c:1128 (size 14)
WARNING: CPU: 0 PID: 144638 at net/ipv4/arp.c:1128 arp_req_get+0x411/0x4a0 net/ipv4/arp.c:1128
Modules linked in:
CPU: 0 PID: 144638 Comm: syz-executor.4 Not tainted 6.1.74 #31
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
RIP: 0010:arp_req_get+0x411/0x4a0 net/ipv4/arp.c:1128
Code: fd ff ff e8 41 42 de fb b9 0e 00 00 00 4c 89 fe 48 c7 c2 20 6d ab 87 48 c7 c7 80 6d ab 87 c6 05 25 af 72 04 01 e8 5f 8d ad fb <0f> 0b e9 6c fd ff ff e8 13 42 de fb be 03 00 00 00 4c 89 e7 e8 a6
RSP: 0018:ffffc900050b7998 EFLAGS: 00010286
RAX: 0000000000000000 RBX: ffff88803a815000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff8641a44a RDI: 0000000000000001
RBP: ffffc900050b7a98 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 203a7970636d656d R12: ffff888039c54000
R13: 1ffff92000a16f37 R14: ffff88803a815084 R15: 0000000000000010
FS: 00007f172bf306c0(0000) GS:ffff88805aa00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f172b3569f0 CR3: 0000000057f12005 CR4: 0000000000770ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
<TASK>
arp_ioctl+0x33f/0x4b0 net/ipv4/arp.c:1261
inet_ioctl+0x314/0x3a0 net/ipv4/af_inet.c:981
sock_do_ioctl+0xdf/0x260 net/socket.c:1204
sock_ioctl+0x3ef/0x650 net/socket.c:1321
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:870 [inline]
__se_sys_ioctl fs/ioctl.c:856 [inline]
__x64_sys_ioctl+0x18e/0x220 fs/ioctl.c:856
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x37/0x90 arch/x86/entry/common.c:81
entry_SYSCALL_64_after_hwframe+0x64/0xce
RIP: 0033:0x7f172b262b8d
Code: 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f172bf300b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007f172b3abf80 RCX: 00007f172b262b8d
RDX: 0000000020000000 RSI: 0000000000008954 RDI: 0000000000000003
RBP: 00007f172b2d3493 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 000000000000000b R14: 00007f172b3abf80 R15: 00007f172bf10000
</TASK>
Reported-by: syzkaller <syzkaller@googlegroups.com>
Reported-by: Bjoern Doebel <doebel@amazon.de>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240215230516.31330-1-kuniyu@amazon.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
The pernet operations structure for the subsystem must be registered
before registering the generic netlink family.
Make an unregister in case of unsuccessful registration.
Fixes: 687125b5799c ("devlink: split out core code")
Signed-off-by: Vasiliy Kovalev <kovalev@altlinux.org>
Link: https://lore.kernel.org/r/20240215203400.29976-1-kovalev@altlinux.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
The pernet operations structure for the subsystem must be registered
before registering the generic netlink family.
Fixes: 915d7e5e5930 ("ipv6: sr: add code base for control plane support of SR-IPv6")
Signed-off-by: Vasiliy Kovalev <kovalev@altlinux.org>
Link: https://lore.kernel.org/r/20240215202717.29815-1-kovalev@altlinux.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
Reduce the scope of the variable "err" to the individual cases. This
is to avoid the mistake of setting "err" in the mistaken belief that
it will be evaluated later.
Reviewed-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://lore.kernel.org/all/20240220-raw-setsockopt-v1-1-7d34cb1377fc@pengutronix.de
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
|
|
Mina Almasry says:
====================
Abstract page from net stack
This series is a prerequisite to the devmem TCP series. For a full
snapshot of the code which includes these changes, feel free to check:
https://github.com/mina/linux/commits/tcpdevmem-rfcv5/
Currently these components in the net stack use the struct page
directly:
1. Drivers.
2. Page pool.
3. skb_frag_t.
To add support for new (non struct page) memory types to the net stack, we
must first abstract the current memory type.
Originally the plan was to reuse struct page* for the new memory types,
and to set the LSB on the page* to indicate it's not really a page.
However, for safe compiler type checking we need to introduce a new type.
struct netmem is introduced to abstract the underlying memory type.
Currently it's a no-op abstraction that is always a struct page underneath.
In parallel there is an undergoing effort to add support for devmem to the
net stack:
https://lore.kernel.org/netdev/20231208005250.2910004-1-almasrymina@google.com/
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
====================
Link: https://lore.kernel.org/r/20240214223405.1972973-1-almasrymina@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
Use struct netmem* instead of page in skb_frag_t. Currently struct
netmem* is always a struct page underneath, but the abstraction
allows efforts to add support for skb frags not backed by pages.
There is unfortunately 1 instance where the skb_frag_t is assumed to be
a exactly a bio_vec in kcm. For this case, WARN_ON_ONCE and return error
before doing a cast.
Add skb[_frag]_fill_netmem_*() and skb_add_rx_frag_netmem() helpers so
that the API can be used to create netmem skbs.
Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
Add the netmem_ref type, an abstraction for network memory.
To add support for new memory types to the net stack, we must first
abstract the current memory type. Currently parts of the net stack
use struct page directly:
- page_pool
- drivers
- skb_frag_t
Originally the plan was to reuse struct page* for the new memory types,
and to set the LSB on the page* to indicate it's not really a page.
However, for compiler type checking we need to introduce a new type.
netmem_ref is introduced to abstract the underlying memory type.
Currently it's a no-op abstraction that is always a struct page
underneath. In parallel there is an undergoing effort to add support
for devmem to the net stack:
https://lore.kernel.org/netdev/20231208005250.2910004-1-almasrymina@google.com/
netmem_ref can be pointers to different underlying memory types, and the
low bits are set to indicate the memory type. Helpers are provided
to convert netmem pointers to the underlying memory type (currently only
struct page). In the devmem series helpers are provided so that calling
code can use netmem without worrying about the underlying memory type
unless absolutely necessary.
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
|
|
The code for the CAN_RAW_XL_VCID_OPTS getsockopt() was incompletely adopted
from the CAN_RAW_FILTER getsockopt().
Add the missing put_user() and return statements.
Flagged by Smatch.
Fixes: c83c22ec1493 ("can: canxl: add virtual CAN network identifier support")
Reported-by: Simon Horman <horms@kernel.org>
Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Link: https://lore.kernel.org/all/20240219200021.12113-1-socketcan@hartkopp.net
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
|
|
Creation of sysfs entries is expensive, mainly for workloads that
constantly creates netdev and netns often.
Do not create BQL sysfs entries for devices that don't need,
basically those that do not have a real queue, i.e, devices that has
NETIF_F_LLTX and IFF_NO_QUEUE, such as `lo` interface.
This will remove the /sys/class/net/eth0/queues/tx-X/byte_queue_limits/
directory for these devices.
In the example below, eth0 has the `byte_queue_limits` directory but not
`lo`.
# ls /sys/class/net/lo/queues/tx-0/
traffic_class tx_maxrate tx_timeout xps_cpus xps_rxqs
# ls /sys/class/net/eth0/queues/tx-0/byte_queue_limits/
hold_time inflight limit limit_max limit_min
This also removes the #ifdefs, since we can also use netdev_uses_bql() to
check if the config is enabled. (as suggested by Jakub).
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/20240216094154.3263843-1-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
Use global percpu page_pool_recycle_stats counter for system page_pool
allocator instead of allocating a separate percpu variable for each
(also percpu) page pool instance.
Reviewed-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/87f572425e98faea3da45f76c3c68815c01a20ee.1708075412.git.lorenzo@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
Now that direct recycling is performed basing on pool->cpuid when set,
memory leaks are possible:
1. A pool is destroyed.
2. Alloc cache is emptied (it's done only once).
3. pool->cpuid is still set.
4. napi_pp_put_page() does direct recycling basing on pool->cpuid.
5. Now alloc cache is not empty, but it won't ever be freed.
In order to avoid that, rewrite pool->cpuid to -1 when unlinking NAPI to
make sure no direct recycling will be possible after emptying the cache.
This involves a bit of overhead as pool->cpuid now must be accessed
via READ_ONCE() to avoid partial reads.
Rename page_pool_unlink_napi() -> page_pool_disable_direct_recycling()
to reflect what it actually does and unexport it.
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://lore.kernel.org/r/20240215113905.96817-1-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
iMX8QM have iommu. Add proerty 'iommus'.
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240201-8qm_smmu-v2-2-3d12a80201a3@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
|
|
Some code manipulating the vif list is still missing some srcu_read_lock /
srcu_read_unlock, and so can trigger RCU warnings:
=============================
WARNING: suspicious RCU usage
6.8.0-rc1+ #37 Not tainted
-----------------------------
drivers/net/wireless/microchip/wilc1000/hif.c:110 RCU-list traversed without holding the required lock!!
[...]
stack backtrace:
CPU: 0 PID: 6 Comm: kworker/0:0 Not tainted 6.8.0-rc1+ #37
Hardware name: Atmel SAMA5
Workqueue: events sdio_irq_work
unwind_backtrace from show_stack+0x18/0x1c
show_stack from dump_stack_lvl+0x34/0x58
dump_stack_lvl from wilc_get_vif_from_idx+0x158/0x180
wilc_get_vif_from_idx from wilc_network_info_received+0x80/0x48c
wilc_network_info_received from wilc_handle_isr+0xa10/0xd30
wilc_handle_isr from wilc_sdio_interrupt+0x44/0x58
wilc_sdio_interrupt from process_sdio_pending_irqs+0x1c8/0x60c
process_sdio_pending_irqs from sdio_irq_work+0x6c/0x14c
sdio_irq_work from process_one_work+0x8d4/0x169c
process_one_work from worker_thread+0x8cc/0x1340
worker_thread from kthread+0x448/0x510
kthread from ret_from_fork+0x14/0x28
Fix those warnings by adding the needed lock around the corresponding
critical sections
Signed-off-by: Ajay Singh <ajay.kathat@microchip.com>
Co-developed-by: Alexis Lothoré <alexis.lothore@bootlin.com>
Signed-off-by: Alexis Lothoré <alexis.lothore@bootlin.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240215-wilc_fix_rcu_usage-v1-4-f610e46c6f82@bootlin.com
|
|
Fix reverse-christmas tree order in some functions before adding more
variables
Signed-off-by: Alexis Lothoré <alexis.lothore@bootlin.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240215-wilc_fix_rcu_usage-v1-3-f610e46c6f82@bootlin.com
|
|
Enabling CONFIG_PROVE_RCU_LIST raises many warnings in wilc driver, even on
some places already protected by a read critical section. An example of
such case is in wilc_get_available_idx:
=============================
WARNING: suspicious RCU usage
6.8.0-rc1+ #32 Not tainted
-----------------------------
drivers/net/wireless/microchip/wilc1000/netdev.c:944 RCU-list traversed in non-reader section!!
[...]
stack backtrace:
CPU: 0 PID: 26 Comm: kworker/0:3 Not tainted 6.8.0-rc1+ #32
Hardware name: Atmel SAMA5
Workqueue: events_freezable mmc_rescan
unwind_backtrace from show_stack+0x18/0x1c
show_stack from dump_stack_lvl+0x34/0x58
dump_stack_lvl from wilc_netdev_ifc_init+0x788/0x8ec
wilc_netdev_ifc_init from wilc_cfg80211_init+0x690/0x910
wilc_cfg80211_init from wilc_sdio_probe+0x168/0x490
wilc_sdio_probe from sdio_bus_probe+0x230/0x3f4
sdio_bus_probe from really_probe+0x270/0xdf4
really_probe from __driver_probe_device+0x1dc/0x580
__driver_probe_device from driver_probe_device+0x60/0x140
driver_probe_device from __device_attach_driver+0x268/0x364
__device_attach_driver from bus_for_each_drv+0x15c/0x1cc
bus_for_each_drv from __device_attach+0x1ec/0x3e8
__device_attach from bus_probe_device+0x190/0x1c0
bus_probe_device from device_add+0x10dc/0x18e4
device_add from sdio_add_func+0x1c0/0x2c0
sdio_add_func from mmc_attach_sdio+0xa08/0xe1c
mmc_attach_sdio from mmc_rescan+0xa00/0xfe0
mmc_rescan from process_one_work+0x8d4/0x169c
process_one_work from worker_thread+0x8cc/0x1340
worker_thread from kthread+0x448/0x510
kthread from ret_from_fork+0x14/0x28
This warning is due to the section being protected by a srcu critical read
section, but the list traversal being done with classic RCU API. Fix the
warning by using corresponding SRCU read lock/unlock APIs. While doing so,
since we always manipulate the same list (managed through a pointer
embedded in struct_wilc), add a macro to reduce the corresponding
boilerplate in each call site.
Signed-off-by: Alexis Lothoré <alexis.lothore@bootlin.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240215-wilc_fix_rcu_usage-v1-2-f610e46c6f82@bootlin.com
|
|
Move netif_wake_queue and its surrounding RCU operations in a dedicated
function to clarify wilc_txq_task and ease refactoring
Signed-off-by: Alexis Lothoré <alexis.lothore@bootlin.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240215-wilc_fix_rcu_usage-v1-1-f610e46c6f82@bootlin.com
|
|
Reset hardware state to prevent hardware stays at abnormal state during
setting channel. Besides, add preparation for MLO/DBCC before setting
channel, and reconfigure registers after that.
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240215055741.14148-5-pkshih@realtek.com
|
|
Configure RF registers according to band, channel, bandwidth. Since this
chip will support MLO, it needs check the operating mode to decide paths
we are going to configure.
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240215055741.14148-4-pkshih@realtek.com
|
|
In additional to configure band, channel and bandwidth registers, it also
configure CCK support on 2GHZ band, spur elimination, and RX gain.
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240215055741.14148-3-pkshih@realtek.com
|
|
To set channel, add a function to get TXSB (TX subband) that is hardware
index to indicate primary channel. Then, configure band, channel,
bandwidth and TXSB via registers.
Signed-off-by: Ping-Ke Shih <pkshih@realtek.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240215055741.14148-2-pkshih@realtek.com
|
|
Incorporate a test case to assess the handling of invalid flags or
task__nullable parameters passed to bpf_iter_task_new(). Prior to the
preceding commit, this scenario could potentially trigger a kernel panic.
However, with the previous commit, this test case is expected to function
correctly.
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240217114152.1623-3-laoar.shao@gmail.com
|
|
Failure to initialize it->pos, coupled with the presence of an invalid
value in the flags variable, can lead to it->pos referencing an invalid
task, potentially resulting in a kernel panic. To mitigate this risk, it's
crucial to ensure proper initialization of it->pos to NULL.
Fixes: ac8148d957f5 ("bpf: bpf_iter_task_next: use next_task(kit->task) rather than next_task(kit->pos)")
Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/bpf/20240217114152.1623-2-laoar.shao@gmail.com
|
|
bpf_timer_cancel
This selftest is based on a Alexei's test adopted from an internal
user to troubleshoot another bug. During this exercise, a separate
racing bug was discovered between bpf_timer_cancel_and_free
and bpf_timer_cancel. The details can be found in the previous
patch.
This patch is to add a selftest that can trigger the bug.
I can trigger the UAF everytime in my qemu setup with KASAN. The idea
is to have multiple user space threads running in a tight loop to exercise
both bpf_map_update_elem (which calls into bpf_timer_cancel_and_free)
and bpf_timer_cancel.
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20240215211218.990808-2-martin.lau@linux.dev
|
|
The following race is possible between bpf_timer_cancel_and_free
and bpf_timer_cancel. It will lead a UAF on the timer->timer.
bpf_timer_cancel();
spin_lock();
t = timer->time;
spin_unlock();
bpf_timer_cancel_and_free();
spin_lock();
t = timer->timer;
timer->timer = NULL;
spin_unlock();
hrtimer_cancel(&t->timer);
kfree(t);
/* UAF on t */
hrtimer_cancel(&t->timer);
In bpf_timer_cancel_and_free, this patch frees the timer->timer
after a rcu grace period. This requires a rcu_head addition
to the "struct bpf_hrtimer". Another kfree(t) happens in bpf_timer_init,
this does not need a kfree_rcu because it is still under the
spin_lock and timer->timer has not been visible by others yet.
In bpf_timer_cancel, rcu_read_lock() is added because this helper
can be used in a non rcu critical section context (e.g. from
a sleepable bpf prog). Other timer->timer usages in helpers.c
have been audited, bpf_timer_cancel() is the only place where
timer->timer is used outside of the spin_lock.
Another solution considered is to mark a t->flag in bpf_timer_cancel
and clear it after hrtimer_cancel() is done. In bpf_timer_cancel_and_free,
it busy waits for the flag to be cleared before kfree(t). This patch
goes with a straight forward solution and frees timer->timer after
a rcu grace period.
Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/bpf/20240215211218.990808-1-martin.lau@linux.dev
|
|
Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).
As found with Coccinelle[1], add __counted_by for struct tc_pedit.
Additionally, since the element count member must be set before accessing
the annotated flexible array member, move its initialization earlier.
Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1]
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
FORTIFY_SOURCE has been ignoring 0-sized destinations while the kernel
code base has been converted to flexible arrays. In order to enforce
the 0-sized destinations (e.g. with __counted_by), the remaining 0-sized
destinations need to be handled. Unfortunately, struct vic_provinfo
resists full conversion, as it contains a flexible array of flexible
arrays, which is only possible with the 0-sized fake flexible array.
Use unsafe_memcpy() to avoid future false positives under
CONFIG_FORTIFY_SOURCE.
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Since there is a utility available for this, use
the API rather than open code.
Fixes: 13943d6c8273 ("ionic: prevent pci disable of already disabled device")
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Shannon Nelson says:
====================
pds_core: AER handling
Add simple handlers for the PCI AER callbacks, and improve
the reset handling.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
We get the benefit of all the PCI reset locking and recovery if
we use the existing pci_reset_function() that will call our
local reset handlers.
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
When the VF is hit with a reset, remove the aux device in
the prepare for reset and try to restore it after the reset.
The userland mechanics will need to recover and rebuild whatever
uses the device afterwards.
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Set up the pci_error_handlers error_detected and resume to be
useful in handling AER events.
Reviewed-by: Brett Creeley <brett.creeley@amd.com>
Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
-queue
Tony Nguyen says:
====================
i40e: Simplify VSI and VEB handling
Ivan Vecera says:
The series simplifies handling of VSIs and VEBs by introducing for-each
iterating macros, 'find' helper functions. Also removes the VEB
recursion because the VEBs cannot have sub-VEBs according datasheet and
fixes the support for floating VEBs.
The series content:
Patch 1 - Uses existing helper function for find FDIR VSI instead of loop
Patch 2 - Adds and uses macros to iterate VSI and VEB arrays
Patch 3 - Adds 2 helper functions to find VSIs and VEBs by their SEID
Patch 4 - Fixes broken support for floating VEBs
Patch 5 - Removes VEB recursion and simplifies VEB handling
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
While working on the patchset to remove extent locking I got a lockdep
splat with fiemap and pagefaulting with my new extent lock replacement
lock.
This deadlock exists with our normal code, we just don't have lockdep
annotations with the extent locking so we've never noticed it.
Since we're copying the fiemap extent to user space on every iteration
we have the chance of pagefaulting. Because we hold the extent lock for
the entire range we could mkwrite into a range in the file that we have
mmap'ed. This would deadlock with the following stack trace
[<0>] lock_extent+0x28d/0x2f0
[<0>] btrfs_page_mkwrite+0x273/0x8a0
[<0>] do_page_mkwrite+0x50/0xb0
[<0>] do_fault+0xc1/0x7b0
[<0>] __handle_mm_fault+0x2fa/0x460
[<0>] handle_mm_fault+0xa4/0x330
[<0>] do_user_addr_fault+0x1f4/0x800
[<0>] exc_page_fault+0x7c/0x1e0
[<0>] asm_exc_page_fault+0x26/0x30
[<0>] rep_movs_alternative+0x33/0x70
[<0>] _copy_to_user+0x49/0x70
[<0>] fiemap_fill_next_extent+0xc8/0x120
[<0>] emit_fiemap_extent+0x4d/0xa0
[<0>] extent_fiemap+0x7f8/0xad0
[<0>] btrfs_fiemap+0x49/0x80
[<0>] __x64_sys_ioctl+0x3e1/0xb50
[<0>] do_syscall_64+0x94/0x1a0
[<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
I wrote an fstest to reproduce this deadlock without my replacement lock
and verified that the deadlock exists with our existing locking.
To fix this simply don't take the extent lock for the entire duration of
the fiemap. This is safe in general because we keep track of where we
are when we're searching the tree, so if an ordered extent updates in
the middle of our fiemap call we'll still emit the correct extents
because we know what offset we were on before.
The only place we maintain the lock is searching delalloc. Since the
delalloc stuff can change during writeback we want to lock the extent
range so we have a consistent view of delalloc at the time we're
checking to see if we need to set the delalloc flag.
With this patch applied we no longer deadlock with my testcase.
CC: stable@vger.kernel.org # 6.1+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
|
|
[BUG]
With the following file extent layout, defrag would do unnecessary IO
and result more on-disk space usage.
# mkfs.btrfs -f $dev
# mount $dev $mnt
# xfs_io -f -c "pwrite 0 40m" $mnt/foobar
# sync
# xfs_io -f -c "pwrite 40m 16k" $mnt/foobar
# sync
Above command would lead to the following file extent layout:
item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53
generation 7 type 1 (regular)
extent data disk byte 298844160 nr 41943040
extent data offset 0 nr 41943040 ram 41943040
extent compression 0 (none)
item 7 key (257 EXTENT_DATA 41943040) itemoff 15763 itemsize 53
generation 8 type 1 (regular)
extent data disk byte 13631488 nr 16384
extent data offset 0 nr 16384 ram 16384
extent compression 0 (none)
Which is mostly fine. We can allow the final 16K to be merged with the
previous 40M, but it's upon the end users' preference.
But if we defrag the file using the default parameters, it would result
worse file layout:
# btrfs filesystem defrag $mnt/foobar
# sync
item 6 key (257 EXTENT_DATA 0) itemoff 15816 itemsize 53
generation 7 type 1 (regular)
extent data disk byte 298844160 nr 41943040
extent data offset 0 nr 8650752 ram 41943040
extent compression 0 (none)
item 7 key (257 EXTENT_DATA 8650752) itemoff 15763 itemsize 53
generation 9 type 1 (regular)
extent data disk byte 340787200 nr 33292288
extent data offset 0 nr 33292288 ram 33292288
extent compression 0 (none)
item 8 key (257 EXTENT_DATA 41943040) itemoff 15710 itemsize 53
generation 8 type 1 (regular)
extent data disk byte 13631488 nr 16384
extent data offset 0 nr 16384 ram 16384
extent compression 0 (none)
Note the original 40M extent is still there, but a new 32M extent is
created for no benefit at all.
[CAUSE]
There is an existing check to make sure we won't defrag a large enough
extent (the threshold is by default 32M).
But the check is using the length to the end of the extent:
range_len = em->len - (cur - em->start);
/* Skip too large extent */
if (range_len >= extent_thresh)
goto next;
This means, for the first 8MiB of the extent, the range_len is always
smaller than the default threshold, and would not be defragged.
But after the first 8MiB, the remaining part would fit the requirement,
and be defragged.
Such different behavior inside the same extent caused the above problem,
and we should avoid different defrag decision inside the same extent.
[FIX]
Instead of using @range_len, just use @em->len, so that we have a
consistent decision among the same file extent.
Now with this fix, we won't touch the extent, thus not making it any
worse.
Reported-by: Filipe Manana <fdmanana@suse.com>
Fixes: 0cb5950f3f3b ("btrfs: fix deadlock when reserving space during defrag")
CC: stable@vger.kernel.org # 6.1+
Reviewed-by: Boris Burkov <boris@bur.io>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
|
|
If message contains unknown attribute and user passes
"--process-unknown" command line option, _decode() gets called with space
arg set to None. In that case, attr_space variable is not initialized
used which leads to following trace:
Traceback (most recent call last):
File "./tools/net/ynl/cli.py", line 77, in <module>
main()
File "./tools/net/ynl/cli.py", line 68, in main
reply = ynl.dump(args.dump, attrs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "tools/net/ynl/lib/ynl.py", line 909, in dump
return self._op(method, vals, [], dump=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "tools/net/ynl/lib/ynl.py", line 894, in _op
rsp_msg = self._decode(decoded.raw_attrs, op.attr_set.name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "tools/net/ynl/lib/ynl.py", line 639, in _decode
self._rsp_add(rsp, attr_name, None, self._decode_unknown(attr))
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "tools/net/ynl/lib/ynl.py", line 569, in _decode_unknown
return self._decode(NlAttrs(attr.raw), None)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "tools/net/ynl/lib/ynl.py", line 630, in _decode
search_attrs = SpaceAttrs(attr_space, rsp, outer_attrs)
^^^^^^^^^^
UnboundLocalError: cannot access local variable 'attr_space' where it is not associated with a value
Fix this by moving search_attrs assignment under the if statement
above it to make sure attr_space is initialized.
Fixes: bf8b832374fb ("tools/net/ynl: Support sub-messages in nested attribute spaces")
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
In bond priority testing, we set the primary interface to eth1 and add
eth0,1,2 to bond in serial. This is OK in normal times. But when in
debug kernel, the bridge port that eth0,1,2 connected would start
slowly (enter blocking, forwarding state), which caused the primary
interface down for a while after enslaving and active slave changed.
Here is a test log from Jakub's debug test[1].
[ 400.399070][ T50] br0: port 1(s0) entered disabled state
[ 400.400168][ T50] br0: port 4(s2) entered disabled state
[ 400.941504][ T2791] bond0: (slave eth0): making interface the new active one
[ 400.942603][ T2791] bond0: (slave eth0): Enslaving as an active interface with an up link
[ 400.943633][ T2766] br0: port 1(s0) entered blocking state
[ 400.944119][ T2766] br0: port 1(s0) entered forwarding state
[ 401.128792][ T2792] bond0: (slave eth1): making interface the new active one
[ 401.130771][ T2792] bond0: (slave eth1): Enslaving as an active interface with an up link
[ 401.131643][ T69] br0: port 2(s1) entered blocking state
[ 401.132067][ T69] br0: port 2(s1) entered forwarding state
[ 401.346201][ T2793] bond0: (slave eth2): Enslaving as a backup interface with an up link
[ 401.348414][ T50] br0: port 4(s2) entered blocking state
[ 401.348857][ T50] br0: port 4(s2) entered forwarding state
[ 401.519669][ T250] bond0: (slave eth0): link status definitely down, disabling slave
[ 401.526522][ T250] bond0: (slave eth1): link status definitely down, disabling slave
[ 401.526986][ T250] bond0: (slave eth2): making interface the new active one
[ 401.629470][ T250] bond0: (slave eth0): link status definitely up
[ 401.630089][ T250] bond0: (slave eth1): link status definitely up
[...]
# TEST: prio (active-backup ns_ip6_target primary_reselect 1) [FAIL]
# Current active slave is eth2 but not eth1
Fix it by setting active slave to primary slave specifically before
testing.
[1] https://netdev-3.bots.linux.dev/vmksft-bonding-dbg/results/464301/1-bond-options-sh/stdout
Fixes: 481b56e0391e ("selftests: bonding: re-format bond option tests")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
|
|
Avoid the following warnings by removing the ena_select_queue() function
and rely on the net core to do the queue selection, The issue happen
when an skb received from an interface with more queues than ena is
forwarded to the ena interface.
[ 1176.159959] eth0 selects TX queue 11, but real number of TX queues is 8
[ 1176.863976] eth0 selects TX queue 14, but real number of TX queues is 8
[ 1180.767877] eth0 selects TX queue 14, but real number of TX queues is 8
[ 1188.703742] eth0 selects TX queue 14, but real number of TX queues is 8
Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)")
Signed-off-by: Kamal Heib <kheib@redhat.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild
Pull Kbuild fixes from Masahiro Yamada:
- Reformat nested if-conditionals in Makefiles with 4 spaces
- Fix CONFIG_DEBUG_INFO_BTF builds for big endian
- Fix modpost for module srcversion
- Fix an escape sequence warning in gen_compile_commands.py
- Fix kallsyms to ignore ARMv4 thunk symbols
* tag 'kbuild-fixes-v6.8-2' of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild:
kallsyms: ignore ARMv4 thunks along with others
modpost: trim leading spaces when processing source files list
gen_compile_commands: fix invalid escape sequence warning
kbuild: Fix changing ELF file type for output of gen_btf for big endian
docs: kconfig: Fix grammar and formatting
kbuild: use 4-space indentation when followed by conditionals
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fix from Borislav Petkov:
- Use a GB page for identity mapping only when memory of this size is
requested so that mapping of reserved regions is prevented which
would otherwise lead to system crashes on UV machines
* tag 'x86_urgent_for_v6.8_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mm/ident_map: Use gbpages only where full GB page should be mapped.
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull irq fixes from Borislav Petkov:
- Fix GICv4.1 affinity update
- Restore a quirk for ACPI-based GICv4 systems
- Handle non-coherent GICv4 redistributors properly
- Prevent spurious interrupts on Broadcom devices using GIC v3
architecture
- Other minor fixes
* tag 'irq_urgent_for_v6.8_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
irqchip/gic-v3-its: Fix GICv4.1 VPE affinity update
irqchip/gic-v3-its: Restore quirk probing for ACPI-based systems
irqchip/gic-v3-its: Handle non-coherent GICv4 redistributors
irqchip/qcom-mpm: Fix IS_ERR() vs NULL check in qcom_mpm_init()
irqchip/loongson-eiointc: Use correct struct type in eiointc_domain_alloc()
irqchip/irq-brcmstb-l2: Add write memory barrier before exit
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux
Pull i2c fixes from Wolfram Sang:
"Two fixes for i801 and qcom-geni devices. Meanwhile, a fix from Arnd
addresses a compilation error encountered during compile test on
powerpc"
* tag 'i2c-for-6.8-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux:
i2c: i801: Fix block process call transactions
i2c: pasemi: split driver into two separate modules
i2c: qcom-geni: Correct I2C TRE sequence
|
|
Aquantia AQR813 is the Octal Port variant of the AQR113. Add PHY ID for
it to provide support for it.
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Justin Chen says:
====================
net: bcmasp: bug fixes for bcmasp
Fix two bugs.
- Indicate that PM is managed by mac to prevent double pm calls. This
doesn't lead to a crash, but waste a noticable amount of time
suspending/resuming.
- Sanity check for OOB write was off by one. Leading to a false error
when using the full array.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
A sanity check for OOB write is off by one leading to a false positive
when the array is full.
Fixes: 9b90aca97f6d ("net: ethernet: bcmasp: fix possible OOB write in bcmasp_netfilt_get_all_active()")
Signed-off-by: Justin Chen <justin.chen@broadcom.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Avoid the PHY library call unnecessarily into the suspend/resume
functions by setting phydev->mac_managed_pm to true. The ASP driver
essentially does exactly what mdio_bus_phy_resume() does.
Fixes: 490cb412007d ("net: bcmasp: Add support for ASP2.0 Ethernet controller")
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Justin Chen <justin.chen@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Matthieu Baerts says:
====================
mptcp: misc. fixes for v6.8
This series includes 4 types of fixes:
Patches 1 and 2 force the path-managers not to allocate a new address
entry when dealing with the "special" ID 0, reserved to the address of
the initial subflow. These patches can be backported up to v5.19 and
v5.12 respectively.
Patch 3 to 6 fix the in-kernel path-manager not to create duplicated
subflows. Patch 6 is the main fix, but patches 3 to 5 are some kind of
pre-requisities: they fix some data races that could also lead to the
creation of unexpected subflows. These patches can be backported up to
v5.7, v5.10, v6.0, and v5.15 respectively.
Note that patch 3 modifies the existing ULP API. No better solutions
have been found for -net, and there is some similar prior art, see
commit 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info"). Please
also note that TLS ULP Diag has likely the same issue.
Patches 7 to 9 fix issues in the selftests, when executing them on older
kernels, e.g. when testing the last version of these kselftests on the
v5.15.148 kernel as it is done by LKFT when validating stable kernels.
These patches only avoid printing expected errors the console and
marking some tests as "OK" while they have been skipped. Patches 7 and 8
can be backported up to v6.6.
Patches 10 to 13 make sure all MPTCP selftests subtests have a unique
name. It is important to have a unique (sub)test name in TAP, because
that's the test identifier. Some CI environments might drop tests with
duplicated names. Patches 10 to 12 can be backported up to v6.6.
====================
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
|