diff options
78 files changed, 1619 insertions, 476 deletions
@@ -721,7 +721,8 @@ Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com> Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com> Shuah Khan <shuah@kernel.org> <shuahkh@osg.samsung.com> Shuah Khan <shuah@kernel.org> <shuah.kh@samsung.com> -Sibi Sankar <quic_sibis@quicinc.com> <sibis@codeaurora.org> +Sibi Sankar <sibi.sankar@oss.qualcomm.com> <sibis@codeaurora.org> +Sibi Sankar <sibi.sankar@oss.qualcomm.com> <quic_sibis@quicinc.com> Sid Manning <quic_sidneym@quicinc.com> <sidneym@codeaurora.org> Simon Arlott <simon@octiron.net> <simon@fire.lp0.eu> Simona Vetter <simona.vetter@ffwll.ch> <daniel.vetter@ffwll.ch> diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 8ccc5af5ea1e..86d7902a657f 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -134,47 +134,72 @@ The above command can be used with -v to get more debug information. After the system starts, use `delaytop` to get the system-wide delay information, which includes system-wide PSI information and Top-N high-latency tasks. +Note: PSI support requires `CONFIG_PSI=y` and `psi=1` for full functionality. -`delaytop` supports sorting by CPU latency in descending order by default, -displays the top 20 high-latency tasks by default, and refreshes the latency -data every 2 seconds by default. +`delaytop` is an interactive tool for monitoring system pressure and task delays. +It supports multiple sorting options, display modes, and real-time keyboard controls. 
-Get PSI information and Top-N tasks delay, since system boot:: +Basic usage with default settings (sorts by CPU delay, shows top 20 tasks, refreshes every 2 seconds):: bash# ./delaytop - System Pressure Information: (avg10/avg60/avg300/total) - CPU some: 0.0%/ 0.0%/ 0.0%/ 345(ms) + System Pressure Information: (avg10/avg60/avg300/total) + CPU some: 0.0%/ 0.0%/ 0.0%/ 106137(ms) CPU full: 0.0%/ 0.0%/ 0.0%/ 0(ms) Memory full: 0.0%/ 0.0%/ 0.0%/ 0(ms) Memory some: 0.0%/ 0.0%/ 0.0%/ 0(ms) - IO full: 0.0%/ 0.0%/ 0.0%/ 65(ms) - IO some: 0.0%/ 0.0%/ 0.0%/ 79(ms) + IO full: 0.0%/ 0.0%/ 0.0%/ 2240(ms) + IO some: 0.0%/ 0.0%/ 0.0%/ 2783(ms) IRQ full: 0.0%/ 0.0%/ 0.0%/ 0(ms) - Top 20 processes (sorted by CPU delay): - PID TGID COMMAND CPU(ms) IO(ms) SWAP(ms) RCL(ms) THR(ms) CMP(ms) WP(ms) IRQ(ms) - ---------------------------------------------------------------------------------------------- - 161 161 zombie_memcg_re 1.40 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 130 130 blkcg_punt_bio 1.37 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 444 444 scsi_tmf_0 0.73 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 1280 1280 rsyslogd 0.53 0.04 0.00 0.00 0.00 0.00 0.00 0.00 - 12 12 ksoftirqd/0 0.47 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 1277 1277 nbd-server 0.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 308 308 kworker/2:2-sys 0.41 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 55 55 netns 0.36 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 1187 1187 acpid 0.31 0.03 0.00 0.00 0.00 0.00 0.00 0.00 - 6184 6184 kworker/1:2-sys 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 186 186 kaluad 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 18 18 ksoftirqd/1 0.24 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 185 185 kmpath_rdacd 0.23 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 190 190 kstrp 0.23 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 2759 2759 agetty 0.20 0.03 0.00 0.00 0.00 0.00 0.00 0.00 - 1190 1190 kworker/0:3-sys 0.19 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 1272 1272 sshd 0.15 0.04 0.00 0.00 0.00 0.00 0.00 0.00 - 1156 1156 license 0.15 0.11 0.00 0.00 0.00 0.00 0.00 0.00 - 134 
134 md 0.13 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - 6142 6142 kworker/3:2-xfs 0.13 0.00 0.00 0.00 0.00 0.00 0.00 0.00 - -Dynamic interactive interface of delaytop:: + [o]sort [M]memverbose [q]quit + Top 20 processes (sorted by cpu delay): + PID TGID COMMAND CPU(ms) IO(ms) IRQ(ms) MEM(ms) + ------------------------------------------------------------------------ + 110 110 kworker/15:0H-s 27.91 0.00 0.00 0.00 + 57 57 cpuhp/7 3.18 0.00 0.00 0.00 + 99 99 cpuhp/14 2.97 0.00 0.00 0.00 + 51 51 cpuhp/6 0.90 0.00 0.00 0.00 + 44 44 kworker/4:0H-sy 0.80 0.00 0.00 0.00 + 60 60 ksoftirqd/7 0.74 0.00 0.00 0.00 + 76 76 idle_inject/10 0.31 0.00 0.00 0.00 + 100 100 idle_inject/14 0.30 0.00 0.00 0.00 + 1309 1309 systemsettings 0.29 0.00 0.00 0.00 + 45 45 cpuhp/5 0.22 0.00 0.00 0.00 + 63 63 cpuhp/8 0.20 0.00 0.00 0.00 + 87 87 cpuhp/12 0.18 0.00 0.00 0.00 + 93 93 cpuhp/13 0.17 0.00 0.00 0.00 + 1265 1265 acpid 0.17 0.00 0.00 0.00 + 1552 1552 sshd 0.17 0.00 0.00 0.00 + 2584 2584 sddm-helper 0.16 0.00 0.00 0.00 + 1284 1284 rtkit-daemon 0.15 0.00 0.00 0.00 + 1326 1326 nde-netfilter 0.14 0.00 0.00 0.00 + 27 27 cpuhp/2 0.13 0.00 0.00 0.00 + 631 631 kworker/11:2-rc 0.11 0.00 0.00 0.00 + +Interactive keyboard controls during runtime:: + + o - Select sort field (CPU, IO, IRQ, Memory, etc.) 
+ M - Toggle display mode (Default/Memory Verbose) + q - Quit + +Available sort fields(use -s/--sort or interactive command):: + + cpu(c) - CPU delay + blkio(i) - I/O delay + irq(q) - IRQ delay + mem(m) - Total memory delay + swapin(s) - Swapin delay (memory verbose mode only) + freepages(r) - Freepages reclaim delay (memory verbose mode only) + thrashing(t) - Thrashing delay (memory verbose mode only) + compact(p) - Compaction delay (memory verbose mode only) + wpcopy(w) - Write page copy delay (memory verbose mode only) + +Advanced usage examples:: + + # ./delaytop -s blkio + Sorted by IO delay + + # ./delaytop -s mem -M + Sorted by memory delay in memory verbose mode # ./delaytop -p pid Print delayacct stats diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e019db1633fd..74ca438d2d6d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4603,7 +4603,7 @@ bit 2: print timer info bit 3: print locks info if CONFIG_LOCKDEP is on bit 4: print ftrace buffer - bit 5: replay all messages on consoles at the end of panic + bit 5: replay all kernel messages on consoles at the end of panic bit 6: print all CPUs backtrace (if available in the arch) bit 7: print only tasks in uninterruptible (blocked) state *Be aware* that this option may print a _lot_ of lines, diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 8b49eab937d0..f3ee807b5d8b 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -890,7 +890,7 @@ bit 1 print system memory info bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer -bit 5 replay all messages on consoles at the end of panic +bit 5 replay all kernel messages on consoles at the end of panic bit 6 print all CPUs backtrace (if available in the arch) bit 7 print only 
tasks in uninterruptible (blocked) state ===== ============================================ diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst index 6611434e2dd2..8127849d40f5 100644 --- a/Documentation/dev-tools/kcov.rst +++ b/Documentation/dev-tools/kcov.rst @@ -361,7 +361,12 @@ local tasks spawned by the process and the global task that handles USB bus #1: */ sleep(2); - n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + /* + * The load to the coverage count should be an acquire to + * pair with the corresponding write memory barrier (smp_wmb()) on + * the kernel-side in kcov_move_area(). + */ + n = __atomic_load_n(&cover[0], __ATOMIC_ACQUIRE); for (i = 0; i < n; i++) printf("0x%lx\n", cover[i + 1]); if (ioctl(fd, KCOV_DISABLE, 0)) diff --git a/MAINTAINERS b/MAINTAINERS index 6a6cf856c98b..4abe33914884 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20987,7 +20987,7 @@ F: Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml F: drivers/pmdomain/qcom/cpr.c QUALCOMM CPUCP MAILBOX DRIVER -M: Sibi Sankar <quic_sibis@quicinc.com> +M: Sibi Sankar <sibi.sankar@oss.qualcomm.com> L: linux-arm-msm@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/mailbox/qcom,cpucp-mbox.yaml diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c6b12bed173d..335fd2ee9766 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -165,14 +165,23 @@ static struct crash_mem *fill_up_crash_elf_data(void) /* * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges * may cause range splits. So add extra slots here. + * + * Exclusion of low 1M may not cause another range split, because the + * range of exclude is [0, 1M] and the condition for splitting a new + * region is that the start, end parameters are both in a certain + * existing region in cmem and cannot be equal to existing region's + * start or end. Obviously, the start of [0, 1M] cannot meet this + * condition. 
+ * + * But in case the low 1M range is changed in the future + * (e.g. to [start, 1M]), add an extra slot. */ - nr_ranges += 2 + crashk_cma_cnt; + nr_ranges += 3 + crashk_cma_cnt; cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; return cmem; } @@ -323,16 +332,20 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_mem *cmem; /* - * Using random kexec_buf for passing dm crypt keys may cause a range - * split. So use two slots here. + * In the current x86 architecture code, the elfheader is always + * allocated at crashk_res.start. But it depends on the allocation + * position of elfheader in crashk_res. To avoid potential out of + * bounds in future, add an extra slot. + * + * And using random kexec_buf for passing dm crypt keys may cause a + * range split too, add another extra slot here. */ - nr_ranges = 2; + nr_ranges = 3; cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return -ENOMEM; cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 24a41f0e0cf1..c3244ac680d1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -16,6 +16,8 @@ #include <linux/kexec.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/libfdt.h> +#include <linux/of_fdt.h> #include <linux/efi.h> #include <linux/random.h> @@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_OF_FLATTREE +static void setup_dtb(struct boot_params *params, + unsigned long params_load_addr, + unsigned int dtb_setup_data_offset) +{ + struct setup_data *sd = (void *)params + dtb_setup_data_offset; + unsigned long setup_data_phys, dtb_len; + + dtb_len = 
fdt_totalsize(initial_boot_params); + sd->type = SETUP_DTB; + sd->len = dtb_len; + + /* Carry over current boot DTB with setup_data */ + memcpy(sd->data, initial_boot_params, dtb_len); + + /* Add setup data */ + setup_data_phys = params_load_addr + dtb_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} +#endif /* CONFIG_OF_FLATTREE */ + static void setup_ima_state(const struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct efi_setup_data); #endif +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) { + setup_dtb(params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); + } else { + pr_debug("Not carrying over DTB, force_dtb = %d\n", + image->force_dtb); + } +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { /* Setup IMA log buffer state */ setup_ima_state(image, params, params_load_addr, @@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel, sizeof(struct setup_data) + RNG_SEED_LENGTH; +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) + kbuf.bufsz += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); @@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct kho_data); - params = kzalloc(kbuf.bufsz, GFP_KERNEL); + params = kvzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ldata; out_free_params: - kfree(params); + kvfree(params); return ERR_PTR(ret); } @@ -659,7 +700,7 @@ static int 
bzImage64_cleanup(void *loader_data) if (!ldata) return 0; - kfree(ldata->bootparams_buf); + kvfree(ldata->bootparams_buf); ldata->bootparams_buf = NULL; return 0; diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c index a00e07b853f2..a65c2d5b9e7b 100644 --- a/drivers/firmware/efi/efi-init.c +++ b/drivers/firmware/efi/efi-init.c @@ -12,6 +12,7 @@ #include <linux/efi.h> #include <linux/fwnode.h> #include <linux/init.h> +#include <linux/kexec_handover.h> #include <linux/memblock.h> #include <linux/mm_types.h> #include <linux/of.h> @@ -164,12 +165,32 @@ static __init void reserve_regions(void) pr_info("Processing EFI memory map:\n"); /* - * Discard memblocks discovered so far: if there are any at this - * point, they originate from memory nodes in the DT, and UEFI - * uses its own memory map instead. + * Discard memblocks discovered so far except for KHO scratch + * regions. Most memblocks at this point originate from memory nodes + * in the DT and UEFI uses its own memory map instead. However, if + * KHO is enabled, scratch regions, which are known-good memory, + * must be preserved. */ memblock_dump_all(); - memblock_remove(0, PHYS_ADDR_MAX); + + if (is_kho_boot()) { + struct memblock_region *r; + + /* Remove all non-KHO regions */ + for_each_mem_region(r) { + if (!memblock_is_kho_scratch(r)) { + memblock_remove(r->base, r->size); + r--; + } + } + } else { + /* + * KHO is disabled. Discard memblocks discovered so far: + * if there are any at this point, they originate from memory + * nodes in the DT, and UEFI uses its own memory map instead. 
+ */ + memblock_remove(0, PHYS_ADDR_MAX); + } for_each_efi_memory_desc(md) { paddr = md->phys_addr; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 5940e2eb9231..96cc9b389246 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -279,14 +279,7 @@ static int fbcon_get_rotate(struct fb_info *info) static bool fbcon_skip_panic(struct fb_info *info) { -/* panic_cpu is not exported, and can't be used if built as module. Use - * oops_in_progress instead, but non-fatal oops won't be printed. - */ -#if defined(MODULE) - return (info->skip_panic && unlikely(oops_in_progress)); -#else - return (info->skip_panic && unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID)); -#endif + return (info->skip_panic && unlikely(panic_in_progress())); } static inline bool fbcon_is_active(struct vc_data *vc, struct fb_info *info) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 12daa85ed941..ca54bf24b719 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -421,7 +421,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; vmf = vmf_insert_mixed(vma, vma->vm_start + off, - address + off); + PHYS_PFN(address + off)); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index acbec5bdd521..92b091783966 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1209,7 +1209,7 @@ EXPORT_SYMBOL_GPL(fat_alloc_new_dir); static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, int *nr_cluster, struct msdos_dir_entry **de, - struct buffer_head **bh, loff_t *i_pos) + struct buffer_head **bh) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -1269,7 +1269,6 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, get_bh(bhs[n]); *bh = bhs[n]; *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); - *i_pos = 
fat_make_i_pos(sb, *bh, *de); /* Second stage: clear the rest of cluster, and write outs */ err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE); @@ -1298,7 +1297,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ struct msdos_dir_entry *de; int err, free_slots, i, nr_bhs; - loff_t pos, i_pos; + loff_t pos; sinfo->nr_slots = nr_slots; @@ -1386,7 +1385,7 @@ found: * add the cluster to dir. */ cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster, - &de, &bh, &i_pos); + &de, &bh); if (cluster < 0) { err = cluster; goto error_remove; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 821cb7874685..162711cc5b20 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6928,8 +6928,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end, out: if (ret != 0) { - if (folios) - ocfs2_unlock_and_free_folios(folios, numfolios); + ocfs2_unlock_and_free_folios(folios, numfolios); numfolios = 0; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 86bb1a03bcc1..4145e06d2c08 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1477,7 +1477,6 @@ way_up_top: goto send_response; } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { spin_unlock(&res->spinlock); - // mlog(0, "node %u is the master\n", res->owner); response = DLM_MASTER_RESP_NO; if (mle) kmem_cache_free(dlm_mle_cache, mle); @@ -1493,7 +1492,6 @@ way_up_top: BUG(); } - // mlog(0, "lockres is in progress...\n"); spin_lock(&dlm->master_lock); found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { @@ -1503,8 +1501,6 @@ way_up_top: set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->type == DLM_MLE_BLOCK) { - // mlog(0, "this node is waiting for " - // "lockres to be mastered\n"); response = DLM_MASTER_RESP_NO; } else if (tmpmle->type == DLM_MLE_MIGRATION) { mlog(0, "node %u is master, but trying to migrate to " @@ -1531,8 +1527,6 
@@ way_up_top: } else response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "this node is attempting to " - // "master lockres\n"); response = DLM_MASTER_RESP_MAYBE; } if (set_maybe) @@ -1559,7 +1553,6 @@ way_up_top: found = dlm_find_mle(dlm, &tmpmle, name, namelen); if (!found) { /* this lockid has never been seen on this node yet */ - // mlog(0, "no mle found\n"); if (!mle) { spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -1573,8 +1566,6 @@ way_up_top: goto way_up_top; } - // mlog(0, "this is second time thru, already allocated, " - // "add the block.\n"); dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); __dlm_insert_mle(dlm, mle); @@ -1897,8 +1888,6 @@ ok: spin_unlock(&res->spinlock); } - // mlog(0, "woo! got an assert_master from node %u!\n", - // assert->node_idx); if (mle) { int extra_ref = 0; int nn = -1; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 00f52812dbb0..843ee02bd85f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -464,7 +464,6 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { - // mlog(0, "nothing to recover! sleeping now!\n"); spin_unlock(&dlm->spinlock); /* return to main thread loop and sleep. 
*/ return 0; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 6c4f78f473fb..fcc89856ab95 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1495,6 +1495,14 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } + if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) { + rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(di->i_suballoc_slot)); + goto bail; + } + rc = 0; bail: diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index db14c92302a1..b6864602814c 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -358,13 +358,11 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, i); + int len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, i); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; @@ -651,12 +649,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto bail; } } else { - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, - OCFS2_INVALID_SLOT); + int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), + type, OCFS2_INVALID_SLOT); status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, - namebuf, - strlen(namebuf), - &blkno); + namebuf, len, &blkno); if (status < 0) { status = -ENOENT; goto bail; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index cbe2f8ed8897..86f2631e6360 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -364,7 +364,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, int *vict_bit, struct buffer_head **ret_bh) { - int ret, i, bits_per_unit = 0; + int ret, i, len, bits_per_unit = 0; u64 blkno; char namebuf[40]; @@ -375,9 +375,9 @@ static 
int ocfs2_find_victim_alloc_group(struct inode *inode, struct ocfs2_dinode *ac_dinode; struct ocfs2_group_desc *bg; - ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); - ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); + ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno); + if (ret) { ret = -ENOENT; goto out; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e8e94599e907..ae0e44e5f2ad 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -614,7 +614,7 @@ struct ocfs2_super_block { __le16 s_reserved0; __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. * s_uuid_hash serves as seed[3]. */ -/*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ +/*C8*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 765105f1ff8a..be0a5758bd40 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1011,6 +1011,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) printk(KERN_ERR "ocfs2: Could not determine" " locking version\n"); user_cluster_disconnect(conn); + lc = NULL; goto out; } wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 53a945da873b..d53a6cc866be 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -127,14 +127,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, char namebuf[40]; struct inode *inode = NULL; u64 blkno; - int status = 0; + int len, status = 0; - ocfs2_sprintf_system_inode_name(namebuf, - sizeof(namebuf), - type, slot); + len = ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); - status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, - strlen(namebuf), &blkno); + status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, + 
namebuf, len, &blkno); if (status < 0) { goto bail; } diff --git a/fs/proc/base.c b/fs/proc/base.c index b997ceef9135..6299878e3d97 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3947,7 +3947,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) tid = task_pid_nr_ns(task, ns); if (!tid) continue; /* The task has just exited. */ - len = snprintf(name, sizeof(name), "%u", tid); + len = snprintf(name, sizeof(name), "%d", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index ce7d661d5ad8..1582e0637a7e 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -307,7 +307,8 @@ static int fill_meta_index(struct inode *inode, int index, all_done: *index_block = cur_index_block; *index_offset = cur_offset; - *data_block = cur_data_block; + if (data_block) + *data_block = cur_data_block; /* * Scale cache index (cache slot entry) to index @@ -324,17 +325,15 @@ failed: * Get the on-disk location and compressed size of the datablock * specified by index. Fill_meta_index() does most of the work. */ -static int read_blocklist(struct inode *inode, int index, u64 *block) +static int read_blocklist_ptrs(struct inode *inode, int index, u64 *start, + int *offset, u64 *block) { - u64 start; long long blks; - int offset; __le32 size; - int res = fill_meta_index(inode, index, &start, &offset, block); + int res = fill_meta_index(inode, index, start, offset, block); - TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" - " 0x%x, block 0x%llx\n", res, index, start, offset, - *block); + TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset 0x%x, block 0x%llx\n", + res, index, *start, *offset, block ? *block : 0); if (res < 0) return res; @@ -346,22 +345,31 @@ static int read_blocklist(struct inode *inode, int index, u64 *block) * extra block indexes needed. 
*/ if (res < index) { - blks = read_indexes(inode->i_sb, index - res, &start, &offset); + blks = read_indexes(inode->i_sb, index - res, start, offset); if (blks < 0) return (int) blks; - *block += blks; + if (block) + *block += blks; } /* * Read length of block specified by index. */ - res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, + res = squashfs_read_metadata(inode->i_sb, &size, start, offset, sizeof(size)); if (res < 0) return res; return squashfs_block_size(size); } +static inline int read_blocklist(struct inode *inode, int index, u64 *block) +{ + u64 start; + int offset; + + return read_blocklist_ptrs(inode, index, &start, &offset, block); +} + static bool squashfs_fill_page(struct folio *folio, struct squashfs_cache_entry *buffer, size_t offset, size_t avail) @@ -658,7 +666,114 @@ skip_pages: kfree(pages); } +static loff_t seek_hole_data(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 start, index = offset >> msblk->block_log; + u64 file_end = (i_size_read(inode) + msblk->block_size - 1) >> msblk->block_log; + int s_offset, length; + __le32 *blist = NULL; + + /* reject offset if negative or beyond file end */ + if ((unsigned long long)offset >= i_size_read(inode)) + return -ENXIO; + + /* is offset within tailend and is tailend packed into a fragment? 
*/ + if (index + 1 == file_end && + squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { + if (whence == SEEK_DATA) + return offset; + + /* there is an implicit hole at the end of any file */ + return i_size_read(inode); + } + + length = read_blocklist_ptrs(inode, index, &start, &s_offset, NULL); + if (length < 0) + return length; + + /* nothing more to do if offset matches desired whence value */ + if ((length == 0 && whence == SEEK_HOLE) || + (length && whence == SEEK_DATA)) + return offset; + + /* skip scanning forwards if we're at file end */ + if (++ index == file_end) + goto not_found; + + blist = kmalloc(SQUASHFS_SCAN_INDEXES << 2, GFP_KERNEL); + if (blist == NULL) { + ERROR("%s: Failed to allocate block_list\n", __func__); + return -ENOMEM; + } + + while (index < file_end) { + int i, indexes = min(file_end - index, SQUASHFS_SCAN_INDEXES); + + offset = squashfs_read_metadata(sb, blist, &start, &s_offset, indexes << 2); + if (offset < 0) + goto finished; + + for (i = 0; i < indexes; i++) { + length = squashfs_block_size(blist[i]); + if (length < 0) { + offset = length; + goto finished; + } + + /* does this block match desired whence value? 
*/ + if ((length == 0 && whence == SEEK_HOLE) || + (length && whence == SEEK_DATA)) { + offset = (index + i) << msblk->block_log; + goto finished; + } + } + + index += indexes; + } + +not_found: + /* whence value determines what happens */ + if (whence == SEEK_DATA) + offset = -ENXIO; + else + /* there is an implicit hole at the end of any file */ + offset = i_size_read(inode); + +finished: + kfree(blist); + return offset; +} + +static loff_t squashfs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + + switch (whence) { + default: + return generic_file_llseek(file, offset, whence); + case SEEK_DATA: + case SEEK_HOLE: + offset = seek_hole_data(file, offset, whence); + break; + } + + if (offset < 0) + return offset; + + return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); +} + const struct address_space_operations squashfs_aops = { .read_folio = squashfs_read_folio, .readahead = squashfs_readahead }; + +const struct file_operations squashfs_file_operations = { + .llseek = squashfs_llseek, + .read_iter = generic_file_read_iter, + .mmap_prepare = generic_file_readonly_mmap_prepare, + .splice_read = filemap_splice_read +}; diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index d5918eba27e3..cceae3b78698 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -68,6 +68,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; + /* File type must not be set at this moment, for it will later be set by the caller. 
*/ + if (inode->i_mode & S_IFMT) + err = -EIO; + return err; } @@ -140,8 +144,17 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; + inode->i_size = le32_to_cpu(sqsh_ino->file_size); frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { + /* + * the file cannot have a fragment (tailend) and have a + * file size a multiple of the block size + */ + if ((inode->i_size & (msblk->block_size - 1)) == 0) { + err = -EINVAL; + goto failed_read; + } frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { @@ -155,8 +168,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) } set_nlink(inode, 1); - inode->i_size = le32_to_cpu(sqsh_ino->file_size); - inode->i_fop = &generic_ro_fops; + inode->i_fop = &squashfs_file_operations; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; squashfs_i(inode)->fragment_block = frag_blk; @@ -165,6 +177,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; + squashfs_i(inode)->parent = 0; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " @@ -183,8 +196,21 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; + inode->i_size = le64_to_cpu(sqsh_ino->file_size); + if (inode->i_size < 0) { + err = -EINVAL; + goto failed_read; + } frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { + /* + * the file cannot have a fragment (tailend) and have a + * file size a multiple of the block size + */ + if ((inode->i_size & (msblk->block_size - 1)) == 0) { + err = -EINVAL; + goto failed_read; + } frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { @@ -199,9 +225,8 
@@ int squashfs_read_inode(struct inode *inode, long long ino) xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); - inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; - inode->i_fop = &generic_ro_fops; + inode->i_fop = &squashfs_file_operations; inode->i_mode |= S_IFREG; inode->i_blocks = (inode->i_size - le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; @@ -212,6 +237,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; + squashfs_i(inode)->parent = 0; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " @@ -292,6 +318,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; squashfs_i(inode)->offset = offset; + squashfs_i(inode)->parent = 0; if (type == SQUASHFS_LSYMLINK_TYPE) { __le32 xattr; @@ -329,6 +356,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + squashfs_i(inode)->parent = 0; TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); @@ -353,6 +381,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + squashfs_i(inode)->parent = 0; TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); @@ -373,6 +402,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_mode |= S_IFSOCK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); + squashfs_i(inode)->parent = 0; break; } case SQUASHFS_LFIFO_TYPE: @@ -392,6 
+422,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); + squashfs_i(inode)->parent = 0; break; } default: diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 218868b20f16..4851bd964502 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -107,6 +107,7 @@ extern const struct address_space_operations squashfs_aops; /* inode.c */ extern const struct inode_operations squashfs_inode_ops; +extern const struct file_operations squashfs_file_operations; /* namei.c */ extern const struct inode_operations squashfs_dir_inode_ops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 95f8e8901768..a955d9369749 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -208,6 +208,7 @@ static inline int squashfs_block_size(__le32 raw) #define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) #define SQUASHFS_META_ENTRIES 127 #define SQUASHFS_META_SLOTS 8 +#define SQUASHFS_SCAN_INDEXES 1024 struct meta_entry { u64 data_block; diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index 2c82d6f2a456..8e497ac07b9a 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -16,6 +16,7 @@ struct squashfs_inode_info { u64 xattr; unsigned int xattr_size; int xattr_count; + int parent; union { struct { u64 fragment_block; @@ -27,7 +28,6 @@ struct squashfs_inode_info { u64 dir_idx_start; int dir_idx_offset; int dir_idx_cnt; - int parent; }; }; struct inode vfs_inode; diff --git a/include/linux/idr.h b/include/linux/idr.h index 2267902d29a7..789e23e67444 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -334,14 +334,6 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } -/* - * ida_simple_get() and ida_simple_remove() are deprecated. 
Use - * ida_alloc() and ida_free() instead respectively. - */ -#define ida_simple_get(ida, start, end, gfp) \ - ida_alloc_range(ida, start, (end) - 1, gfp) -#define ida_simple_remove(ida, id) ida_free(ida, id) - static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 989315dabb86..5b46924fdff5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -164,11 +164,23 @@ extern int root_mountflags; extern bool early_boot_irqs_disabled; -/* - * Values used for system_state. Ordering of the states must not be changed +/** + * enum system_states - Values used for system_state. + * + * @SYSTEM_BOOTING: %0, no init needed + * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU + * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running + * @SYSTEM_RUNNING: system is up and running + * @SYSTEM_HALT: system entered clean system halt state + * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state + * @SYSTEM_RESTART: system entered emergency power off or normal restart + * @SYSTEM_SUSPEND: system entered suspend or hibernate state + * + * Note: + * Ordering of the states must not be changed * as code checks for <, <=, >, >= STATE. 
*/ -extern enum system_states { +enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_FREEING_INITMEM, @@ -177,7 +189,8 @@ extern enum system_states { SYSTEM_POWER_OFF, SYSTEM_RESTART, SYSTEM_SUSPEND, -} system_state; +}; +extern enum system_states system_state; /* * General tracing related utility functions - trace_printk(), diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 39fe3e6cd282..ff7e231b0485 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -395,6 +395,9 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; + + /* Force carrying over the DTB from the current boot */ + bool force_dtb; #endif #ifdef CONFIG_CRASH_HOTPLUG @@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ - KEXEC_FILE_NO_CMA) + KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 348844cffb13..559d13a3bc44 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -40,6 +40,7 @@ struct kho_serialization; #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); +bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); int kho_preserve_phys(phys_addr_t phys, size_t size); @@ -60,6 +61,11 @@ static inline bool kho_is_enabled(void) return false; } +static inline bool is_kho_boot(void) +{ + return false; +} + static inline int kho_preserve_folio(struct folio *folio) { return -EOPNOTSUPP; diff --git a/include/linux/list.h b/include/linux/list.h index 7f7657e41620..5bfda2f91fca 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -20,8 +20,16 @@ * using the generic single-entry routines. 
*/ +/** + * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself + * @name: name of the list_head + */ #define LIST_HEAD_INIT(name) { &(name), &(name) } +/** + * LIST_HEAD - definition of a &struct list_head with initialization values + * @name: name of the list_head + */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 3a25122d83e2..6907aedc4f74 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -349,6 +349,19 @@ static inline void kernel_param_unlock(struct module *mod) __module_param_call("", name, &param_ops_##type, &var, perm, \ -1, KERNEL_PARAM_FL_UNSAFE) +/** + * __core_param_cb - similar to core_param, with set/get ops instead of a type. + * @name: the name of the cmdline and sysfs parameter (often the same as the variable) + * @arg: the variable + * @ops: the set & get operations for this parameter. + * @perm: visibility in sysfs + * + * Ideally this should be called 'core_param_cb', but the name has been + * used for module core parameter, so add the '__' prefix + */ +#define __core_param_cb(name, ops, arg, perm) \ + __module_param_call("", name, ops, arg, perm, -1, 0) + #endif /* !MODULE */ /** diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 615a560d9edb..f3b13da78aac 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -103,7 +103,7 @@ struct nvmem_cell_info { * * Note: A default "nvmem<id>" name will be assigned to the device if * no name is specified in its configuration. In such case "<id>" is - * generated with ida_simple_get() and provided id field is ignored. + * generated with ida_alloc() and provided id field is ignored. * * Note: Specifying name and setting id to -1 implies a unique device * whose name is provided as-is (kept unaltered). 
diff --git a/include/linux/panic.h b/include/linux/panic.h index 7be742628c25..6f972a66c13e 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -43,6 +43,12 @@ void abort(void); extern atomic_t panic_cpu; #define PANIC_CPU_INVALID -1 +bool panic_try_start(void); +void panic_reset(void); +bool panic_in_progress(void); +bool panic_on_this_cpu(void); +bool panic_on_other_cpu(void); + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. diff --git a/include/linux/printk.h b/include/linux/printk.h index 5d22b803f51e..45c663124c9b 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -330,8 +330,6 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress) #endif -bool this_cpu_in_panic(void); - #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 34d6a0e108c3..525aa2a632b2 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -210,9 +210,8 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * - * Nests both inside and outside of read_lock(&tasklist_lock). - * It must not be nested with write_lock_irq(&tasklist_lock), - * neither inside nor outside. + * Nests inside of read_lock(&tasklist_lock). It must not be nested with + * write_lock_irq(&tasklist_lock), neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 8958ebfcff94..55749cb0b81d 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -22,12 +22,16 @@ * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. 
Ignore the initrd * fd field. + * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new + * kernel on x86. This is already the default behavior on + * some other architectures, like ARM64 and PowerPC. */ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 #define KEXEC_FILE_NO_CMA 0x00000010 +#define KEXEC_FILE_FORCE_DTB 0x00000020 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/init/main.c b/init/main.c index fab4f599c035..07a3116811c5 100644 --- a/init/main.c +++ b/init/main.c @@ -545,6 +545,12 @@ static int __init unknown_bootoption(char *param, char *val, const char *unused, void *arg) { size_t len = strlen(param); + /* + * Well-known bootloader identifiers: + * 1. LILO/Grub pass "BOOT_IMAGE=..."; + * 2. kexec/kdump (kexec-tools) pass "kexec". + */ + const char *bootloader[] = { "BOOT_IMAGE=", "kexec", NULL }; /* Handle params aliased to sysctls */ if (sysctl_is_alias(param)) @@ -552,6 +558,12 @@ static int __init unknown_bootoption(char *param, char *val, repair_env_string(param, val); + /* Handle bootloader identifier */ + for (int i = 0; bootloader[i]; i++) { + if (strstarts(param, bootloader[i])) + return 0; + } + /* Handle obsolete-style parameters */ if (obsolete_checksetup(param)) return 0; diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 1224dd937df0..422270d64820 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -148,6 +148,17 @@ config CRASH_DM_CRYPT_CONFIGS CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that is required to be built-in. +config CRASH_DUMP_KUNIT_TEST + tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS + depends on CRASH_DUMP && KUNIT + default KUNIT_ALL_TESTS + help + This option builds KUnit unit tests for kernel crash dumps. 
The unit + tests will be used to verify the correctness of covered functions and + also prevent any regression. + + If unsure, say N. + config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Makefile b/kernel/Makefile index 41751834e764..df3dd8291bb6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -78,6 +78,7 @@ obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o +obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a4ef79591eb2..3b1c43382eec 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -22,6 +22,7 @@ #include <linux/btf.h> #include <linux/objtool.h> #include <linux/delay.h> +#include <linux/panic.h> #include <asm/page.h> #include <asm/sections.h> @@ -143,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec); __bpf_kfunc void crash_kexec(struct pt_regs *regs) { - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. - */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); @@ -161,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * Reset panic_cpu to allow another panic()/crash_kexec() * call. 
*/ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); + panic_reset(); } } @@ -274,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, return 0; } +/** + * crash_exclude_mem_range - exclude a mem range from existing ranges + * @mem: mem->ranges contains an array of ranges sorted in ascending order + * @mstart: the start of to-be-excluded range + * @mend: the end of to-be-excluded range + * + * If you are unsure if a range split will happen, to avoid function call + * failure because of -ENOMEM, always make sure + * mem->max_nr_ranges == mem->nr_ranges + 1 + * before calling the function each time. + * + * returns 0 if a memory range is excluded successfully + * returns -ENOMEM if mem->ranges doesn't have space to hold split ranges + */ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { @@ -333,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +EXPORT_SYMBOL_GPL(crash_exclude_mem_range); ssize_t crash_get_memory_size(void) { diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c new file mode 100644 index 000000000000..8aadf6801530 --- /dev/null +++ b/kernel/crash_core_test.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <kunit/test.h> +#include <linux/crash_core.h> // For struct crash_mem and struct range if defined there + +// Helper to create and initialize crash_mem +static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges, + unsigned int nr_initial_ranges, + const struct range *initial_ranges) +{ + struct crash_mem *mem; + size_t alloc_size; + + // Check if max_ranges can even hold initial_ranges + if (max_ranges < nr_initial_ranges) { + kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n", + max_ranges, nr_initial_ranges); + return NULL; + } + + alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range); + mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL); + if 
(!mem) { + kunit_err(test, "Failed to allocate crash_mem\n"); + return NULL; + } + + mem->max_nr_ranges = max_ranges; + mem->nr_ranges = nr_initial_ranges; + if (initial_ranges && nr_initial_ranges > 0) { + memcpy(mem->ranges, initial_ranges, + nr_initial_ranges * sizeof(struct range)); + } + + return mem; +} + +// Helper to compare ranges for assertions +static void assert_ranges_equal(struct kunit *test, + const struct range *actual_ranges, + unsigned int actual_nr_ranges, + const struct range *expected_ranges, + unsigned int expected_nr_ranges, + const char *case_name) +{ + unsigned int i; + + KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges, + "%s: Number of ranges mismatch.", case_name); + + for (i = 0; i < expected_nr_ranges; i++) { + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start, + "%s: Range %u start mismatch.", case_name, i); + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end, + "%s: Range %u end mismatch.", case_name, i); + } +} + +// Structure for test parameters +struct exclude_test_param { + const char *description; + unsigned long long exclude_start; + unsigned long long exclude_end; + unsigned int initial_max_ranges; + const struct range *initial_ranges; + unsigned int initial_nr_ranges; + const struct range *expected_ranges; + unsigned int expected_nr_ranges; + int expected_ret; +}; + +static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params) +{ + struct crash_mem *mem; + int ret; + + kunit_info(test, "%s", params->description); + + mem = create_crash_mem(test, params->initial_max_ranges, + params->initial_nr_ranges, params->initial_ranges); + if (!mem) + return; // Error already logged by create_crash_mem or kunit_kzalloc + + ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end); + + KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret, + "%s: Return value mismatch.", params->description); + + if (params->expected_ret == 0) 
{ + assert_ranges_equal(test, mem->ranges, mem->nr_ranges, + params->expected_ranges, params->expected_nr_ranges, + params->description); + } else { + // If an error is expected, nr_ranges might still be relevant to check + // depending on the exact point of failure. For ENOMEM on split, + // nr_ranges shouldn't have changed. + KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges, + mem->nr_ranges, + "%s: Number of ranges mismatch on error.", + params->description); + } +} + +/* + * Test Strategy 1: One to-be-excluded range A and one existing range B. + * + * Exhaust all possibilities of the position of A regarding B. + */ + +static const struct range single_range_b = { .start = 100, .end = 199 }; + +static const struct exclude_test_param exclude_single_range_test_data[] = { + { + .description = "1.1: A is left of B, no overlap", + .exclude_start = 10, .exclude_end = 50, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.2: A's right boundary touches B's left boundary", + .exclude_start = 10, .exclude_end = 99, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.3: A overlaps B's left part", + .exclude_start = 50, .exclude_end = 149, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.4: A is completely inside B", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 119 }, + { .start = 180, .end = 199 } + }, + 
.expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.5: A overlaps B's right part", + .exclude_start = 150, .exclude_end = 249, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.6: A's left boundary touches B's right boundary", + .exclude_start = 200, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.7: A is right of B, no overlap", + .exclude_start = 250, .exclude_end = 300, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.8: A completely covers B and extends beyond", + .exclude_start = 50, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.9: A covers B and extends to the left", + .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.10: A covers B and extends to the right", + .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.11: A is identical to B", + .exclude_start = 100, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, 
.initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.12: A is a point, left of B, no overlap", + .exclude_start = 10, .exclude_end = 10, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.13: A is a point, at start of B", + .exclude_start = 100, .exclude_end = 100, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.14: A is a point, in middle of B (causes split)", + .exclude_start = 150, .exclude_end = 150, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 149 }, + { .start = 151, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.15: A is a point, at end of B", + .exclude_start = 199, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.16: A is a point, right of B, no overlap", + .exclude_start = 250, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + // ENOMEM case for single range split + { + .description = "1.17: A completely inside B (split), no space (ENOMEM)", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 1, // Not enough for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges 
= NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 1, // Should remain unchanged + .expected_ret = -ENOMEM, + }, +}; + + +static void exclude_single_range_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description); + run_exclude_test_case(test, &exclude_single_range_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * Test Strategy 2: Regression test. + */ + +static const struct exclude_test_param exclude_range_regression_test_data[] = { + // Test data from commit a2e9a95d2190 + { + .description = "2.1: exclude low 1M", + .exclude_start = 0, .exclude_end = (1 << 20) - 1, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 0, .end = 0x3efff }, + { .start = 0x3f000, .end = 0x3ffff }, + { .start = 0x40000, .end = 0x9ffff } + }, + .initial_nr_ranges = 3, + .expected_nr_ranges = 0, + .expected_ret = 0, + }, + // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u + { + .description = "2.2: when range out of bound", + .exclude_start = 100, .exclude_end = 200, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 1, .end = 299 }, + { .start = 401, .end = 1000 }, + { .start = 1001, .end = 2000 } + }, + .initial_nr_ranges = 3, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 3, // Should remain unchanged + .expected_ret = -ENOMEM + }, + +}; + + +static void exclude_range_regression_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description); + run_exclude_test_case(test, &exclude_range_regression_test_data[i]); + // KUnit will stop on first 
KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * KUnit Test Suite + */ +static struct kunit_case crash_exclude_mem_range_test_cases[] = { + KUNIT_CASE(exclude_single_range_test), + KUNIT_CASE(exclude_range_regression_test), + {} +}; + +static struct kunit_suite crash_exclude_mem_range_suite = { + .name = "crash_exclude_mem_range_tests", + .test_cases = crash_exclude_mem_range_test_cases, + // .init and .exit can be NULL if not needed globally for the suite +}; + +kunit_test_suite(crash_exclude_mem_range_suite); + +MODULE_DESCRIPTION("crash dump KUnit test suite"); +MODULE_LICENSE("GPL"); diff --git a/kernel/fork.c b/kernel/fork.c index f1688b3e79a6..3da0f08615a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2132,9 +2132,7 @@ __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; -#ifdef CONFIG_LOCKDEP lockdep_init_task(p); -#endif p->blocked_on = NULL; /* not blocked yet */ @@ -2547,11 +2545,9 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| - CLONE_IO; + CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, @@ -2663,9 +2659,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, @@ -2681,9 +2676,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) 
{ struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8708a1205f82..b2c1f14b8129 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -95,9 +95,41 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; +static bool task_is_hung(struct task_struct *t, unsigned long timeout) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + unsigned int state = READ_ONCE(t->__state); + + /* + * skip the TASK_KILLABLE tasks -- these can be killed + * skip the TASK_IDLE tasks -- those are genuinely idle + * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer + */ + if (!(state & TASK_UNINTERRUPTIBLE) || + (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN))) + return false; + + /* + * When a freshly created task is scheduled once, changes its state to + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it + * mustn't be checked. + */ + if (unlikely(!switch_count)) + return false; + + if (switch_count != t->last_switch_count) { + t->last_switch_count = switch_count; + t->last_switch_time = jiffies; + return false; + } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return false; + + return true; +} #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER -static void debug_show_blocker(struct task_struct *task) +static void debug_show_blocker(struct task_struct *task, unsigned long timeout) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; @@ -174,41 +206,21 @@ static void debug_show_blocker(struct task_struct *task) t->pid, rwsem_blocked_by); break; } - sched_show_task(t); + /* Avoid duplicated task dump, skip if the task is also hung. 
*/ + if (!task_is_hung(t, timeout)) + sched_show_task(t); return; } } #else -static inline void debug_show_blocker(struct task_struct *task) +static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) { } #endif static void check_hung_task(struct task_struct *t, unsigned long timeout) { - unsigned long switch_count = t->nvcsw + t->nivcsw; - - /* - * Ensure the task is not frozen. - * Also, skip vfork and any other user process that freezer should skip. - */ - if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) - return; - - /* - * When a freshly created task is scheduled once, changes its state to - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it - * musn't be checked. - */ - if (unlikely(!switch_count)) - return; - - if (switch_count != t->last_switch_count) { - t->last_switch_count = switch_count; - t->last_switch_time = jiffies; - return; - } - if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + if (!task_is_hung(t, timeout)) return; /* @@ -243,7 +255,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_blocker(t); + debug_show_blocker(t, timeout); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) @@ -299,7 +311,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { - unsigned int state; if (!max_count--) goto unlock; @@ -308,15 +319,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* - * skip the TASK_KILLABLE tasks -- these can be killed - * skip the TASK_IDLE tasks -- those are genuinely idle - */ - state = READ_ONCE(t->__state); - if ((state & TASK_UNINTERRUPTIBLE) && - !(state & TASK_WAKEKILL) && - !(state & TASK_NOLOAD)) - check_hung_task(t, timeout); + + check_hung_task(t, 
timeout); } unlock: rcu_read_unlock(); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index cf4af5728307..2b082a7e24a2 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void) char namebuf[KSYM_NAME_LEN]; struct test_stat *stat, *stat2; - stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL); if (!stat) return -ENOMEM; stat2 = stat + 1; diff --git a/kernel/kcov.c b/kernel/kcov.c index 1d85597057e1..6563141f5de9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, memcpy(dst_entries, src_entries, bytes_to_move); entries_moved = bytes_to_move >> entry_size_log; + /* + * A write memory barrier is required here, to ensure + * that the writes from the memcpy() are visible before + * the count is updated. Without this, it is possible for + * a user to observe a new count value but stale + * coverage data. 
+ */ + smp_wmb(); + switch (mode) { case KCOV_MODE_TRACE_PC: WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 31203f0bacaf..fa00b239c5d9 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -233,7 +233,6 @@ struct kimage *do_kimage_alloc_init(void) if (!image) return NULL; - image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 555488eb1a18..5083c68c3a4e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -988,6 +988,26 @@ static const void *kho_get_fdt(void) } /** + * is_kho_boot - check if current kernel was booted via KHO-enabled + * kexec + * + * This function checks if the current kernel was loaded through a kexec + * operation with KHO enabled, by verifying that a valid KHO FDT + * was passed. + * + * Note: This function returns reliable results only after + * kho_populate() has been called during early boot. Before that, + * it may return false even if KHO data is present. + * + * Return: true if booted via KHO-enabled kexec, false otherwise + */ +bool is_kho_boot(void) +{ + return !!kho_get_fdt(); +} +EXPORT_SYMBOL_GPL(is_kho_boot); + +/** * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. * @name: the name of the sub FDT passed to kho_add_subtree(). * @phys: if found, the physical address of the sub FDT is stored in @phys. 
@@ -1269,7 +1289,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_out.finalized) return 0; image->kho.fdt = page_to_phys(kho_out.ser.fdt); diff --git a/kernel/panic.c b/kernel/panic.c index 72fcbb5a071b..24cc3eec1805 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -53,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; #define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ -int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; +int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS); static unsigned long tainted_mask = IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; @@ -67,6 +67,7 @@ static unsigned int warn_limit __read_mostly; static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; +static bool panic_this_cpu_backtrace_printed; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -77,6 +78,11 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +static void panic_print_deprecated(void) +{ + pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n"); +} + #ifdef CONFIG_SYSCTL /* @@ -125,7 +131,7 @@ static int proc_taint(const struct ctl_table *table, int write, static int sysctl_panic_print_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + panic_print_deprecated(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } @@ -294,6 +300,59 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +bool panic_try_start(void) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). 
Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); +} +EXPORT_SYMBOL(panic_try_start); + +void panic_reset(void) +{ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_reset); + +bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_in_progress); + +/* Return true if a panic is in progress on the current CPU. */ +bool panic_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); +} +EXPORT_SYMBOL(panic_on_this_cpu); + +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool panic_on_other_cpu(void) +{ + return (panic_in_progress() && !panic_on_this_cpu()); +} +EXPORT_SYMBOL(panic_on_other_cpu); + /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. 
If another CPU already panicked, loop in @@ -302,15 +361,9 @@ atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); */ void nmi_panic(struct pt_regs *regs, const char *msg) { - int old_cpu, this_cpu; - - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) + if (panic_try_start()) panic("%s", msg); - else if (old_cpu != this_cpu) + else if (panic_on_other_cpu()) nmi_panic_self_stop(regs); } EXPORT_SYMBOL(nmi_panic); @@ -328,6 +381,19 @@ void check_panic_on_warn(const char *origin) origin, limit); } +static void panic_trigger_all_cpu_backtrace(void) +{ + /* Temporarily allow non-panic CPUs to write their backtraces. */ + panic_triggering_all_cpu_backtrace = true; + + if (panic_this_cpu_backtrace_printed) + trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id()); + else + trigger_all_cpu_backtrace(); + + panic_triggering_all_cpu_backtrace = false; +} + /* * Helper that triggers the NMI backtrace (if set in panic_print) * and then performs the secondary CPUs shutdown - we cannot have @@ -335,12 +401,8 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & SYS_INFO_ALL_CPU_BT) { - /* Temporary allow non-panic CPUs to write their backtraces. 
*/ - panic_triggering_all_cpu_backtrace = true; - trigger_all_cpu_backtrace(); - panic_triggering_all_cpu_backtrace = false; - } + if (panic_print & SYS_INFO_ALL_CPU_BT) + panic_trigger_all_cpu_backtrace(); /* * Note that smp_send_stop() is the usual SMP shutdown function, @@ -368,7 +430,6 @@ void vpanic(const char *fmt, va_list args) static char buf[1024]; long i, i_next = 0, len; int state = 0; - int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; if (panic_on_warn) { @@ -405,13 +466,10 @@ void vpanic(const char *fmt, va_list args) * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* go ahead */ - } else if (old_cpu != this_cpu) + } else if (panic_on_other_cpu()) panic_smp_self_stop(); console_verbose(); @@ -422,13 +480,15 @@ void vpanic(const char *fmt, va_list args) buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + panic_this_cpu_backtrace_printed = true; + } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); -#endif + panic_this_cpu_backtrace_printed = true; + } /* * If kgdb is enabled, give it a chance to run before we stop all @@ -937,12 +997,29 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); -core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); 
core_param(panic_console_replay, panic_console_replay, bool, 0644); +static int panic_print_set(const char *val, const struct kernel_param *kp) +{ + panic_print_deprecated(); + return param_set_ulong(val, kp); +} + +static int panic_print_get(char *val, const struct kernel_param *kp) +{ + panic_print_deprecated(); + return param_get_ulong(val, kp); +} + +static const struct kernel_param_ops panic_print_ops = { + .set = panic_print_set, + .get = panic_print_get, +}; +__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); + static int __init oops_setup(char *s) { if (!s) diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index ef282001f200..f72bbfa266d6 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -332,7 +332,6 @@ struct printk_message { unsigned long dropped; }; -bool other_cpu_in_panic(void); bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_supress); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 646801813415..558ef3177976 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -12,6 +12,7 @@ #include <linux/irqflags.h> #include <linux/kthread.h> #include <linux/minmax.h> +#include <linux/panic.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/slab.h> @@ -254,7 +255,7 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } @@ -309,7 +310,7 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. * Event #4 occurs when a non-panic CPU reacquires. 
- * Event #5 is not possible due to the other_cpu_in_panic() check + * Event #5 is not possible due to the panic_on_other_cpu() check * in nbcon_context_try_acquire_handover(). */ @@ -348,7 +349,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state new; /* Note that the caller must still remove the request! */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* @@ -446,7 +447,7 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* Handover is not possible on the same CPU. */ @@ -589,7 +590,6 @@ static struct printk_buffers panic_nbcon_pbufs; */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { - unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; int err; @@ -614,7 +614,7 @@ out: /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. 
*/ - if (atomic_read(&panic_cpu) == cpu) + if (panic_on_this_cpu()) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; @@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; - if (this_cpu_in_panic()) + if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0efbcdda9aab..5aee9ffb16b9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> +#include <linux/panic.h> #include <linux/uaccess.h> #include <asm/sections.h> @@ -345,34 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -static bool panic_in_progress(void) -{ - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); -} - -/* Return true if a panic is in progress on the current CPU. */ -bool this_cpu_in_panic(void) -{ - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); -} - -/* - * Return true if a panic is in progress on a remote CPU. - * - * On true, the local CPU should immediately release any printing resources - * that may be needed by the panic CPU. - */ -bool other_cpu_in_panic(void) -{ - return (panic_in_progress() && !this_cpu_in_panic()); -} - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -2407,7 +2380,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. 
*/ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; @@ -2843,7 +2816,7 @@ void console_lock(void) might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ - while (other_cpu_in_panic()) + while (panic_on_other_cpu()) msleep(1000); down_console_sem(); @@ -2863,7 +2836,7 @@ EXPORT_SYMBOL(console_lock); int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return 0; if (down_trylock_console_sem()) return 0; @@ -3243,7 +3216,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) goto abandon; if (do_cond_resched) diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index d9fb053cff67..e2a1b2d34d2b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && + if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; diff --git a/kernel/sys.c b/kernel/sys.c index a46d9b75880b..8b58eece4e58 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1734,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; + bool need_tasklist; int ret; if (old_rlim) @@ -1760,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, get_task_struct(tsk); rcu_read_unlock(); - ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, - old_rlim ? 
&old : NULL); + need_tasklist = !same_thread_group(tsk, current); + if (need_tasklist) { + /* + * Ensure we can't race with group exit or de_thread(), + * so tsk->group_leader can't be freed or changed until + * read_unlock(tasklist_lock) below. + */ + read_lock(&tasklist_lock); + if (!pid_alive(tsk)) + ret = -ESRCH; + } + + if (!ret) { + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + } + + if (need_tasklist) + read_unlock(&tasklist_lock); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); @@ -2515,7 +2533,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = -EINVAL; break; } + /* + * Ensure that either: + * + * 1. Subsequent getppid() calls reflect the parent process having died. + * 2. forget_original_parent() will send the new me->pdeath_signal. + * + * Also prevent the read of me->pdeath_signal from being a data race. + */ + read_lock(&tasklist_lock); me->pdeath_signal = arg2; + read_unlock(&tasklist_lock); break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 80b56c002c7f..5b62d1002783 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -425,7 +425,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail); */ static u16 get_16bit_precision(u64 data_ns) { - return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ + /* + * 2^24ns ~= 16.8ms + * Round to the nearest multiple of 16.8 milliseconds. + */ + return (data_ns + (1 << 23)) >> 24LL; } static void update_cpustat(void) @@ -444,6 +448,14 @@ static void update_cpustat(void) old_stat = __this_cpu_read(cpustat_old[i]); new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); + /* + * Since we use 16-bit precision, the raw data will undergo + * integer division, which may sometimes result in data loss, + * and then result might exceed 100%. 
To avoid confusion, + * we enforce a 100% display cap when calculations exceed this threshold. + */ + if (util > 100) + util = 100; __this_cpu_write(cpustat_util[tail][i], util); __this_cpu_write(cpustat_old[i], new_stat); } @@ -455,17 +467,17 @@ static void print_cpustat(void) { int i, group; u8 tail = __this_cpu_read(cpustat_tail); - u64 sample_period_second = sample_period; + u64 sample_period_msecond = sample_period; - do_div(sample_period_second, NSEC_PER_SEC); + do_div(sample_period_msecond, NSEC_PER_MSEC); /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ - printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", - smp_processor_id(), sample_period_second); + printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n", + smp_processor_id(), sample_period_msecond); for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { group = (tail + i) % NUM_SAMPLE_PERIODS; @@ -740,6 +752,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (!watchdog_enabled) return HRTIMER_NORESTART; + /* + * Skip the buddy check if a panic is in progress + */ + if (panic_in_progress()) + return HRTIMER_NORESTART; + watchdog_hardlockup_kick(); /* kick the softlockup detector */ diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 9c58f5b4381d..d3ca70e3c256 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "NMI watchdog: " fmt +#include <linux/panic.h> #include <linux/nmi.h> #include <linux/atomic.h> #include <linux/module.h> @@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; + if (panic_in_progress()) + return; + if (!watchdog_check_timestamp()) return; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 
cab4c7b27e54..3034e294d50d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1067,12 +1067,6 @@ config PANIC_ON_OOPS Say N if unsure. -config PANIC_ON_OOPS_VALUE - int - range 0 1 - default 0 if !PANIC_ON_OOPS - default 1 if PANIC_ON_OOPS - config PANIC_TIMEOUT int "panic timeout" default 0 diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 3ef702e6b69a..f26456988445 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -9,6 +9,7 @@ #include <linux/proc_fs.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> +#include <linux/string_choices.h> #include <linux/vmalloc.h> #include <linux/kmemleak.h> @@ -728,7 +729,7 @@ static int __init setup_early_mem_profiling(char *str) } mem_profiling_support = true; pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n", - compressed ? "with" : "without", enable ? "on" : "off"); + compressed ? "with" : "without", str_on_off(enable)); } if (enable != mem_alloc_profiling_enabled()) { diff --git a/lib/btree.c b/lib/btree.c index bb81d3393ac5..9c80c0c7bba8 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -653,9 +653,9 @@ int btree_merge(struct btree_head *target, struct btree_head *victim, * walks to remove a single object from the victim. 
*/ for (;;) { - if (!btree_last(victim, geo, key)) + val = btree_last(victim, geo, key); + if (!val) break; - val = btree_lookup(victim, geo, key); err = btree_insert(target, geo, key, val, gfp); if (err) return err; diff --git a/lib/decompress.c b/lib/decompress.c index ab3fc90ffc64..7785471586c6 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -49,15 +49,15 @@ struct compress_format { }; static const struct compress_format compressed_formats[] __initconst = { - { {0x1f, 0x8b}, "gzip", gunzip }, - { {0x1f, 0x9e}, "gzip", gunzip }, - { {0x42, 0x5a}, "bzip2", bunzip2 }, - { {0x5d, 0x00}, "lzma", unlzma }, - { {0xfd, 0x37}, "xz", unxz }, - { {0x89, 0x4c}, "lzo", unlzo }, - { {0x02, 0x21}, "lz4", unlz4 }, - { {0x28, 0xb5}, "zstd", unzstd }, - { {0, 0}, NULL, NULL } + { .magic = {0x1f, 0x8b}, .name = "gzip", .decompressor = gunzip }, + { .magic = {0x1f, 0x9e}, .name = "gzip", .decompressor = gunzip }, + { .magic = {0x42, 0x5a}, .name = "bzip2", .decompressor = bunzip2 }, + { .magic = {0x5d, 0x00}, .name = "lzma", .decompressor = unlzma }, + { .magic = {0xfd, 0x37}, .name = "xz", .decompressor = unxz }, + { .magic = {0x89, 0x4c}, .name = "lzo", .decompressor = unlzo }, + { .magic = {0x02, 0x21}, .name = "lz4", .decompressor = unlz4 }, + { .magic = {0x28, 0xb5}, .name = "zstd", .decompressor = unzstd }, + { /* sentinel */ } }; decompress_fn __init decompress_method(const unsigned char *inbuf, long len, @@ -73,11 +73,10 @@ decompress_fn __init decompress_method(const unsigned char *inbuf, long len, pr_debug("Compressed data magic: %#.2x %#.2x\n", inbuf[0], inbuf[1]); - for (cf = compressed_formats; cf->name; cf++) { + for (cf = compressed_formats; cf->name; cf++) if (!memcmp(inbuf, cf->magic, 2)) break; - } if (name) *name = cf->name; return cf->decompressor; diff --git a/lib/digsig.c b/lib/digsig.c index 04b5e55ed95f..2b36f9cc91e9 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -159,7 +159,6 @@ static int digsig_verify_rsa(struct key *key, len = mlen; head = len - 
l; - memset(out1, 0, head); memcpy(out1 + head, p, l); kfree(p); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index b3a85fe8b673..f0c78b5b5324 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -102,7 +102,7 @@ static void __dump_stack(const char *log_lvl) */ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { - bool in_panic = this_cpu_in_panic(); + bool in_panic = panic_on_this_cpu(); unsigned long flags; /* diff --git a/lib/fault-inject-usercopy.c b/lib/fault-inject-usercopy.c index 77558b6c29ca..75403ec50f49 100644 --- a/lib/fault-inject-usercopy.c +++ b/lib/fault-inject-usercopy.c @@ -22,10 +22,8 @@ static int __init fail_usercopy_debugfs(void) dir = fault_create_debugfs_attr("fail_usercopy", NULL, &fail_usercopy.attr); - if (IS_ERR(dir)) - return PTR_ERR(dir); - return 0; + return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_usercopy_debugfs); diff --git a/lib/genalloc.c b/lib/genalloc.c index 4fa5635bf81b..841f29783833 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -899,8 +899,11 @@ struct gen_pool *of_gen_pool_get(struct device_node *np, if (!name) name = of_node_full_name(np_pool); } - if (pdev) + if (pdev) { pool = gen_pool_get(&pdev->dev, name); + put_device(&pdev->dev); + } + of_node_put(np_pool); return pool; diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c index cce12287708e..258fb0e7abdf 100644 --- a/lib/ref_tracker.c +++ b/lib/ref_tracker.c @@ -75,7 +75,7 @@ ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; @@ -159,7 +159,7 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, return; } - sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); + sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; @@ -306,7 +306,7 @@ int 
ref_tracker_free(struct ref_tracker_dir *dir, } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { diff --git a/lib/sys_info.c b/lib/sys_info.c index 5bf503fd7ec1..496f9151c9b6 100644 --- a/lib/sys_info.c +++ b/lib/sys_info.c @@ -55,7 +55,7 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - char names[sizeof(sys_info_avail) + 1]; + char names[sizeof(sys_info_avail)]; struct ctl_table table; unsigned long *si_bits_global; @@ -81,6 +81,7 @@ int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write, char *delim = ""; int i, len = 0; + names[0] = '\0'; for (i = 0; i < ARRAY_SIZE(si_names); i++) { if (*si_bits_global & si_names[i].bit) { len += scnprintf(names + len, sizeof(names) - len, diff --git a/lib/test_firmware.c b/lib/test_firmware.c index 211222e63328..be4f93124901 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -26,6 +26,7 @@ #include <linux/kthread.h> #include <linux/vmalloc.h> #include <linux/efi_embedded_fw.h> +#include <linux/string_choices.h> MODULE_IMPORT_NS("TEST_FIRMWARE"); @@ -304,17 +305,17 @@ static ssize_t config_show(struct device *dev, "FW_ACTION_NOUEVENT"); len += scnprintf(buf + len, PAGE_SIZE - len, "into_buf:\t\t%s\n", - test_fw_config->into_buf ? "true" : "false"); + str_true_false(test_fw_config->into_buf)); len += scnprintf(buf + len, PAGE_SIZE - len, "buf_size:\t%zu\n", test_fw_config->buf_size); len += scnprintf(buf + len, PAGE_SIZE - len, "file_offset:\t%zu\n", test_fw_config->file_offset); len += scnprintf(buf + len, PAGE_SIZE - len, "partial:\t\t%s\n", - test_fw_config->partial ? "true" : "false"); + str_true_false(test_fw_config->partial)); len += scnprintf(buf + len, PAGE_SIZE - len, "sync_direct:\t\t%s\n", - test_fw_config->sync_direct ? 
"true" : "false"); + str_true_false(test_fw_config->sync_direct)); len += scnprintf(buf + len, PAGE_SIZE - len, "read_fw_idx:\t%u\n", test_fw_config->read_fw_idx); if (test_fw_config->upload_name) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e722dd6fa8ef..92669904eecc 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2636,6 +2636,11 @@ sub exclude_global_initialisers { $realfile =~ m@/bpf/.*\.bpf\.c$@; } +sub is_userspace { + my ($realfile) = @_; + return ($realfile =~ m@^tools/@ || $realfile =~ m@^scripts/@); +} + sub process { my $filename = shift; @@ -3294,7 +3299,7 @@ sub process { # file delta changes $line =~ /^\s*(?:[\w\.\-\+]*\/)++[\w\.\-\+]+:/ || # filename then : - $line =~ /^\s*(?:Fixes:|$link_tags_search|$signature_tags)/i || + $line =~ /^\s*(?:Fixes:|https?:|$link_tags_search|$signature_tags)/i || # A Fixes:, link or signature tag line $commit_log_possible_stack_dump)) { WARN("COMMIT_LOG_LONG_LINE", @@ -7018,21 +7023,20 @@ sub process { # } # } # } - # strcpy uses that should likely be strscpy - if ($line =~ /\bstrcpy\s*\(/) { + if ($line =~ /\bstrcpy\s*\(/ && !is_userspace($realfile)) { WARN("STRCPY", "Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88\n" . $herecurr); } # strlcpy uses that should likely be strscpy - if ($line =~ /\bstrlcpy\s*\(/) { + if ($line =~ /\bstrlcpy\s*\(/ && !is_userspace($realfile)) { WARN("STRLCPY", "Prefer strscpy over strlcpy - see: https://github.com/KSPP/linux/issues/89\n" . $herecurr); } # strncpy uses that should likely be strscpy or strscpy_pad - if ($line =~ /\bstrncpy\s*\(/) { + if ($line =~ /\bstrncpy\s*\(/ && !is_userspace($realfile)) { WARN("STRNCPY", "Prefer strscpy, strscpy_pad, or __nonstring over strncpy - see: https://github.com/KSPP/linux/issues/90\n" . 
$herecurr); } diff --git a/scripts/coccinelle/api/platform_no_drv_owner.cocci b/scripts/coccinelle/api/platform_no_drv_owner.cocci index 8fa050eeb7e5..5e869858bda8 100644 --- a/scripts/coccinelle/api/platform_no_drv_owner.cocci +++ b/scripts/coccinelle/api/platform_no_drv_owner.cocci @@ -10,12 +10,21 @@ virtual org virtual report @match1@ +declarer name builtin_i2c_driver; +declarer name builtin_platform_driver; +declarer name builtin_platform_driver_probe; declarer name module_i2c_driver; declarer name module_platform_driver; declarer name module_platform_driver_probe; identifier __driver; @@ ( + builtin_i2c_driver(__driver); +| + builtin_platform_driver(__driver); +| + builtin_platform_driver_probe(__driver, ...); +| module_i2c_driver(__driver); | module_platform_driver(__driver); diff --git a/scripts/coccinelle/misc/of_table.cocci b/scripts/coccinelle/misc/of_table.cocci index 4693ea744753..17881cb0884b 100644 --- a/scripts/coccinelle/misc/of_table.cocci +++ b/scripts/coccinelle/misc/of_table.cocci @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/// Make sure (of/i2c/platform)_device_id tables are NULL terminated +/// Make sure (of/i2c/platform/spi)_device_id tables are NULL terminated // // Keywords: of_table i2c_table platform_table // Confidence: Medium @@ -15,14 +15,14 @@ identifier var, arr; expression E; @@ ( -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { .var = E, * } }; | -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., * { ..., E, ... 
}, }; @@ -33,7 +33,7 @@ identifier var, arr; expression E; @@ ( -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { .var = E, @@ -42,7 +42,7 @@ struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { + { } }; | -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { ..., E, ... }, + { }, @@ -55,7 +55,7 @@ identifier var, arr; expression E; @@ ( -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { .var = E, @@ -63,7 +63,7 @@ struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { @p1 }; | -struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { +struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { ..., { ..., E, ... 
} @p1 diff --git a/tools/accounting/delaytop.c b/tools/accounting/delaytop.c index 9afb1ffc00ba..72cc500b44b1 100644 --- a/tools/accounting/delaytop.c +++ b/tools/accounting/delaytop.c @@ -42,14 +42,13 @@ #include <linux/genetlink.h> #include <linux/taskstats.h> #include <linux/cgroupstats.h> +#include <stddef.h> -#define PSI_CPU_SOME "/proc/pressure/cpu" -#define PSI_CPU_FULL "/proc/pressure/cpu" -#define PSI_MEMORY_SOME "/proc/pressure/memory" -#define PSI_MEMORY_FULL "/proc/pressure/memory" -#define PSI_IO_SOME "/proc/pressure/io" -#define PSI_IO_FULL "/proc/pressure/io" -#define PSI_IRQ_FULL "/proc/pressure/irq" +#define PSI_PATH "/proc/pressure" +#define PSI_CPU_PATH "/proc/pressure/cpu" +#define PSI_MEMORY_PATH "/proc/pressure/memory" +#define PSI_IO_PATH "/proc/pressure/io" +#define PSI_IRQ_PATH "/proc/pressure/irq" #define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len))) #define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) @@ -61,24 +60,28 @@ #define TASK_COMM_LEN 16 #define MAX_MSG_SIZE 1024 #define MAX_TASKS 1000 +#define MAX_BUF_LEN 256 #define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field #define BOOL_FPRINT(stream, fmt, ...) 
\ ({ \ int ret = fprintf(stream, fmt, ##__VA_ARGS__); \ ret >= 0; \ }) +#define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count) #define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n" - -/* Program settings structure */ -struct config { - int delay; /* Update interval in seconds */ - int iterations; /* Number of iterations, 0 == infinite */ - int max_processes; /* Maximum number of processes to show */ - char sort_field; /* Field to sort by */ - int output_one_time; /* Output once and exit */ - int monitor_pid; /* Monitor specific PID */ - char *container_path; /* Path to container cgroup */ -}; +#define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n" +#define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n" +#define SORT_FIELD(name, cmd, modes) \ + {#name, #cmd, \ + offsetof(struct task_info, name##_delay_total), \ + offsetof(struct task_info, name##_count), \ + modes} +#define END_FIELD {NULL, 0, 0} + +/* Display mode types */ +#define MODE_TYPE_ALL (0xFFFFFFFF) +#define MODE_DEFAULT (1 << 0) +#define MODE_MEMVERBOSE (1 << 1) /* PSI statistics structure */ struct psi_stats { @@ -119,6 +122,8 @@ struct task_info { unsigned long long wpcopy_delay_total; unsigned long long irq_count; unsigned long long irq_delay_total; + unsigned long long mem_count; + unsigned long long mem_delay_total; }; /* Container statistics structure */ @@ -130,6 +135,27 @@ struct container_stats { int nr_io_wait; /* Number of processes in IO wait */ }; +/* Delay field structure */ +struct field_desc { + const char *name; /* Field name for cmdline argument */ + const char *cmd_char; /* Interactive command */ + unsigned long total_offset; /* Offset of total delay in task_info */ + unsigned long count_offset; /* Offset of count in task_info */ + size_t supported_modes; /* Supported display modes */ +}; + +/* Program settings structure */ +struct config { + int delay; /* Update interval in seconds */ + int iterations; /* Number of 
iterations, 0 == infinite */ + int max_processes; /* Maximum number of processes to show */ + int output_one_time; /* Output once and exit */ + int monitor_pid; /* Monitor specific PID */ + char *container_path; /* Path to container cgroup */ + const struct field_desc *sort_field; /* Current sort field */ + size_t display_mode; /* Current display mode */ +}; + /* Global variables */ static struct config cfg; static struct psi_stats psi; @@ -137,6 +163,19 @@ static struct task_info tasks[MAX_TASKS]; static int task_count; static int running = 1; static struct container_stats container_stats; +static const struct field_desc sort_fields[] = { + SORT_FIELD(cpu, c, MODE_DEFAULT), + SORT_FIELD(blkio, i, MODE_DEFAULT), + SORT_FIELD(irq, q, MODE_DEFAULT), + SORT_FIELD(mem, m, MODE_DEFAULT | MODE_MEMVERBOSE), + SORT_FIELD(swapin, s, MODE_MEMVERBOSE), + SORT_FIELD(freepages, r, MODE_MEMVERBOSE), + SORT_FIELD(thrashing, t, MODE_MEMVERBOSE), + SORT_FIELD(compact, p, MODE_MEMVERBOSE), + SORT_FIELD(wpcopy, w, MODE_MEMVERBOSE), + END_FIELD +}; +static int sort_selected; /* Netlink socket variables */ static int nl_sd = -1; @@ -158,18 +197,75 @@ static void disable_raw_mode(void) tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); } +/* Find field descriptor by command line */ +static const struct field_desc *get_field_by_cmd_char(char ch) +{ + const struct field_desc *field; + + for (field = sort_fields; field->name != NULL; field++) { + if (field->cmd_char[0] == ch) + return field; + } + + return NULL; +} + +/* Find field descriptor by name with string comparison */ +static const struct field_desc *get_field_by_name(const char *name) +{ + const struct field_desc *field; + size_t field_len; + + for (field = sort_fields; field->name != NULL; field++) { + field_len = strlen(field->name); + if (field_len != strlen(name)) + continue; + if (strncmp(field->name, name, field_len) == 0) + return field; + } + + return NULL; +} + +/* Find display name for a field descriptor */ +static const 
char *get_name_by_field(const struct field_desc *field) +{ + return field ? field->name : "UNKNOWN"; +} + +/* Generate string of available field names */ +static void display_available_fields(size_t mode) +{ + const struct field_desc *field; + char buf[MAX_BUF_LEN]; + + buf[0] = '\0'; + + for (field = sort_fields; field->name != NULL; field++) { + if (!(field->supported_modes & mode)) + continue; + strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1); + strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1); + buf[MAX_BUF_LEN - 1] = '\0'; + } + + fprintf(stderr, "Available fields: %s\n", buf); +} + /* Display usage information and command line options */ static void usage(void) { printf("Usage: delaytop [Options]\n" "Options:\n" - " -h, --help Show this help message and exit\n" - " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" - " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" - " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" - " -o, --once Display once and exit\n" - " -p, --pid=PID Monitor only the specified PID\n" - " -C, --container=PATH Monitor the container at specified cgroup path\n"); + " -h, --help Show this help message and exit\n" + " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" + " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" + " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" + " -o, --once Display once and exit\n" + " -p, --pid=PID Monitor only the specified PID\n" + " -C, --container=PATH Monitor the container at specified cgroup path\n" + " -s, --sort=FIELD Sort by delay field (default: cpu)\n" + " -M, --memverbose Display memory detailed information\n"); exit(0); } @@ -177,6 +273,7 @@ static void usage(void) static void parse_args(int argc, char **argv) { int c; + const struct field_desc *field; struct option long_options[] = { {"help", no_argument, 0, 'h'}, 
{"delay", required_argument, 0, 'd'}, @@ -184,7 +281,9 @@ static void parse_args(int argc, char **argv) {"pid", required_argument, 0, 'p'}, {"once", no_argument, 0, 'o'}, {"processes", required_argument, 0, 'P'}, + {"sort", required_argument, 0, 's'}, {"container", required_argument, 0, 'C'}, + {"memverbose", no_argument, 0, 'M'}, {0, 0, 0, 0} }; @@ -192,15 +291,16 @@ static void parse_args(int argc, char **argv) cfg.delay = 2; cfg.iterations = 0; cfg.max_processes = 20; - cfg.sort_field = 'c'; /* Default sort by CPU delay */ + cfg.sort_field = &sort_fields[0]; /* Default sorted by CPU delay */ cfg.output_one_time = 0; cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ cfg.container_path = NULL; + cfg.display_mode = MODE_DEFAULT; while (1) { int option_index = 0; - c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index); + c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index); if (c == -1) break; @@ -247,6 +347,26 @@ static void parse_args(int argc, char **argv) case 'C': cfg.container_path = strdup(optarg); break; + case 's': + if (strlen(optarg) == 0) { + fprintf(stderr, "Error: empty sort field\n"); + exit(1); + } + + field = get_field_by_name(optarg); + /* Show available fields if invalid option provided */ + if (!field) { + fprintf(stderr, "Error: invalid sort field '%s'\n", optarg); + display_available_fields(MODE_TYPE_ALL); + exit(1); + } + + cfg.sort_field = field; + break; + case 'M': + cfg.display_mode = MODE_MEMVERBOSE; + cfg.sort_field = get_field_by_name("mem"); + break; default: fprintf(stderr, "Try 'delaytop --help' for more information.\n"); exit(1); @@ -254,6 +374,25 @@ static void parse_args(int argc, char **argv) } } +/* Calculate average delay in milliseconds for overall memory */ +static void set_mem_delay_total(struct task_info *t) +{ + t->mem_delay_total = t->swapin_delay_total + + t->freepages_delay_total + + t->thrashing_delay_total + + t->compact_delay_total + + t->wpcopy_delay_total; +} + +static 
void set_mem_count(struct task_info *t) +{ + t->mem_count = t->swapin_count + + t->freepages_count + + t->thrashing_count + + t->compact_count + + t->wpcopy_count; +} + /* Create a raw netlink socket and bind */ static int create_nl_socket(void) { @@ -358,87 +497,134 @@ static int get_family_id(int sd) return id; } -static void read_psi_stats(void) +static int read_psi_stats(void) { FILE *fp; char line[256]; int ret = 0; + int error_count = 0; + + /* Check if PSI path exists */ + if (access(PSI_PATH, F_OK) != 0) { + fprintf(stderr, "Error: PSI interface not found at %s\n", PSI_PATH); + fprintf(stderr, "Please ensure your kernel supports PSI (Pressure Stall Information)\n"); + return -1; + } + /* Zero all fields */ memset(&psi, 0, sizeof(psi)); + /* CPU pressure */ - fp = fopen(PSI_CPU_SOME, "r"); + fp = fopen(PSI_CPU_PATH, "r"); if (fp) { while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "some", 4) == 0) { ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.cpu_some_avg10, &psi.cpu_some_avg60, &psi.cpu_some_avg300, &psi.cpu_some_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse CPU some PSI data\n"); + error_count++; + } } else if (strncmp(line, "full", 4) == 0) { ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.cpu_full_avg10, &psi.cpu_full_avg60, &psi.cpu_full_avg300, &psi.cpu_full_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse CPU full PSI data\n"); + error_count++; + } } } fclose(fp); + } else { + fprintf(stderr, "Warning: Failed to open %s\n", PSI_CPU_PATH); + error_count++; } + /* Memory pressure */ - fp = fopen(PSI_MEMORY_SOME, "r"); + fp = fopen(PSI_MEMORY_PATH, "r"); if (fp) { while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "some", 4) == 0) { ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.memory_some_avg10, &psi.memory_some_avg60, &psi.memory_some_avg300, &psi.memory_some_total); - if (ret != 4) + if (ret != 4) { 
fprintf(stderr, "Failed to parse Memory some PSI data\n"); + error_count++; + } } else if (strncmp(line, "full", 4) == 0) { ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.memory_full_avg10, &psi.memory_full_avg60, &psi.memory_full_avg300, &psi.memory_full_total); - } - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse Memory full PSI data\n"); + error_count++; + } + } } fclose(fp); + } else { + fprintf(stderr, "Warning: Failed to open %s\n", PSI_MEMORY_PATH); + error_count++; } + /* IO pressure */ - fp = fopen(PSI_IO_SOME, "r"); + fp = fopen(PSI_IO_PATH, "r"); if (fp) { while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "some", 4) == 0) { ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.io_some_avg10, &psi.io_some_avg60, &psi.io_some_avg300, &psi.io_some_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse IO some PSI data\n"); + error_count++; + } } else if (strncmp(line, "full", 4) == 0) { ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.io_full_avg10, &psi.io_full_avg60, &psi.io_full_avg300, &psi.io_full_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse IO full PSI data\n"); + error_count++; + } } } fclose(fp); + } else { + fprintf(stderr, "Warning: Failed to open %s\n", PSI_IO_PATH); + error_count++; } + /* IRQ pressure (only full) */ - fp = fopen(PSI_IRQ_FULL, "r"); + fp = fopen(PSI_IRQ_PATH, "r"); if (fp) { while (fgets(line, sizeof(line), fp)) { if (strncmp(line, "full", 4) == 0) { ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", &psi.irq_full_avg10, &psi.irq_full_avg60, &psi.irq_full_avg300, &psi.irq_full_total); - if (ret != 4) + if (ret != 4) { fprintf(stderr, "Failed to parse IRQ full PSI data\n"); + error_count++; + } } } fclose(fp); + } else { + fprintf(stderr, "Warning: Failed to open %s\n", PSI_IRQ_PATH); + error_count++; + } + + /* Return error count: 0 means success, >0 means warnings, 
-1 means fatal error */ + if (error_count > 0) { + fprintf(stderr, "PSI stats reading completed with %d warnings\n", error_count); + return error_count; } + + return 0; } static int read_comm(int pid, char *comm_buf, size_t buf_size) @@ -527,6 +713,8 @@ static void fetch_and_fill_task_info(int pid, const char *comm) SET_TASK_STAT(task_count, wpcopy_delay_total); SET_TASK_STAT(task_count, irq_count); SET_TASK_STAT(task_count, irq_delay_total); + set_mem_count(&tasks[task_count]); + set_mem_delay_total(&tasks[task_count]); task_count++; } break; @@ -587,19 +775,23 @@ static int compare_tasks(const void *a, const void *b) { const struct task_info *t1 = (const struct task_info *)a; const struct task_info *t2 = (const struct task_info *)b; + unsigned long long total1; + unsigned long long total2; + unsigned long count1; + unsigned long count2; double avg1, avg2; - switch (cfg.sort_field) { - case 'c': /* CPU */ - avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count); - avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count); - if (avg1 != avg2) - return avg2 > avg1 ? 1 : -1; - return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; + total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset); + total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset); + count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset); + count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset); - default: - return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; - } + avg1 = average_ms(total1, count1); + avg2 = average_ms(total2, count2); + if (avg1 != avg2) + return avg2 > avg1 ? 
1 : -1; + + return 0; } /* Sort tasks by selected field */ @@ -673,7 +865,7 @@ static void get_container_stats(void) } /* Display results to stdout or log file */ -static void display_results(void) +static void display_results(int psi_ret) { time_t now = time(NULL); struct tm *tm_now = localtime(&now); @@ -686,49 +878,53 @@ static void display_results(void) suc &= BOOL_FPRINT(out, "\033[H\033[J"); /* PSI output (one-line, no cat style) */ - suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n"); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "CPU some:", - psi.cpu_some_avg10, - psi.cpu_some_avg60, - psi.cpu_some_avg300, - psi.cpu_some_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "CPU full:", - psi.cpu_full_avg10, - psi.cpu_full_avg60, - psi.cpu_full_avg300, - psi.cpu_full_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "Memory full:", - psi.memory_full_avg10, - psi.memory_full_avg60, - psi.memory_full_avg300, - psi.memory_full_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "Memory some:", - psi.memory_some_avg10, - psi.memory_some_avg60, - psi.memory_some_avg300, - psi.memory_some_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "IO full:", - psi.io_full_avg10, - psi.io_full_avg60, - psi.io_full_avg300, - psi.io_full_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "IO some:", - psi.io_some_avg10, - psi.io_some_avg60, - psi.io_some_avg300, - psi.io_some_total / 1000); - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, - "IRQ full:", - psi.irq_full_avg10, - psi.irq_full_avg60, - psi.irq_full_avg300, - psi.irq_full_total / 1000); + suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60vg300/total)\n"); + if (psi_ret) { + suc &= BOOL_FPRINT(out, " PSI not found: check if psi=1 enabled in cmdline\n"); + } else { + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "CPU some:", + psi.cpu_some_avg10, + psi.cpu_some_avg60, + psi.cpu_some_avg300, + psi.cpu_some_total / 1000); + suc &= 
BOOL_FPRINT(out, PSI_LINE_FORMAT, + "CPU full:", + psi.cpu_full_avg10, + psi.cpu_full_avg60, + psi.cpu_full_avg300, + psi.cpu_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "Memory full:", + psi.memory_full_avg10, + psi.memory_full_avg60, + psi.memory_full_avg300, + psi.memory_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "Memory some:", + psi.memory_some_avg10, + psi.memory_some_avg60, + psi.memory_some_avg300, + psi.memory_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IO full:", + psi.io_full_avg10, + psi.io_full_avg60, + psi.io_full_avg300, + psi.io_full_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IO some:", + psi.io_some_avg10, + psi.io_some_avg60, + psi.io_some_avg300, + psi.io_some_total / 1000); + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, + "IRQ full:", + psi.irq_full_avg10, + psi.irq_full_avg60, + psi.irq_full_avg300, + psi.irq_full_total / 1000); + } if (cfg.container_path) { suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path); @@ -738,29 +934,59 @@ static void display_results(void) container_stats.nr_stopped, container_stats.nr_uninterruptible, container_stats.nr_io_wait); } - suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n", - cfg.max_processes); - suc &= BOOL_FPRINT(out, "%5s %5s %-17s", "PID", "TGID", "COMMAND"); - suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n", - "CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)", - "THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)"); - suc &= BOOL_FPRINT(out, "-----------------------------------------------"); - suc &= BOOL_FPRINT(out, "----------------------------------------------\n"); + /* Interacive command */ + suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n"); + if (sort_selected) { + if (cfg.display_mode == MODE_MEMVERBOSE) + suc &= BOOL_FPRINT(out, + "sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n"); + else + suc &= BOOL_FPRINT(out, + "sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n"); + 
} + + /* Task delay output */ + suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n", + cfg.max_processes, get_name_by_field(cfg.sort_field)); + + suc &= BOOL_FPRINT(out, "%8s %8s %-17s", "PID", "TGID", "COMMAND"); + if (cfg.display_mode == MODE_MEMVERBOSE) { + suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n", + "MEM(ms)", "SWAP(ms)", "RCL(ms)", + "THR(ms)", "CMP(ms)", "WP(ms)"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "---------------------\n"); + } else { + suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n", + "CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "-----------------------"); + suc &= BOOL_FPRINT(out, "--------------------------\n"); + } + count = task_count < cfg.max_processes ? task_count : cfg.max_processes; for (i = 0; i < count; i++) { - suc &= BOOL_FPRINT(out, "%5d %5d %-15s", + suc &= BOOL_FPRINT(out, "%8d %8d %-15s", tasks[i].pid, tasks[i].tgid, tasks[i].command); - suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n", - average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count), - average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count), - average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count), - average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count), - average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count), - average_ms(tasks[i].compact_delay_total, tasks[i].compact_count), - average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count), - average_ms(tasks[i].irq_delay_total, tasks[i].irq_count)); + if (cfg.display_mode == MODE_MEMVERBOSE) { + suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE, + TASK_AVG(tasks[i], mem), + TASK_AVG(tasks[i], swapin), + TASK_AVG(tasks[i], freepages), + TASK_AVG(tasks[i], thrashing), + TASK_AVG(tasks[i], compact), + 
TASK_AVG(tasks[i], wpcopy)); + } else { + suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT, + TASK_AVG(tasks[i], cpu), + TASK_AVG(tasks[i], blkio), + TASK_AVG(tasks[i], irq), + TASK_AVG(tasks[i], mem)); + } } suc &= BOOL_FPRINT(out, "\n"); @@ -769,11 +995,79 @@ static void display_results(void) perror("Error writing to output"); } +/* Check for keyboard input with timeout based on cfg.delay */ +static char check_for_keypress(void) +{ + struct timeval tv = {cfg.delay, 0}; + fd_set readfds; + char ch = 0; + + FD_ZERO(&readfds); + FD_SET(STDIN_FILENO, &readfds); + int r = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv); + + if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { + read(STDIN_FILENO, &ch, 1); + return ch; + } + + return 0; +} + +#define MAX_MODE_SIZE 2 +static void toggle_display_mode(void) +{ + static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE}; + static size_t cur_index; + + cur_index = (cur_index + 1) % MAX_MODE_SIZE; + cfg.display_mode = modes[cur_index]; +} + +/* Handle keyboard input: sorting selection, mode toggle, or quit */ +static void handle_keypress(char ch, int *running) +{ + const struct field_desc *field; + + /* Change sort field */ + if (sort_selected) { + field = get_field_by_cmd_char(ch); + if (field && (field->supported_modes & cfg.display_mode)) + cfg.sort_field = field; + + sort_selected = 0; + /* Handle mode changes or quit */ + } else { + switch (ch) { + case 'o': + sort_selected = 1; + break; + case 'M': + toggle_display_mode(); + for (field = sort_fields; field->name != NULL; field++) { + if (field->supported_modes & cfg.display_mode) { + cfg.sort_field = field; + break; + } + } + break; + case 'q': + case 'Q': + *running = 0; + break; + default: + break; + } + } +} + /* Main function */ int main(int argc, char **argv) { + const struct field_desc *field; int iterations = 0; - int use_q_quit = 0; + int psi_ret = 0; + char keypress; /* Parse command line arguments */ parse_args(argc, argv); @@ -793,17 +1087,24 @@ 
int main(int argc, char **argv) exit(1); } - if (!cfg.output_one_time) { - use_q_quit = 1; - enable_raw_mode(); - printf("Press 'q' to quit.\n"); - fflush(stdout); - } + /* Set terminal to non-canonical mode for interaction */ + enable_raw_mode(); /* Main loop */ while (running) { + /* Auto-switch sort field when not matching display mode */ + if (!(cfg.sort_field->supported_modes & cfg.display_mode)) { + for (field = sort_fields; field->name != NULL; field++) { + if (field->supported_modes & cfg.display_mode) { + cfg.sort_field = field; + printf("Auto-switched sort field to: %s\n", field->name); + break; + } + } + } + /* Read PSI statistics */ - read_psi_stats(); + psi_ret = read_psi_stats(); /* Get container stats if container path provided */ if (cfg.container_path) @@ -816,7 +1117,7 @@ int main(int argc, char **argv) sort_tasks(); /* Display results to stdout or log file */ - display_results(); + display_results(psi_ret); /* Check for iterations */ if (cfg.iterations > 0 && ++iterations >= cfg.iterations) @@ -826,32 +1127,14 @@ int main(int argc, char **argv) if (cfg.output_one_time) break; - /* Check for 'q' key to quit */ - if (use_q_quit) { - struct timeval tv = {cfg.delay, 0}; - fd_set readfds; - - FD_ZERO(&readfds); - FD_SET(STDIN_FILENO, &readfds); - int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv); - - if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { - char ch = 0; - - read(STDIN_FILENO, &ch, 1); - if (ch == 'q' || ch == 'Q') { - running = 0; - break; - } - } - } else { - sleep(cfg.delay); - } + /* Keypress for interactive usage */ + keypress = check_for_keypress(); + if (keypress) + handle_keypress(keypress, &running); } /* Restore terminal mode */ - if (use_q_quit) - disable_raw_mode(); + disable_raw_mode(); /* Cleanup */ close(nl_sd); diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c index 84b8c3c92c79..2f830ff8396c 100644 --- a/tools/testing/radix-tree/idr-test.c +++ b/tools/testing/radix-tree/idr-test.c 
@@ -499,19 +499,17 @@ void ida_check_random(void) goto repeat; } -void ida_simple_get_remove_test(void) +void ida_alloc_free_test(void) { DEFINE_IDA(ida); unsigned long i; - for (i = 0; i < 10000; i++) { - assert(ida_simple_get(&ida, 0, 20000, GFP_KERNEL) == i); - } - assert(ida_simple_get(&ida, 5, 30, GFP_KERNEL) < 0); + for (i = 0; i < 10000; i++) + assert(ida_alloc_max(&ida, 20000, GFP_KERNEL) == i); + assert(ida_alloc_range(&ida, 5, 30, GFP_KERNEL) < 0); - for (i = 0; i < 10000; i++) { - ida_simple_remove(&ida, i); - } + for (i = 0; i < 10000; i++) + ida_free(&ida, i); assert(ida_is_empty(&ida)); ida_destroy(&ida); @@ -524,7 +522,7 @@ void user_ida_checks(void) ida_check_nomem(); ida_check_conv_user(); ida_check_random(); - ida_simple_get_remove_test(); + ida_alloc_free_test(); radix_tree_cpu_dead(1); } diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 6b78a8382d40..9c9735570abf 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -7,6 +7,7 @@ /proc-loadavg-001 /proc-maps-race /proc-multiple-procfs +/proc-net-dev-lseek /proc-empty-vm /proc-pid-vm /proc-self-map-files-001 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index be3013515aae..a7de2bb6d8be 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -10,6 +10,7 @@ TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-2-is-kthread TEST_GEN_PROGS += proc-loadavg-001 TEST_GEN_PROGS += proc-maps-race +TEST_GEN_PROGS += proc-net-dev-lseek TEST_GEN_PROGS += proc-empty-vm TEST_GEN_PROGS += proc-pid-vm TEST_GEN_PROGS += proc-self-map-files-001 diff --git a/tools/testing/selftests/proc/proc-net-dev-lseek.c b/tools/testing/selftests/proc/proc-net-dev-lseek.c new file mode 100644 index 000000000000..742a3e804451 --- /dev/null +++ b/tools/testing/selftests/proc/proc-net-dev-lseek.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2025 Alexey 
Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#undef _GNU_SOURCE +#define _GNU_SOURCE +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#include <unistd.h> +#include <sched.h> +/* + * Test that lseek("/proc/net/dev/", 0, SEEK_SET) + * a) works, + * b) does what you think it does. + */ +int main(void) +{ + /* /proc/net/dev output is deterministic in fresh netns only. */ + if (unshare(CLONE_NEWNET) == -1) { + if (errno == ENOSYS || errno == EPERM) { + return 4; + } + return 1; + } + + const int fd = open("/proc/net/dev", O_RDONLY); + assert(fd >= 0); + + char buf1[4096]; + const ssize_t rv1 = read(fd, buf1, sizeof(buf1)); + /* + * Not "<=", this file can't be empty: + * there is header, "lo" interface with some zeroes. + */ + assert(0 < rv1); + assert(rv1 <= sizeof(buf1)); + + /* Believe it or not, this line broke one day. */ + assert(lseek(fd, 0, SEEK_SET) == 0); + + char buf2[4096]; + const ssize_t rv2 = read(fd, buf2, sizeof(buf2)); + /* Not "<=", see above. */ + assert(0 < rv2); + assert(rv2 <= sizeof(buf2)); + + /* Test that lseek rewinds to the beginning of the file. 
*/ + assert(rv1 == rv2); + assert(memcmp(buf1, buf2, rv1) == 0); + + /* Contents of the file is not validated: this test is about lseek(). */ + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c index d04685771952..978cbcb3eb11 100644 --- a/tools/testing/selftests/proc/proc-pid-vm.c +++ b/tools/testing/selftests/proc/proc-pid-vm.c @@ -47,6 +47,10 @@ #include <sys/resource.h> #include <linux/fs.h> +#ifndef __maybe_unused +#define __maybe_unused __attribute__((__unused__)) +#endif + #include "../kselftest.h" static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags) @@ -218,12 +222,12 @@ static int make_exe(const uint8_t *payload, size_t len) * 2: vsyscall VMA is r-xp vsyscall=emulate */ static volatile int g_vsyscall; -static const char *str_vsyscall; +static const char *str_vsyscall __maybe_unused; -static const char str_vsyscall_0[] = ""; -static const char str_vsyscall_1[] = +static const char str_vsyscall_0[] __maybe_unused = ""; +static const char str_vsyscall_1[] __maybe_unused = "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; -static const char str_vsyscall_2[] = +static const char str_vsyscall_2[] __maybe_unused = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; #ifdef __x86_64__ |