diff options
author | Max Kellermann <max.kellermann@ionos.com> | 2025-05-04 20:08:30 +0200 |
---|---|---|
committer | Andrew Morton <akpm@linux-foundation.org> | 2025-05-21 10:48:22 -0700 |
commit | aaf05e96e93cb9c8fc8d35fc4525715530397655 (patch) | |
tree | 66a02bb56d03e1cf7a97cbf6cd1dc9e4e10148db | |
parent | cc66e4863ac3a00c709bfcbec685d48c184172b5 (diff) |
kernel/watchdog: add /sys/kernel/{hard,soft}lockup_count
Patch series "sysfs: add counters for lockups and stalls", v2.
Commits 9db89b411170 ("exit: Expose "oops_count" to sysfs") and
8b05aa263361 ("panic: Expose "warn_count" to sysfs") added counters for
oopses and warnings to sysfs, and these two patches do the same for
hard/soft lockups and RCU stalls.
All of these counters are useful for monitoring tools to detect whether
the machine is healthy. If the kernel has experienced a lockup or a
stall, it's probably due to a kernel bug, and I'd like to detect that
quickly and easily. There is currently no way to detect that, other than
parsing dmesg. Or observing indirect effects: such as certain tasks not
responding, but then I need to observe all tasks, and it may take a while
until these effects become visible/measurable. I'd rather be able to
detect the primary cause more quickly, possibly before everything falls
apart.
This patch (of 2):
There is /proc/sys/kernel/hung_task_detect_count, /sys/kernel/warn_count
and /sys/kernel/oops_count but there is no userspace-accessible counter
for hard/soft lockups. Having this is useful for monitoring tools.
Link: https://lkml.kernel.org/r/20250504180831.4190860-1-max.kellermann@ionos.com
Link: https://lkml.kernel.org/r/20250504180831.4190860-2-max.kellermann@ionos.com
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Cc:
Cc: Core Minyard <cminyard@mvista.com>
Cc: Doug Anderson <dianders@chromium.org>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-rw-r--r-- | Documentation/ABI/testing/sysfs-kernel-hardlockup_count | 7 | ||||
-rw-r--r-- | Documentation/ABI/testing/sysfs-kernel-softlockup_count | 7 | ||||
-rw-r--r-- | kernel/watchdog.c | 53 |
3 files changed, 67 insertions, 0 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count b/Documentation/ABI/testing/sysfs-kernel-hardlockup_count new file mode 100644 index 0000000000000..dfdd4078b0775 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-hardlockup_count @@ -0,0 +1,7 @@ +What: /sys/kernel/hardlockup_count +Date: May 2025 +KernelVersion: 6.16 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> +Description: + Shows how many times the system has detected a hard lockup since last boot. + Available only if CONFIG_HARDLOCKUP_DETECTOR is enabled. diff --git a/Documentation/ABI/testing/sysfs-kernel-softlockup_count b/Documentation/ABI/testing/sysfs-kernel-softlockup_count new file mode 100644 index 0000000000000..337ff5531b5fe --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-softlockup_count @@ -0,0 +1,7 @@ +What: /sys/kernel/softlockup_count +Date: May 2025 +KernelVersion: 6.16 +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> +Description: + Shows how many times the system has detected a soft lockup since last boot. + Available only if CONFIG_SOFTLOCKUP_DETECTOR is enabled. diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 2d283e92be5ab..80b56c002c7f1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -64,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; */ unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); + +#ifdef CONFIG_SYSFS + +static unsigned int hardlockup_count; + +static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", hardlockup_count); +} + +static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count); + +static __init int kernel_hardlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_hardlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -170,6 +193,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) unsigned int this_cpu = smp_processor_id(); unsigned long flags; +#ifdef CONFIG_SYSFS + ++hardlockup_count; +#endif + /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; @@ -312,6 +339,28 @@ unsigned int __read_mostly softlockup_panic = static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; +#ifdef CONFIG_SYSFS + +static unsigned int softlockup_count; + +static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", softlockup_count); +} + +static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count); + +static __init int kernel_softlockup_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_softlockup_sysfs_init); + +#endif // CONFIG_SYSFS + /* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); /* Timestamp of the last softlockup report. */ @@ -743,6 +792,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) touch_ts = __this_cpu_read(watchdog_touch_ts); duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { +#ifdef CONFIG_SYSFS + ++softlockup_count; +#endif + /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. |