diff options
Diffstat (limited to 'kernel/time/clocksource.c')
| -rw-r--r-- | kernel/time/clocksource.c | 227 | 
1 files changed, 214 insertions, 13 deletions
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2cd902592fc1..b89c76e1c02c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -14,6 +14,8 @@
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
 #include <linux/tick.h>
 #include <linux/kthread.h>
+#include <linux/prandom.h>
+#include <linux/cpu.h>
 
 #include "tick-internal.h"
 #include "timekeeping_internal.h"
@@ -93,6 +95,20 @@ static char override_name[CS_NAME_LEN];
 static int finished_booting;
 static u64 suspend_start;
 
+/*
+ * Threshold: 0.0312s, when doubled: 0.0625s.
+ * Also a default for cs->uncertainty_margin when registering clocks.
+ */
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
+
+/*
+ * Maximum permissible delay between two readouts of the watchdog
+ * clocksource surrounding a read of the clocksource being validated.
+ * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as
+ * a lower bound for cs->uncertainty_margin values when registering clocks.
+ */
+#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC)
+
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
 static void clocksource_watchdog_work(struct work_struct *work);
 static void clocksource_select(void);
@@ -119,10 +135,9 @@ static int clocksource_watchdog_kthread(void *data);
 static void __clocksource_change_rating(struct clocksource *cs, int rating);
 
 /*
- * Interval: 0.5sec Threshold: 0.0625s
+ * Interval: 0.5sec.
  */
 #define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
 
 static void clocksource_watchdog_work(struct work_struct *work)
 {
@@ -184,12 +199,164 @@ void clocksource_mark_unstable(struct clocksource *cs)
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
+ulong max_cswd_read_retries = 3;
+module_param(max_cswd_read_retries, ulong, 0644);
+EXPORT_SYMBOL_GPL(max_cswd_read_retries);
+static int verify_n_cpus = 8;
+module_param(verify_n_cpus, int, 0644);
+
+static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
+{
+	unsigned int nretries;
+	u64 wd_end, wd_delta;
+	int64_t wd_delay;
+
+	for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
+		local_irq_disable();
+		*wdnow = watchdog->read(watchdog);
+		*csnow = cs->read(cs);
+		wd_end = watchdog->read(watchdog);
+		local_irq_enable();
+
+		wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
+		wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
+					      watchdog->shift);
+		if (wd_delay <= WATCHDOG_MAX_SKEW) {
+			if (nretries > 1 || nretries >= max_cswd_read_retries) {
+				pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
+					smp_processor_id(), watchdog->name, nretries);
+			}
+			return true;
+		}
+	}
+
+	pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
+		smp_processor_id(), watchdog->name, wd_delay, nretries);
+	return false;
+}
+
+static u64 csnow_mid;
+static cpumask_t cpus_ahead;
+static cpumask_t cpus_behind;
+static cpumask_t cpus_chosen;
+
+static void clocksource_verify_choose_cpus(void)
+{
+	int cpu, i, n = verify_n_cpus;
+
+	if (n < 0) {
+		/* Check all of the CPUs. */
+		cpumask_copy(&cpus_chosen, cpu_online_mask);
+		cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+		return;
+	}
+
+	/* If no checking desired, or no other CPU to check, leave. */
+	cpumask_clear(&cpus_chosen);
+	if (n == 0 || num_online_cpus() <= 1)
+		return;
+
+	/* Make sure to select at least one CPU other than the current CPU. */
+	cpu = cpumask_next(-1, cpu_online_mask);
+	if (cpu == smp_processor_id())
+		cpu = cpumask_next(cpu, cpu_online_mask);
+	if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+		return;
+	cpumask_set_cpu(cpu, &cpus_chosen);
+
+	/* Force a sane value for the boot parameter. */
+	if (n > nr_cpu_ids)
+		n = nr_cpu_ids;
+
+	/*
+	 * Randomly select the specified number of CPUs.  If the same
+	 * CPU is selected multiple times, that CPU is checked only once,
+	 * and no replacement CPU is selected.  This gracefully handles
+	 * situations where verify_n_cpus is greater than the number of
+	 * CPUs that are currently online.
+	 */
+	for (i = 1; i < n; i++) {
+		cpu = prandom_u32() % nr_cpu_ids;
+		cpu = cpumask_next(cpu - 1, cpu_online_mask);
+		if (cpu >= nr_cpu_ids)
+			cpu = cpumask_next(-1, cpu_online_mask);
+		if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
+			cpumask_set_cpu(cpu, &cpus_chosen);
+	}
+
+	/* Don't verify ourselves. */
+	cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+}
+
+static void clocksource_verify_one_cpu(void *csin)
+{
+	struct clocksource *cs = (struct clocksource *)csin;
+
+	csnow_mid = cs->read(cs);
+}
+
+void clocksource_verify_percpu(struct clocksource *cs)
+{
+	int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
+	u64 csnow_begin, csnow_end;
+	int cpu, testcpu;
+	s64 delta;
+
+	if (verify_n_cpus == 0)
+		return;
+	cpumask_clear(&cpus_ahead);
+	cpumask_clear(&cpus_behind);
+	get_online_cpus();
+	preempt_disable();
+	clocksource_verify_choose_cpus();
+	if (cpumask_weight(&cpus_chosen) == 0) {
+		preempt_enable();
+		put_online_cpus();
+		pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+		return;
+	}
+	testcpu = smp_processor_id();
+	pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+	for_each_cpu(cpu, &cpus_chosen) {
+		if (cpu == testcpu)
+			continue;
+		csnow_begin = cs->read(cs);
+		smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
+		csnow_end = cs->read(cs);
+		delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
+		if (delta < 0)
+			cpumask_set_cpu(cpu, &cpus_behind);
+		delta = (csnow_end - csnow_mid) & cs->mask;
+		if (delta < 0)
+			cpumask_set_cpu(cpu, &cpus_ahead);
+		delta = clocksource_delta(csnow_end, csnow_begin, cs->mask);
+		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+		if (cs_nsec > cs_nsec_max)
+			cs_nsec_max = cs_nsec;
+		if (cs_nsec < cs_nsec_min)
+			cs_nsec_min = cs_nsec;
+	}
+	preempt_enable();
+	put_online_cpus();
+	if (!cpumask_empty(&cpus_ahead))
+		pr_warn("        CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
+			cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
+	if (!cpumask_empty(&cpus_behind))
+		pr_warn("        CPUs %*pbl behind CPU %d for clocksource %s.\n",
+			cpumask_pr_args(&cpus_behind), testcpu, cs->name);
+	if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
+		pr_warn("        CPU %d check durations %lldns - %lldns for clocksource %s.\n",
+			testcpu, cs_nsec_min, cs_nsec_max, cs->name);
+}
+EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+
 static void clocksource_watchdog(struct timer_list *unused)
 {
-	struct clocksource *cs;
 	u64 csnow, wdnow, cslast, wdlast, delta;
-	int64_t wd_nsec, cs_nsec;
 	int next_cpu, reset_pending;
+	int64_t wd_nsec, cs_nsec;
+	struct clocksource *cs;
+	u32 md;
 
 	spin_lock(&watchdog_lock);
 	if (!watchdog_running)
@@ -206,10 +373,11 @@ static void clocksource_watchdog(struct timer_list *unused)
 			continue;
 		}
 
-		local_irq_disable();
-		csnow = cs->read(cs);
-		wdnow = watchdog->read(watchdog);
-		local_irq_enable();
+		if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
+			/* Clock readout unreliable, so give it up. */
+			__clocksource_unstable(cs);
+			continue;
+		}
 
 		/* Clocksource initialized ? */
 		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
@@ -235,13 +403,20 @@ static void clocksource_watchdog(struct timer_list *unused)
 			continue;
 
 		/* Check the deviation from the watchdog clocksource. */
-		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+		md = cs->uncertainty_margin + watchdog->uncertainty_margin;
+		if (abs(cs_nsec - wd_nsec) > md) {
 			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
 				smp_processor_id(), cs->name);
-			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
-				watchdog->name, wdnow, wdlast, watchdog->mask);
-			pr_warn("                      '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
-				cs->name, csnow, cslast, cs->mask);
+			pr_warn("                      '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
+				watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
+			pr_warn("                      '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
+				cs->name, cs_nsec, csnow, cslast, cs->mask);
+			if (curr_clocksource == cs)
+				pr_warn("                      '%s' is current clocksource.\n", cs->name);
+			else if (curr_clocksource)
+				pr_warn("                      '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
+			else
+				pr_warn("                      No current clocksource.\n");
 			__clocksource_unstable(cs);
 			continue;
 		}
@@ -407,6 +582,12 @@ static int __clocksource_watchdog_kthread(void)
 	unsigned long flags;
 	int select = 0;
 
+	/* Do any required per-CPU skew verification. */
+	if (curr_clocksource &&
+	    curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
+	    curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
+		clocksource_verify_percpu(curr_clocksource);
+
 	spin_lock_irqsave(&watchdog_lock, flags);
 	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
 		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
@@ -876,6 +1057,26 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
 		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
 				       NSEC_PER_SEC / scale, sec * scale);
 	}
+
+	/*
+	 * If the uncertainty margin is not specified, calculate it.
+	 * If both scale and freq are non-zero, calculate the clock
+	 * period, but bound below at 2*WATCHDOG_MAX_SKEW.  However,
+	 * if either of scale or freq is zero, be very conservative and
+	 * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the
+	 * uncertainty margin.  Allow stupidly small uncertainty margins
+	 * to be specified by the caller for testing purposes, but warn
+	 * to discourage production use of this capability.
+	 */
+	if (scale && freq && !cs->uncertainty_margin) {
+		cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
+		if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
+			cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
+	} else if (!cs->uncertainty_margin) {
+		cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+	}
+	WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
+
 	/*
 	 * Ensure clocksources that have large 'mult' values don't overflow
 	 * when adjusted.
