From 951ed4d36b77ba9fe1ea08fc3c59d8bb6c9bda32 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 7 Jul 2009 11:27:28 +0200 Subject: timekeeping: optimized ktime_get[_ts] for GENERIC_TIME=y The generic ktime_get function defined in kernel/hrtimer.c is suboptimal for GENERIC_TIME=y: 0) | ktime_get() { 0) | ktime_get_ts() { 0) | getnstimeofday() { 0) | read_tod_clock() { 0) 0.601 us | } 0) 1.938 us | } 0) | set_normalized_timespec() { 0) 0.602 us | } 0) 4.375 us | } 0) 5.523 us | } Overall there are two read_seqbegin/read_seqretry loops and a lot of unnecessary struct timespec calculations. ktime_get returns a nanosecond value which is the sum of xtime, wall_to_monotonic and the nanosecond delta from the clock source. ktime_get can be optimized for GENERIC_TIME=y. The new version only calls clocksource_read: 0) | ktime_get() { 0) | read_tod_clock() { 0) 0.610 us | } 0) 1.977 us | } It uses a single read_seqbegin/read_seqretry loop and just adds everything to a nanosecond value. ktime_get_ts is optimized in a similar fashion. 
[ tglx: added WARN_ON(timekeeping_suspended) as in getnstimeofday() ] Signed-off-by: Martin Schwidefsky Acked-by: john stultz LKML-Reference: <20090707112728.3005244d@skybase> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 4 +++ kernel/time/timekeeping.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9002958a96e7..829e0664b72e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,6 +48,7 @@ #include +#ifndef CONFIG_GENERIC_TIME /** * ktime_get - get the monotonic time in ktime_t format * @@ -62,6 +63,7 @@ ktime_t ktime_get(void) return timespec_to_ktime(now); } EXPORT_SYMBOL_GPL(ktime_get); +#endif /** * ktime_get_real - get the real (wall-) time in ktime_t format @@ -106,6 +108,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; +#ifndef CONFIG_GENERIC_TIME /** * ktime_get_ts - get the monotonic clock in timespec format * @ts: pointer to timespec variable @@ -130,6 +133,7 @@ void ktime_get_ts(struct timespec *ts) ts->tv_nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); +#endif /* * Get the coarse grained time at the softirq based on xtime and diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e8c77d9c633a..7a248135c6f2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -125,6 +125,75 @@ void getnstimeofday(struct timespec *ts) EXPORT_SYMBOL(getnstimeofday); +ktime_t ktime_get(void) +{ + cycle_t cycle_now, cycle_delta; + unsigned int seq; + s64 secs, nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + secs = xtime.tv_sec + wall_to_monotonic.tv_sec; + nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs 
+= cyc2ns(clock, cycle_delta); + + } while (read_seqretry(&xtime_lock, seq)); + /* + * Use ktime_set/ktime_add_ns to create a proper ktime on + * 32-bit architectures without CONFIG_KTIME_SCALAR. + */ + return ktime_add_ns(ktime_set(secs, 0), nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + cycle_t cycle_now, cycle_delta; + struct timespec tomono; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit v1.2.3 From a40f262cc21fbfd781bbddcc40b16b83a75f5f34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2009 13:00:31 +0200 Subject: timekeeping: Move ktime_get() functions to timekeeping.c The ktime_get() functions for GENERIC_TIME=n are still located in hrtimer.c. Move them to time/timekeeping.c where they belong. 
LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 60 ----------------------------------------------- kernel/time/timekeeping.c | 59 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 829e0664b72e..43d151f185b6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,39 +48,6 @@ #include -#ifndef CONFIG_GENERIC_TIME -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); -#endif - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ - struct timespec now; - - getnstimeofday(&now); - - return timespec_to_ktime(now); -} - -EXPORT_SYMBOL_GPL(ktime_get_real); - /* * The timer bases: * @@ -108,33 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; -#ifndef CONFIG_GENERIC_TIME -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); -#endif - /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7a248135c6f2..02c0b2c9c674 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -290,10 +290,65 @@ static void change_clocksource(void) clock->name); */ } -#else +#else /* GENERIC_TIME */ static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -#endif + +/** + * ktime_get - get the monotonic time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get(void) +{ + struct timespec now; + + ktime_get_ts(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get); + +/** + * ktime_get_ts - get the monotonic clock in timespec format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts. + */ +void ktime_get_ts(struct timespec *ts) +{ + struct timespec tomono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + tomono = wall_to_monotonic; + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, + ts->tv_nsec + tomono.tv_nsec); +} +EXPORT_SYMBOL_GPL(ktime_get_ts); +#endif /* !GENERIC_TIME */ + +/** + * ktime_get_real - get the real (wall-) time in ktime_t format + * + * returns the time in ktime_t format + */ +ktime_t ktime_get_real(void) +{ + struct timespec now; + + getnstimeofday(&now); + + return timespec_to_ktime(now); +} +EXPORT_SYMBOL_GPL(ktime_get_real); /** * getrawmonotonic - Returns the raw monotonic time in a timespec -- cgit v1.2.3 From fbd90375d7531927d312766b548376d909811b4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 13:40:14 +0200 Subject: hrtimer: Remove cb_entry from struct hrtimer It's unused, remove it. 
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner LKML-Reference: --- kernel/hrtimer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 43d151f185b6..052a0f53e4eb 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1092,7 +1092,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &cpu_base->clock_base[clock_id]; - INIT_LIST_HEAD(&timer->cb_entry); hrtimer_init_timer_hres(timer); #ifdef CONFIG_TIMER_STATS -- cgit v1.2.3 From 97fd9ed48ce2b807edc363bef3e817aeeb5cd5e6 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 21 Jul 2009 20:25:05 +0200 Subject: timers: Cache __next_timer_interrupt result Each time a cpu goes to sleep on a NOHZ=y system the timer wheel is searched for the next timer interrupt. It can take quite a few cycles to find the next pending timer. This patch adds a field to tvec_base that caches the result of __next_timer_interrupt. The hit ratio is around 80% on my thinkpad under normal use, on a server I've seen hit ratios from 5% to 95% dependent on the workload. 
-v2: jiffies wrap fixes Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Cc: john stultz Cc: Venki Pallipadi LKML-Reference: <20090721202505.7d56a079@skybase> Signed-off-by: Ingo Molnar --- kernel/timer.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0b36b9e5cc8b..5c1e49ec2f1b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -72,6 +72,7 @@ struct tvec_base { spinlock_t lock; struct timer_list *running_timer; unsigned long timer_jiffies; + unsigned long next_timer; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, if (timer_pending(timer)) { detach_timer(timer, 0); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } else { if (pending_only) @@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, } timer->expires = expires; + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); out_unlock: @@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); debug_timer_activate(timer); + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; internal_add_timer(base, timer); /* * Check whether the other CPU is idle and needs to be @@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer) base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } spin_unlock_irqrestore(&base->lock, flags); @@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct 
timer_list *timer) ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; ret = 1; } out: @@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now) unsigned long expires; spin_lock(&base->lock); - expires = __next_timer_interrupt(base); + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; spin_unlock(&base->lock); if (time_before_eq(expires, now)) @@ -1523,6 +1541,7 @@ static int __cpuinit init_timers_cpu(int cpu) INIT_LIST_HEAD(base->tv1.vec + j); base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; return 0; } @@ -1535,6 +1554,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); timer_set_base(timer, new_base); + if (time_before(timer->expires, new_base->next_timer) && + !tbase_get_deferrable(timer->base)) + new_base->next_timer = timer->expires; internal_add_timer(new_base, timer); } } -- cgit v1.2.3 From 31089c13bcb18d2cd2a3ddfbe3a28666346f237e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 14 Aug 2009 15:47:18 +0200 Subject: timekeeping: Introduce timekeeping_leap_insert Move the adjustment of xtime, wall_to_monotonic and the update of the vsyscall variables to the timekeeping code. 
Signed-off-by: John Stultz Signed-off-by: Martin Schwidefsky LKML-Reference: <20090814134807.609730216@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 7 ++----- kernel/time/timekeeping.c | 7 +++++++ 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7fc64375ff43..4800f933910e 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) case TIME_OK: break; case TIME_INS: - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; + timekeeping_leap_insert(-1); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) res = HRTIMER_RESTART; break; case TIME_DEL: - xtime.tv_sec++; + timekeeping_leap_insert(1); time_tai--; - wall_to_monotonic.tv_sec--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OK; break; } - update_vsyscall(&xtime, clock); write_sequnlock(&xtime_lock); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 02c0b2c9c674..b8b70fb545fc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,6 +58,13 @@ void update_xtime_cache(u64 nsec) struct clocksource *clock; +/* must hold xtime_lock */ +void timekeeping_leap_insert(int leapsecond) +{ + xtime.tv_sec += leapsecond; + wall_to_monotonic.tv_sec -= leapsecond; + update_vsyscall(&xtime, clock); +} #ifdef CONFIG_GENERIC_TIME /** -- cgit v1.2.3 From a0f7d48bfb95a4c5172a2756dbc4b82afc8e9ae4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:19 +0200 Subject: timekeeping: Remove clocksource inline functions The three inline functions clocksource_read, clocksource_enable and clocksource_disable are simple wrappers of 
an indirect call plus the copy from and to the mult_orig value. The functions are exclusively used by the timekeeping code which has intimate knowledge of the clocksource anyway. Therefore remove the inline functions. No functional change. Signed-off-by: Martin Schwidefsky Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134807.903108946@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b8b70fb545fc..016a2591d719 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -79,7 +79,7 @@ static void clocksource_forward_now(void) cycle_t cycle_now, cycle_delta; s64 nsec; - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; @@ -114,7 +114,7 @@ void getnstimeofday(struct timespec *ts) *ts = xtime; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -146,7 +146,7 @@ ktime_t ktime_get(void) nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -186,7 +186,7 @@ void ktime_get_ts(struct timespec *ts) tomono = wall_to_monotonic; /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -274,16 +274,29 @@ static void change_clocksource(void) clocksource_forward_now(); - if 
(clocksource_enable(new)) + if (new->enable && !new->enable(new)) return; + /* + * The frequency may have changed while the clocksource + * was disabled. If so the code in ->enable() must update + * the mult value to reflect the new frequency. Make sure + * mult_orig follows this change. + */ + new->mult_orig = new->mult; new->raw_time = clock->raw_time; old = clock; clock = new; - clocksource_disable(old); + /* + * Save mult_orig in mult so that the value can be restored + * regardless if ->enable() updates the value of mult or not. + */ + old->mult = old->mult_orig; + if (old->disable) + old->disable(old); clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -373,7 +386,7 @@ void getrawmonotonic(struct timespec *ts) seq = read_seqbegin(&xtime_lock); /* read clocksource: */ - cycle_now = clocksource_read(clock); + cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; @@ -435,9 +448,12 @@ void __init timekeeping_init(void) ntp_init(); clock = clocksource_get_next(); - clocksource_enable(clock); + if (clock->enable) + clock->enable(clock); + /* set mult_orig on enable */ + clock->mult_orig = clock->mult; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); xtime.tv_sec = sec; xtime.tv_nsec = 0; @@ -477,8 +493,7 @@ static int timekeeping_resume(struct sys_device *dev) } update_xtime_cache(0); /* re-base the last cycle value */ - clock->cycle_last = 0; - clock->cycle_last = clocksource_read(clock); + clock->cycle_last = clock->read(clock); clock->error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -630,7 +645,7 @@ void update_wall_time(void) return; #ifdef CONFIG_GENERIC_TIME - 
offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else offset = clock->cycle_interval; #endif -- cgit v1.2.3 From 1be396794897f80bfc8774719ba60309a9e3d374 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:20 +0200 Subject: timekeeping: Move reset of cycle_last for tsc clocksource to tsc change_clocksource resets the cycle_last value to zero then sets it to a value read from the clocksource. The reset to zero is required only for the TSC clocksource to make the read_tsc function work after a resume. The reason is that the TSC read function uses cycle_last to detect backwards going TSCs. In the resume case cycle_last contains the TSC value from the last update before the suspend. On resume the TSC starts counting from 0 again and would trip over the cycle_last comparison. This is subtle and surprising. Move the reset to a resume function in the tsc code. Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.142191175@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 016a2591d719..b5673016089f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -295,7 +295,6 @@ static void change_clocksource(void) if (old->disable) old->disable(old); - clock->cycle_last = 0; clock->cycle_last = clock->read(clock); clock->error = 0; clock->xtime_nsec = 0; -- cgit v1.2.3 From f1b82746c1e93daf24e1ab9bfbd39bcdb2e7018b Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:21 +0200 Subject: clocksource: Cleanup clocksource selection If a non high-resolution clocksource is first set as override clock and then registered it becomes active even if the system is in one-shot mode. 
Move the override check from sysfs_override_clocksource to the clocksource selection. That fixes the bug and simplifies the code. The check in clocksource_register for double registration of the same clocksource is removed without replacement. To find the initial clocksource a new weak function in jiffies.c is defined that returns the jiffies clocksource. The architecture code can then override the weak function with a more suitable clocksource, e.g. the TOD clock on s390. [ tglx: Folded in a fix from John Stultz ] Signed-off-by: Martin Schwidefsky Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.388024160@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 134 ++++++++++++++++++---------------------------- kernel/time/jiffies.c | 6 ++- kernel/time/timekeeping.c | 4 +- 3 files changed, 58 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7466cb811251..e91662e87cde 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -21,7 +21,6 @@ * * TODO WishList: * o Allow clocksource drivers to be unregistered - * o get rid of clocksource_jiffies extern */ #include @@ -107,12 +106,9 @@ u64 timecounter_cyc2time(struct timecounter *tc, } EXPORT_SYMBOL(timecounter_cyc2time); -/* XXX - Would like a better way for initializing curr_clocksource */ -extern struct clocksource clocksource_jiffies; - /*[Clocksource internal variables]--------- * curr_clocksource: - * currently selected clocksource. Initialized to clocksource_jiffies. + * currently selected clocksource. * next_clocksource: * pending next selected clocksource. * clocksource_list: @@ -123,9 +119,8 @@ extern struct clocksource clocksource_jiffies; * override_name: * Name of the user-specified clocksource. 
*/ -static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *curr_clocksource; static struct clocksource *next_clocksource; -static struct clocksource *clocksource_override; static LIST_HEAD(clocksource_list); static DEFINE_SPINLOCK(clocksource_lock); static char override_name[32]; @@ -320,6 +315,7 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +#ifdef CONFIG_GENERIC_TIME /** * clocksource_get_next - Returns the selected clocksource * @@ -339,56 +335,65 @@ struct clocksource *clocksource_get_next(void) } /** - * select_clocksource - Selects the best registered clocksource. + * clocksource_select - Select the best clocksource available * * Private function. Must hold clocksource_lock when called. * * Select the clocksource with the best rating, or the clocksource, * which is selected by userspace override. */ -static struct clocksource *select_clocksource(void) +static void clocksource_select(void) { - struct clocksource *next; + struct clocksource *best, *cs; if (list_empty(&clocksource_list)) - return NULL; + return; + /* First clocksource on the list has the best rating. */ + best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + tick_oneshot_mode_active()) { + /* Override clocksource cannot be used. */ + printk(KERN_WARNING "Override clocksource %s is not " + "HRT compatible. Cannot switch while in " + "HRT/NOHZ mode\n", cs->name); + override_name[0] = 0; + } else + /* Override clocksource can be used. 
*/ + best = cs; + break; + } + if (curr_clocksource != best) + next_clocksource = best; +} - if (clocksource_override) - next = clocksource_override; - else - next = list_entry(clocksource_list.next, struct clocksource, - list); +#else /* CONFIG_GENERIC_TIME */ - if (next == curr_clocksource) - return NULL; +static void clocksource_select(void) { } - return next; -} +#endif /* * Enqueue the clocksource sorted by rating */ -static int clocksource_enqueue(struct clocksource *c) +static void clocksource_enqueue(struct clocksource *cs) { - struct list_head *tmp, *entry = &clocksource_list; - - list_for_each(tmp, &clocksource_list) { - struct clocksource *cs; + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; - cs = list_entry(tmp, struct clocksource, list); - if (cs == c) - return -EBUSY; + list_for_each_entry(tmp, &clocksource_list, list) /* Keep track of the place, where to insert */ - if (cs->rating >= c->rating) - entry = tmp; - } - list_add(&c->list, entry); - - if (strlen(c->name) == strlen(override_name) && - !strcmp(c->name, override_name)) - clocksource_override = c; - - return 0; + if (tmp->rating >= cs->rating) + entry = &tmp->list; + list_add(&cs->list, entry); } /** @@ -397,19 +402,16 @@ static int clocksource_enqueue(struct clocksource *c) * * Returns -EBUSY if registration fails, zero otherwise. 
*/ -int clocksource_register(struct clocksource *c) +int clocksource_register(struct clocksource *cs) { unsigned long flags; - int ret; spin_lock_irqsave(&clocksource_lock, flags); - ret = clocksource_enqueue(c); - if (!ret) - next_clocksource = select_clocksource(); + clocksource_enqueue(cs); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); - if (!ret) - clocksource_check_watchdog(c); - return ret; + clocksource_check_watchdog(cs); + return 0; } EXPORT_SYMBOL(clocksource_register); @@ -425,7 +427,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) list_del(&cs->list); cs->rating = rating; clocksource_enqueue(cs); - next_clocksource = select_clocksource(); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } @@ -438,9 +440,7 @@ void clocksource_unregister(struct clocksource *cs) spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); - if (clocksource_override == cs) - clocksource_override = NULL; - next_clocksource = select_clocksource(); + clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } @@ -478,9 +478,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct clocksource *ovr = NULL; size_t ret = count; - int len; /* strings from sysfs write are not 0 terminated! 
*/ if (count >= sizeof(override_name)) @@ -495,37 +493,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, if (count > 0) memcpy(override_name, buf, count); override_name[count] = 0; - - len = strlen(override_name); - if (len) { - struct clocksource *cs; - - ovr = clocksource_override; - /* try to select it: */ - list_for_each_entry(cs, &clocksource_list, list) { - if (strlen(cs->name) == len && - !strcmp(cs->name, override_name)) - ovr = cs; - } - } - - /* - * Check to make sure we don't switch to a non-highres capable - * clocksource if the tick code is in oneshot mode (highres or nohz) - */ - if (tick_oneshot_mode_active() && ovr && - !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) { - printk(KERN_WARNING "%s clocksource is not HRT compatible. " - "Cannot switch while in HRT/NOHZ mode\n", ovr->name); - ovr = NULL; - override_name[0] = 0; - } - - /* Reselect, when the override name has changed */ - if (ovr != clocksource_override) { - clocksource_override = ovr; - next_clocksource = select_clocksource(); - } + clocksource_select(); spin_unlock_irq(&clocksource_lock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index c3f6c30816e3..5404a8456909 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ - .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; @@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void) } core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b5673016089f..325a9b63265a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -269,7 +269,7 @@ static void change_clocksource(void) new = clocksource_get_next(); - if (clock == 
new) + if (!new || clock == new) return; clocksource_forward_now(); @@ -446,7 +446,7 @@ void __init timekeeping_init(void) ntp_init(); - clock = clocksource_get_next(); + clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); /* set mult_orig on enable */ -- cgit v1.2.3 From 8cf4e750f8459d51c2e8a035a201da4bf7aa996a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:22 +0200 Subject: clocksource: Delay clocksource watchdog highres enablement The clocksource watchdog marks a clock as highres capable before it checked the deviation from the watchdog clocksource even for a single time. Make sure that the deviation is at least checked once before doing the switch to highres mode. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.627795883@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e91662e87cde..76256c5aecb8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -153,11 +153,8 @@ static unsigned long watchdog_resumed; #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) -static void clocksource_ratewd(struct clocksource *cs, int64_t delta) +static void clocksource_unstable(struct clocksource *cs, int64_t delta) { - if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) - return; - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); @@ -183,31 +180,31 @@ static void clocksource_watchdog(unsigned long data) list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { csnow = cs->read(cs); - if (unlikely(resumed)) { + /* Clocksource initialized ? 
*/ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { + cs->flags |= CLOCK_SOURCE_WATCHDOG; cs->wd_last = csnow; continue; } - /* Initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { - if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - /* - * We just marked the clocksource as - * highres-capable, notify the rest of the - * system as well so that we transition - * into high-res mode: - */ - tick_clock_notify(); - } - cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = csnow; - } else { - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); - cs->wd_last = csnow; - /* Check the delta. Might remove from the list ! */ - clocksource_ratewd(cs, cs_nsec - wd_nsec); + /* Check the deviation from the watchdog clocksource. */ + cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs->wd_last = csnow; + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + clocksource_unstable(cs, cs_nsec - wd_nsec); + continue; + } + + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* + * We just marked the clocksource as highres-capable, + * notify the rest of the system as well so that we + * transition into high-res mode: + */ + tick_clock_notify(); } } -- cgit v1.2.3 From 0f8e8ef7c204988246da5a42d576b7fa5277a8e4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:23 +0200 Subject: clocksource: Simplify clocksource watchdog resume logic To resume the clocksource watchdog just remove the CLOCK_SOURCE_WATCHDOG bit from the watched clocksource. 
Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.880925790@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 76256c5aecb8..89a7b91bfbdd 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,7 +145,6 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; -static unsigned long watchdog_resumed; /* * Interval: 0.5sec Threshold: 0.0625s @@ -167,12 +166,9 @@ static void clocksource_watchdog(unsigned long data) struct clocksource *cs, *tmp; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; - int resumed; spin_lock(&watchdog_lock); - resumed = test_and_clear_bit(0, &watchdog_resumed); - wdnow = watchdog->read(watchdog); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; @@ -223,14 +219,26 @@ static void clocksource_watchdog(unsigned long data) } spin_unlock(&watchdog_lock); } + +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + static void clocksource_resume_watchdog(void) { - set_bit(0, &watchdog_resumed); + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + clocksource_reset_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); } static void clocksource_check_watchdog(struct clocksource *cs) { - struct clocksource *cse; unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); @@ -256,8 +264,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_timer.function = clocksource_watchdog; /* Reset watchdog cycles */ - list_for_each_entry(cse, &watchdog_list, wd_list) - 
cse->flags &= ~CLOCK_SOURCE_WATCHDOG; + clocksource_reset_watchdog(); /* Start if list is not empty */ if (!list_empty(&watchdog_list)) { watchdog_last = watchdog->read(watchdog); -- cgit v1.2.3 From fb63a0ebe615fba9de8c75ea44ded999d1e24c65 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:24 +0200 Subject: clocksource: Refactor clocksource watchdog Refactor clocksource watchdog code to make it more readable. Add clocksource_dequeue_watchdog to remove a clocksource from the watchdog list when it is unregistered. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.110881699@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 97 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 69 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 89a7b91bfbdd..56aaa749645d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -145,6 +145,7 @@ static struct clocksource *watchdog; static struct timer_list watchdog_timer; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; +static int watchdog_running; /* * Interval: 0.5sec Threshold: 0.0625s @@ -168,6 +169,8 @@ static void clocksource_watchdog(unsigned long data) int64_t wd_nsec, cs_nsec; spin_lock(&watchdog_lock); + if (!watchdog_running) + goto out; wdnow = watchdog->read(watchdog); wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); @@ -217,9 +220,30 @@ static void clocksource_watchdog(unsigned long data) watchdog_timer.expires += WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, next_cpu); } +out: spin_unlock(&watchdog_lock); } +static inline void clocksource_start_watchdog(void) +{ + if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + return; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + watchdog_last = 
watchdog->read(watchdog); + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + watchdog_running = 1; +} + +static inline void clocksource_stop_watchdog(void) +{ + if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + return; + del_timer(&watchdog_timer); + watchdog_running = 0; +} + static inline void clocksource_reset_watchdog(void) { struct clocksource *cs; @@ -237,55 +261,70 @@ static void clocksource_resume_watchdog(void) spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_check_watchdog(struct clocksource *cs) +static void clocksource_enqueue_watchdog(struct clocksource *cs) { unsigned long flags; spin_lock_irqsave(&watchdog_lock, flags); if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { - int started = !list_empty(&watchdog_list); - + /* cs is a clocksource to be watched. */ list_add(&cs->wd_list, &watchdog_list); - if (!started && watchdog) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; } else { + /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - + /* Pick the best watchdog. */ if (!watchdog || cs->rating > watchdog->rating) { - if (watchdog) - del_timer(&watchdog_timer); watchdog = cs; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; - /* Reset watchdog cycles */ clocksource_reset_watchdog(); - /* Start if list is not empty */ - if (!list_empty(&watchdog_list)) { - watchdog_last = watchdog->read(watchdog); - watchdog_timer.expires = - jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - cpumask_first(cpu_online_mask)); - } } } + /* Check if the watchdog timer needs to be started. 
*/ + clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } -#else -static void clocksource_check_watchdog(struct clocksource *cs) + +static void clocksource_dequeue_watchdog(struct clocksource *cs) +{ + struct clocksource *tmp; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + } else if (cs == watchdog) { + /* Reset watchdog cycles */ + clocksource_reset_watchdog(); + /* Current watchdog is removed. Find an alternative. */ + watchdog = NULL; + list_for_each_entry(tmp, &clocksource_list, list) { + if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + if (!watchdog || tmp->rating > watchdog->rating) + watchdog = tmp; + } + } + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static void clocksource_enqueue_watchdog(struct clocksource *cs) { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -#endif + +#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ /** * clocksource_resume - resume the clocksource(s) @@ -414,14 +453,13 @@ int clocksource_register(struct clocksource *cs) clocksource_enqueue(cs); clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); - clocksource_check_watchdog(cs); + clocksource_enqueue_watchdog(cs); return 0; } EXPORT_SYMBOL(clocksource_register); /** * clocksource_change_rating - Change the rating of a registered clocksource - * */ void clocksource_change_rating(struct clocksource *cs, int rating) { @@ -434,6 +472,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) clocksource_select(); 
spin_unlock_irqrestore(&clocksource_lock, flags); } +EXPORT_SYMBOL(clocksource_change_rating); /** * clocksource_unregister - remove a registered clocksource @@ -442,11 +481,13 @@ void clocksource_unregister(struct clocksource *cs) { unsigned long flags; + clocksource_dequeue_watchdog(cs); spin_lock_irqsave(&clocksource_lock, flags); list_del(&cs->list); clocksource_select(); spin_unlock_irqrestore(&clocksource_lock, flags); } +EXPORT_SYMBOL(clocksource_unregister); #ifdef CONFIG_SYSFS /** -- cgit v1.2.3 From c55c87c892c1875deace0c8fc28787335277fdf2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:25 +0200 Subject: clocksource: Move watchdog downgrade to a work queue thread Move the downgrade of an unstable clocksource from the timer interrupt context into the process context of a work queue thread. This is needed to be able to do the clocksource switch with stop_machine. Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.354926067@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 56 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56aaa749645d..f1508019bfb4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -143,10 +143,13 @@ fs_initcall(clocksource_done_booting); static LIST_HEAD(watchdog_list); static struct clocksource *watchdog; static struct timer_list watchdog_timer; +static struct work_struct watchdog_work; static DEFINE_SPINLOCK(watchdog_lock); static cycle_t watchdog_last; static int watchdog_running; +static void clocksource_watchdog_work(struct work_struct *work); + /* * Interval: 0.5sec Threshold: 0.0625s */ @@ -158,15 +161,16 @@ static void clocksource_unstable(struct clocksource *cs, int64_t delta) printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 
cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); - clocksource_change_rating(cs, 0); - list_del(&cs->wd_list); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + schedule_work(&watchdog_work); } static void clocksource_watchdog(unsigned long data) { - struct clocksource *cs, *tmp; + struct clocksource *cs; cycle_t csnow, wdnow; int64_t wd_nsec, cs_nsec; + int next_cpu; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -176,7 +180,12 @@ static void clocksource_watchdog(unsigned long data) wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); watchdog_last = wdnow; - list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + list_for_each_entry(cs, &watchdog_list, wd_list) { + + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) + continue; + csnow = cs->read(cs); /* Clocksource initialized ? */ @@ -207,19 +216,15 @@ static void clocksource_watchdog(unsigned long data) } } - if (!list_empty(&watchdog_list)) { - /* - * Cycle through CPUs to check if the CPUs stay - * synchronized to each other. - */ - int next_cpu = cpumask_next(raw_smp_processor_id(), - cpu_online_mask); - - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); - } + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. 
+ */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); out: spin_unlock(&watchdog_lock); } @@ -228,6 +233,7 @@ static inline void clocksource_start_watchdog(void) { if (watchdog_running || !watchdog || list_empty(&watchdog_list)) return; + INIT_WORK(&watchdog_work, clocksource_watchdog_work); init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; watchdog_last = watchdog->read(watchdog); @@ -313,6 +319,22 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } +static void clocksource_watchdog_work(struct work_struct *work) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + clocksource_change_rating(cs, 0); + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock(&watchdog_lock); +} + #else /* CONFIG_CLOCKSOURCE_WATCHDOG */ static void clocksource_enqueue_watchdog(struct clocksource *cs) -- cgit v1.2.3 From 155ec60226ae0ae2aadaa57c951a58a359331030 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:26 +0200 Subject: timekeeping: Introduce struct timekeeper Add struct timekeeper to keep the internal values timekeeping.c needs in regard to the currently selected clock source. This moves the timekeeping intervals, xtime_nsec and the ntp error value from struct clocksource to struct timekeeper. The raw_time is removed from the clocksource as well. It gets treated like xtime as a global variable. Eventually xtime and raw_time should be moved to struct timekeeper.
[ tglx: minor cleanup ] Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134809.613209842@de.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 6 +- kernel/time/timekeeping.c | 235 ++++++++++++++++++++++++++++++---------------- 2 files changed, 160 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f1508019bfb4..f18c9a6bdcf4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -177,7 +177,8 @@ static void clocksource_watchdog(unsigned long data) goto out; wdnow = watchdog->read(watchdog); - wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); + wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, + watchdog->mult, watchdog->shift); watchdog_last = wdnow; list_for_each_entry(cs, &watchdog_list, wd_list) { @@ -196,7 +197,8 @@ static void clocksource_watchdog(unsigned long data) } /* Check the deviation from the watchdog clocksource. */ - cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask); + cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & + cs->mask, cs->mult, cs->shift); cs->wd_last = csnow; if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { clocksource_unstable(cs, cs_nsec - wd_nsec); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 325a9b63265a..7af45cbf6b13 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -19,6 +19,65 @@ #include #include +/* Structure holding internal timekeeping values. */ +struct timekeeper { + /* Current clocksource used for timekeeping. */ + struct clocksource *clock; + + /* Number of clock cycles in one NTP interval. */ + cycle_t cycle_interval; + /* Number of clock shifted nano seconds in one NTP interval. */ + u64 xtime_interval; + /* Raw nano seconds accumulated per NTP interval. 
*/ + u32 raw_interval; + + /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */ + u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp + * shifted nano seconds. */ + s64 ntp_error; +}; + +struct timekeeper timekeeper; + +/** + * timekeeper_setup_internals - Set up internals to use clocksource clock. + * + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void timekeeper_setup_internals(struct clocksource *clock) +{ + cycle_t interval; + u64 tmp; + + timekeeper.clock = clock; + clock->cycle_last = clock->read(clock); + + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + tmp += clock->mult_orig/2; + do_div(tmp, clock->mult_orig); + if (tmp == 0) + tmp = 1; + + interval = (cycle_t) tmp; + timekeeper.cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.raw_interval = + ((u64) interval * clock->mult_orig) >> clock->shift; + + timekeeper.xtime_nsec = 0; + + timekeeper.ntp_error = 0; +} /* * This read-write spinlock protects us from races in SMP while @@ -46,6 +105,11 @@ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ +/* + * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
+ */ +struct timespec raw_time; + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -56,42 +120,42 @@ void update_xtime_cache(u64 nsec) timespec_add_ns(&xtime_cache, nsec); } -struct clocksource *clock; - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); } #ifdef CONFIG_GENERIC_TIME /** - * clocksource_forward_now - update clock to the current time + * timekeeping_forward_now - update clock to the current time * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void clocksource_forward_now(void) +static void timekeeping_forward_now(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; s64 nsec; + clock = timekeeper.clock; cycle_now = clock->read(clock); cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; clock->cycle_last = cycle_now; - nsec = cyc2ns(clock, cycle_delta); + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); timespec_add_ns(&xtime, nsec); - nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; - clock->raw_time.tv_nsec += nsec; + nsec = clocksource_cyc2ns(cycle_delta, clock->mult_orig, clock->shift); + timespec_add_ns(&raw_time, nsec); } /** @@ -103,6 +167,7 @@ static void clocksource_forward_now(void) void getnstimeofday(struct timespec *ts) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; unsigned long seq; s64 nsecs; @@ -114,13 +179,15 @@ void getnstimeofday(struct timespec *ts) *ts = xtime; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta 
= (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -135,6 +202,7 @@ EXPORT_SYMBOL(getnstimeofday); ktime_t ktime_get(void) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; unsigned int seq; s64 secs, nsecs; @@ -146,13 +214,15 @@ ktime_t ktime_get(void) nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs += cyc2ns(clock, cycle_delta); + nsecs += clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); } while (read_seqretry(&xtime_lock, seq)); /* @@ -174,6 +244,7 @@ EXPORT_SYMBOL_GPL(ktime_get); void ktime_get_ts(struct timespec *ts) { cycle_t cycle_now, cycle_delta; + struct clocksource *clock; struct timespec tomono; unsigned int seq; s64 nsecs; @@ -186,13 +257,15 @@ void ktime_get_ts(struct timespec *ts) tomono = wall_to_monotonic; /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = cyc2ns(clock, cycle_delta); + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult, + clock->shift); } while (read_seqretry(&xtime_lock, seq)); @@ -233,7 +306,7 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; @@ -243,10 +316,10 @@ int do_settimeofday(struct timespec *tv) update_xtime_cache(0); - clock->error = 0; + timekeeper.ntp_error = 0; 
ntp_clear(); - update_vsyscall(&xtime, clock); + update_vsyscall(&xtime, timekeeper.clock); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -269,10 +342,10 @@ static void change_clocksource(void) new = clocksource_get_next(); - if (!new || clock == new) + if (!new || timekeeper.clock == new) return; - clocksource_forward_now(); + timekeeping_forward_now(); if (new->enable && !new->enable(new)) return; @@ -284,9 +357,9 @@ static void change_clocksource(void) */ new->mult_orig = new->mult; - new->raw_time = clock->raw_time; - old = clock; - clock = new; + old = timekeeper.clock; + timekeeper_setup_internals(new); + /* * Save mult_orig in mult so that the value can be restored * regardless if ->enable() updates the value of mult or not. @@ -295,22 +368,10 @@ static void change_clocksource(void) if (old->disable) old->disable(old); - clock->cycle_last = clock->read(clock); - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - tick_clock_notify(); - - /* - * We're holding xtime lock and waking up klogd would deadlock - * us on enqueue. So no printing! 
- printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); - */ } #else /* GENERIC_TIME */ -static inline void clocksource_forward_now(void) { } +static inline void timekeeping_forward_now(void) { } static inline void change_clocksource(void) { } /** @@ -380,20 +441,23 @@ void getrawmonotonic(struct timespec *ts) unsigned long seq; s64 nsecs; cycle_t cycle_now, cycle_delta; + struct clocksource *clock; do { seq = read_seqbegin(&xtime_lock); /* read clocksource: */ + clock = timekeeper.clock; cycle_now = clock->read(clock); /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; /* convert to nanoseconds: */ - nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + nsecs = clocksource_cyc2ns(cycle_delta, clock->mult_orig, + clock->shift); - *ts = clock->raw_time; + *ts = raw_time; } while (read_seqretry(&xtime_lock, seq)); @@ -413,7 +477,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqbegin(&xtime_lock); - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqretry(&xtime_lock, seq)); @@ -439,6 +503,7 @@ unsigned long __attribute__((weak)) read_persistent_clock(void) */ void __init timekeeping_init(void) { + struct clocksource *clock; unsigned long flags; unsigned long sec = read_persistent_clock(); @@ -451,11 +516,13 @@ void __init timekeeping_init(void) clock->enable(clock); /* set mult_orig on enable */ clock->mult_orig = clock->mult; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clock->read(clock); + + timekeeper_setup_internals(clock); xtime.tv_sec = sec; xtime.tv_nsec = 0; + raw_time.tv_sec = 0; + raw_time.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); @@ -492,8 +559,8 @@ static int timekeeping_resume(struct sys_device *dev) } update_xtime_cache(0); /* re-base the last cycle 
value */ - clock->cycle_last = clock->read(clock); - clock->error = 0; + timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); + timekeeper.ntp_error = 0; timekeeping_suspended = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -514,7 +581,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - clocksource_forward_now(); + timekeeping_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -549,7 +616,7 @@ device_initcall(timekeeping_init_device); * If the error is already larger, we look ahead even further * to compensate for late or lost adjustments. */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, +static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, s64 *offset) { s64 tick_error, i; @@ -565,7 +632,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -574,8 +641,9 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; + tick_error = tick_length >> + (NTP_SCALE_SHIFT - timekeeper.clock->shift + 1); + tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; /* Finally calculate the adjustment shift value. 
*/ @@ -600,18 +668,19 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. */ -static void clocksource_adjust(s64 offset) +static void timekeeping_adjust(s64 offset) { - s64 error, interval = clock->cycle_interval; + s64 error, interval = timekeeper.cycle_interval; int adj; - error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); + error = timekeeper.ntp_error >> + (NTP_SCALE_SHIFT - timekeeper.clock->shift - 1); if (error > interval) { error >>= 2; if (likely(error <= interval)) adj = 1; else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else if (error < -interval) { error >>= 2; if (likely(error >= -interval)) { @@ -619,15 +688,15 @@ static void clocksource_adjust(s64 offset) interval = -interval; offset = -offset; } else - adj = clocksource_bigadjust(error, &interval, &offset); + adj = timekeeping_bigadjust(error, &interval, &offset); } else return; - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (NTP_SCALE_SHIFT - clock->shift); + timekeeper.clock->mult += adj; + timekeeper.xtime_interval += interval; + timekeeper.xtime_nsec -= offset; + timekeeper.ntp_error -= (interval - offset) << + (NTP_SCALE_SHIFT - timekeeper.clock->shift); } /** @@ -637,53 +706,59 @@ static void clocksource_adjust(s64 offset) */ void update_wall_time(void) { + struct clocksource *clock; cycle_t offset; + s64 nsecs; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return; + clock = timekeeper.clock; #ifdef CONFIG_GENERIC_TIME offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #else - offset = clock->cycle_interval; + offset = timekeeper.cycle_interval; #endif - clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; + timekeeper.xtime_nsec = 
(s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will ac