path: root/kernel
author    Ingo Molnar <mingo@kernel.org>    2015-04-02 13:22:35 +0200
committer Ingo Molnar <mingo@kernel.org>    2015-04-02 13:22:35 +0200
commit    223aa646d5aa6096230a9481e4c6c9cb67cfc4a6 (patch)
tree      3e9013b2d6a099fabf4c5148dd9e522e04f8f073 /kernel
parent    aaa9fa387541a30f6f4138bea6e082319de5e3fe (diff)
parent    34f439278cef7b1177f8ce24f9fc81dfc6221d3b (diff)
Merge branch 'perf/timer' into perf/core
This WIP branch is now ready to be merged.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c       |  77
-rw-r--r--  kernel/sched/fair.c        |   8
-rw-r--r--  kernel/time/clockevents.c  |  88
-rw-r--r--  kernel/time/clocksource.c  | 170
-rw-r--r--  kernel/time/jiffies.c      |   5
-rw-r--r--  kernel/time/sched_clock.c  | 236
-rw-r--r--  kernel/time/timekeeping.c  | 345
-rw-r--r--  kernel/time/timer_list.c   |  32
8 files changed, 669 insertions(+), 292 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bb1a7c36e794..c40c2cac2d8e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -327,6 +327,11 @@ static inline u64 perf_clock(void)
return local_clock();
}
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+ return event->clock();
+}
+
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
@@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_TIME)
- data->time = perf_clock();
+ data->time = perf_event_clock(event);
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
@@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event,
task_event->event_id.tid = perf_event_tid(event, task);
task_event->event_id.ptid = perf_event_tid(event, current);
+ task_event->event_id.time = perf_event_clock(event);
+
perf_output_put(&handle, task_event->event_id);
perf_event__output_id_sample(event, &handle, &sample);
@@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task,
/* .ppid */
/* .tid */
/* .ptid */
- .time = perf_clock(),
+ /* .time */
},
};
@@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
.misc = 0,
.size = sizeof(throttle_event),
},
- .time = perf_clock(),
+ .time = perf_event_clock(event),
.id = primary_event_id(event),
.stream_id = event->id,
};
@@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event)
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = perf_swevent_init,
.add = perf_swevent_add,
.del = perf_swevent_del,
@@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event)
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
.del = cpu_clock_event_del,
@@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event)
static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
+ .capabilities = PERF_PMU_CAP_NO_NMI,
+
.event_init = task_clock_event_init,
.add = task_clock_event_add,
.del = task_clock_event_del,
@@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->hw.target = task;
}
+ event->clock = &local_clock;
+ if (parent_event)
+ event->clock = parent_event->clock;
+
if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
@@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
+ /*
+ * Mixing clocks in the same buffer is trouble you don't need.
+ */
+ if (output_event->clock != event->clock)
+ goto out;
+
set:
mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
@@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+ bool nmi_safe = false;
+
+ switch (clk_id) {
+ case CLOCK_MONOTONIC:
+ event->clock = &ktime_get_mono_fast_ns;
+ nmi_safe = true;
+ break;
+
+ case CLOCK_MONOTONIC_RAW:
+ event->clock = &ktime_get_raw_fast_ns;
+ nmi_safe = true;
+ break;
+
+ case CLOCK_REALTIME:
+ event->clock = &ktime_get_real_ns;
+ break;
+
+ case CLOCK_BOOTTIME:
+ event->clock = &ktime_get_boot_ns;
+ break;
+
+ case CLOCK_TAI:
+ event->clock = &ktime_get_tai_ns;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+ return -EINVAL;
+
+ return 0;
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open,
*/
pmu = event->pmu;
+ if (attr.use_clockid) {
+ err = perf_event_set_clock(event, attr.clockid);
+ if (err)
+ goto err_alloc;
+ }
+
if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
@@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (group_leader->group_leader != group_leader)
goto err_context;
+
+ /* All events in a group should have the same clock */
+ if (group_leader->clock != event->clock)
+ goto err_context;
+
/*
* Do not allow to attach to a group in a different
* task or CPU context:
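
For context: a minimal userspace sketch (not part of this patch) of how the new use_clockid/clockid fields in struct perf_event_attr could be consumed once this series lands. The perf_event_open() wrapper, the includes, and the choice of software event are illustrative assumptions; a uapi perf_event.h that already carries this series is assumed.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Thin wrapper; glibc provides no perf_event_open() stub. */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;		/* illustrative choice of event */
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.sample_type = PERF_SAMPLE_TIME;

	attr.use_clockid = 1;			/* request an explicit sample clock */
	attr.clockid = CLOCK_MONOTONIC_RAW;	/* NMI-safe, accepted by any PMU */

	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}

With an NMI-capable hardware PMU only the NMI-safe clocks (CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW) are accepted; the software PMUs above gain PERF_PMU_CAP_NO_NMI so they can also take CLOCK_REALTIME, CLOCK_BOOTTIME and CLOCK_TAI.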
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3c097a..bcfe32088b37 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p,
/*
* If there were no record hinting faults then either the task is
* completely idle or all activity is areas that are not of interest
- * to automatic numa balancing. Scan slower
+ * to automatic numa balancing. Related to that, if there were failed
+ * migrations then it implies we are migrating too quickly or the local
+ * node is overloaded. In either case, scan slower
*/
- if (local + shared == 0) {
+ if (local + shared == 0 || p->numa_faults_locality[2]) {
p->numa_scan_period = min(p->numa_scan_period_max,
p->numa_scan_period << 1);
@@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
+ if (flags & TNF_MIGRATE_FAIL)
+ p->numa_faults_locality[2] += pages;
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 55449909f114..489642b08d64 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,6 +94,57 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
+static int __clockevents_set_mode(struct clock_event_device *dev,
+ enum clock_event_mode mode)
+{
+ /* Transition with legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* Legacy callback doesn't support new modes */
+ if (mode > CLOCK_EVT_MODE_RESUME)
+ return -ENOSYS;
+ dev->set_mode(mode, dev);
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* Transition with new mode-specific callbacks */
+ switch (mode) {
+ case CLOCK_EVT_MODE_UNUSED:
+ /*
+ * This is an internal state, which is guaranteed to go from
+ * SHUTDOWN to UNUSED. No driver interaction required.
+ */
+ return 0;
+
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ return dev->set_mode_shutdown(dev);
+
+ case CLOCK_EVT_MODE_PERIODIC:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
+ return -ENOSYS;
+ return dev->set_mode_periodic(dev);
+
+ case CLOCK_EVT_MODE_ONESHOT:
+ /* Core internal bug */
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return -ENOSYS;
+ return dev->set_mode_oneshot(dev);
+
+ case CLOCK_EVT_MODE_RESUME:
+ /* Optional callback */
+ if (dev->set_mode_resume)
+ return dev->set_mode_resume(dev);
+ else
+ return 0;
+
+ default:
+ return -ENOSYS;
+ }
+}
+
/**
* clockevents_set_mode - set the operating mode of a clock event device
* @dev: device to modify
@@ -105,7 +156,9 @@ void clockevents_set_mode(struct clock_event_device *dev,
enum clock_event_mode mode)
{
if (dev->mode != mode) {
- dev->set_mode(mode, dev);
+ if (__clockevents_set_mode(dev, mode))
+ return;
+
dev->mode = mode;
/*
@@ -373,6 +426,35 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
}
EXPORT_SYMBOL_GPL(clockevents_unbind);
+/* Sanity check of mode transition callbacks */
+static int clockevents_sanity_check(struct clock_event_device *dev)
+{
+ /* Legacy set_mode() callback */
+ if (dev->set_mode) {
+ /* We shouldn't be supporting new modes now */
+ WARN_ON(dev->set_mode_periodic || dev->set_mode_oneshot ||
+ dev->set_mode_shutdown || dev->set_mode_resume);
+ return 0;
+ }
+
+ if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
+
+ /* New mode-specific callbacks */
+ if (!dev->set_mode_shutdown)
+ return -EINVAL;
+
+ if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+ !dev->set_mode_periodic)
+ return -EINVAL;
+
+ if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+ !dev->set_mode_oneshot)
+ return -EINVAL;
+
+ return 0;
+}
+
/**
* clockevents_register_device - register a clock event device
* @dev: device to register
@@ -382,6 +464,8 @@ void clockevents_register_device(struct clock_event_device *dev)
unsigned long flags;
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+ BUG_ON(clockevents_sanity_check(dev));
+
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
dev->cpumask = cpumask_of(smp_processor_id());
@@ -449,7 +533,7 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
return clockevents_program_event(dev, dev->next_event, false);
if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
- dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
+ return __clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
return 0;
}
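
For context: a hypothetical driver sketch (not part of this patch) of the per-mode callbacks that clockevents_sanity_check() now expects in place of a single legacy set_mode() hook. The foo_* names and the feature set are illustrative assumptions.

#include <linux/clockchips.h>

/* Hypothetical hardware accessors; a real driver would poke its timer here. */
static int foo_timer_shutdown(struct clock_event_device *evt)
{
	/* stop the hardware timer */
	return 0;
}

static int foo_timer_set_periodic(struct clock_event_device *evt)
{
	/* program a periodic tick at the configured rate */
	return 0;
}

static int foo_timer_set_oneshot(struct clock_event_device *evt)
{
	/* switch the hardware into one-shot mode */
	return 0;
}

static struct clock_event_device foo_clockevent = {
	.name			= "foo-timer",
	.features		= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.set_mode_shutdown	= foo_timer_shutdown,
	.set_mode_periodic	= foo_timer_set_periodic,
	.set_mode_oneshot	= foo_timer_set_oneshot,
	/* .set_mode_resume is optional and may stay NULL */
};

A real driver would also fill in .rating, .cpumask and .set_next_event before registering; only the mode callbacks relevant to the new sanity check are shown here.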
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4892352f0e49..c3be3c71bbad 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs)
schedule_work(&watchdog_work);
}
-static void clocksource_unstable(struct clocksource *cs, int64_t delta)
-{
- printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
- cs->name, delta);
- __clocksource_unstable(cs);
-}
-
/**
* clocksource_mark_unstable - mark clocksource unstable via watchdog
* @cs: clocksource to be marked unstable
@@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
- cycle_t csnow, wdnow, delta;
+ cycle_t csnow, wdnow, cslast, wdlast, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
@@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data)
delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+ wdlast = cs->wd_last; /* save these in case we print them */
+ cslast = cs->cs_last;
cs->cs_last = csnow;
cs->wd_last = wdnow;
@@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data)
/* Check the deviation from the watchdog clocksource. */
if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
- clocksource_unstable(cs, cs_nsec - wd_nsec);
+ pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
+ pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+ watchdog->name, wdnow, wdlast, watchdog->mask);
+ pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+ cs->name, csnow, cslast, cs->mask);
+ __clocksource_unstable(cs);
continue;
}
@@ -469,26 +469,22 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
* @shift: cycle to nanosecond divisor (power of two)
* @maxadj: maximum adjustment value to mult (~11%)
* @mask: bitmask for two's complement subtraction of non 64 bit counters
+ * @max_cyc: maximum cycle value before potential overflow (does not include
+ * any safety margin)
+ *
+ * NOTE: This function includes a safety margin of 50%, so that bad clock values
+ * can be detected.
*/
-u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
u64 max_nsecs, max_cycles;
/*
* Calculate the maximum number of cycles that we can pass to the
- * cyc2ns function without overflowing a 64-bit signed result. The
- * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
- * which is equivalent to the below.
- * max_cycles < (2^63)/(mult + maxadj)
- * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
- * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
- * max_cycles < 2^(63 - log2(mult + maxadj))
- * max_cycles < 1 << (63 - log2(mult + maxadj))
- * Please note that we add 1 to the result of the log2 to account for
- * any rounding errors, ensure the above inequality is satisfied and
- * no overflow will occur.
+ * cyc2ns() function without overflowing a 64-bit result.
*/
- max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
+ max_cycles = ULLONG_MAX;
+ do_div(max_cycles, mult+maxadj);
/*
* The actual maximum number of cycles we can defer the clocksource is
@@ -499,27 +495,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
max_cycles = min(max_cycles, mask);
max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
+ /* return the max_cycles value as well if requested */
+ if (max_cyc)
+ *max_cyc = max_cycles;
+
+ /* Return 50% of the actual maximum, so we can detect bad values */
+ max_nsecs >>= 1;
+
return max_nsecs;
}
/**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs: Pointer to clocksource
+ * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
+ * @cs: Pointer to clocksource to be updated
*
*/
-static u64 clocksource_max_deferment(struct clocksource *cs)
+static inline void clocksource_update_max_deferment(struct clocksource *cs)
{
- u64 max_nsecs;
-
- max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
- cs->mask);
- /*
- * To ensure that the clocksource does not wrap whilst we are idle,
- * limit the time the clocksource can be deferred by 12.5%. Please
- * note a margin of 12.5% is used because this can be computed with
- * a shift, versus say 10% which would require division.
- */
- return max_nsecs - (max_nsecs >> 3);
+ cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
+ cs->maxadj, cs->mask,
+ &cs->max_cycles);
}
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -648,7 +643,7 @@ static void clocksource_enqueue(struct clocksource *cs)
}
/**
- * __clocksource_updatefreq_scale - Used update clocksource with new freq
+ * __clocksource_update_freq_scale - Used update clocksource with new freq
* @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
@@ -656,48 +651,64 @@ static void clocksource_enqueue(struct clocksource *cs)
* This should only be called from the clocksource->enable() method.
*
* This *SHOULD NOT* be called directly! Please use the
- * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
+ * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
+ * functions.
*/
-void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
u64 sec;
+
/*
- * Calc the maximum number of seconds which we can run before
- * wrapping around. For clocksources which have a mask > 32bit
- * we need to limit the max sleep time to have a good
- * conversion precision. 10 minutes is still a reasonable
- * amount. That results in a shift value of 24 for a
- * clocksource with mask >= 40bit and f >= 4GHz. That maps to
- * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
- * margin as we do in clocksource_max_deferment()
+ * Default clocksources are *special* and self-define their mult/shift.
+ * But, you're not special, so you should specify a freq value.
*/
- sec = (cs->mask - (cs->mask >> 3));
- do_div(sec, freq);
- do_div(sec, scale);
- if (!sec)
- sec = 1;
- else if (sec > 600 && cs->mask > UINT_MAX)
- sec = 600;
-
- clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
- NSEC_PER_SEC / scale, sec * scale);
-
+ if (freq) {
+ /*
+ * Calc the maximum number of seconds which we can run before
+ * wrapping around. For clocksources which have a mask > 32-bit
+ * we need to limit the max sleep time to have a good
+ * conversion precision. 10 minutes is still a reasonable
+ * amount. That results in a shift value of 24 for a
+ * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
+ * ~ 0.06ppm granularity for NTP.
+ */
+ sec = cs->mask;
+ do_div(sec, freq);
+ do_div(sec, scale);
+ if (!sec)
+ sec = 1;
+ else if (sec > 600 && cs->mask > UINT_MAX)
+ sec = 600;
+
+ clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+ NSEC_PER_SEC / scale, sec * scale);
+ }
/*
- * for clocksources that have large mults, to avoid overflow.
- * Since mult may be adjusted by ntp, add an safety extra margin
- *
+ * Ensure clocksources that have large 'mult' values don't overflow
+ * when adjusted.
*/
cs->maxadj = clocksource_max_adjustment(cs);
- while ((cs->mult + cs->maxadj < cs->mult)
- || (cs->mult - cs->maxadj > cs->mult)) {
+ while (freq && ((cs->mult + cs->maxadj < cs->mult)
+ || (cs->mult - cs->maxadj > cs->mult))) {
cs->mult >>= 1;
cs->shift--;
cs->maxadj = clocksource_max_adjustment(cs);
}
- cs->max_idle_ns = clocksource_max_deferment(cs);
+ /*
+ * Only warn for *special* clocksources that self-define
+ * their mult/shift values and don't specify a freq.
+ */
+ WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+ "timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
+ cs->name);
+
+ clocksource_update_max_deferment(cs);
+
+ pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+ cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
-EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
/**
* __clocksource_register_scale - Used to install new clocksources
@@ -714,7 +725,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
/* Initialize mult/shift and max_idle_ns */
- __clocksource_updatefreq_scale(cs, scale, freq);
+ __clocksource_update_freq_scale(cs, scale, freq);
/* Add clocksource to the clocksource list */
mutex_lock(&clocksource_mutex);
@@ -726,33 +737,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
-
-/**
- * clocksource_register - Used to install new clocksources
- * @cs: clocksource to be registered
- *
- * Returns -EBUSY if registration fails, zero otherwise.
- */
-int clocksource_register(struct clocksource *cs)
-{
- /* calculate max adjustment for given mult/shift */
- cs->maxadj = clocksource_max_adjustment(cs);
- WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
- "Clocksource %s might overflow on 11%% adjustment\n",
- cs->name);
-
- /* calculate max idle time permitted for this clocksource */
- cs->max_idle_ns = clocksource_max_deferment(cs);
-
- mutex_lock(&clocksource_mutex);
- clocksource_enqueue(cs);
- clocksource_enqueue_watchdog(cs);
- clocksource_select();
- mutex_unlock(&clocksource_mutex);
- return 0;
-}
-EXPORT_SYMBOL(clocksource_register);
-
static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
list_del(&cs->list);
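
For context: a small standalone sketch (not part of this patch) of the arithmetic clocks_calc_max_nsecs() now performs, i.e. an exact division by (mult + maxadj) instead of the old power-of-two approximation, followed by the 50% safety margin. The mult/shift/mask values are made-up examples for a roughly 4GHz counter with a 56-bit mask.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Made-up example values: ~4GHz counter (0.25 ns/cycle), 56-bit mask. */
	uint32_t mult = 4194304;		/* 2^22: cyc2ns = (cyc * mult) >> shift */
	uint32_t shift = 24;
	uint32_t maxadj = mult * 11 / 100;	/* ~11% adjustment headroom */
	uint64_t mask = (1ULL << 56) - 1;

	/* Largest cycle count whose cyc2ns conversion cannot overflow 64 bits. */
	uint64_t max_cycles = UINT64_MAX / (mult + maxadj);

	/* Never defer past the point where the counter itself wraps. */
	if (max_cycles > mask)
		max_cycles = mask;

	/* Convert with the smallest effective mult (worst case for deferment). */
	uint64_t max_nsecs = (max_cycles * (mult - maxadj)) >> shift;

	/* Keep only 50% of that, so bad or delayed values can still be detected. */
	max_nsecs >>= 1;

	printf("max_cycles=%llu max_idle_ns=%llu\n",
	       (unsigned long long)max_cycles,
	       (unsigned long long)max_nsecs);
	return 0;
}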
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a6a5bf53e86d..c4bb518725b5 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
.mask = 0xffffffff, /*32bits*/
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
+ .max_cycles = 10,
};
__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
static int __init init_jiffies_clocksource(void)
{
- return clocksource_register(&clocksource_jiffies);
+ return __clocksource_register(&clocksource_jiffies);
}
core_initcall(init_jiffies_clocksource);
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
- clocksource_register(&refined_jiffies);
+ __clocksource_register(&refined_jiffies);
return 0;
}
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a26036d37a38 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,5 +1,6 @@
/*
- * sched_clock.c: support for extending counters to full 64-bit ns counter
+ * sched_clock.c: Generic sched_clock() support, to extend low level
+ * hardware time counters to full 64-bit ns values.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -18,15 +19,53 @@
#include <linux/seqlock.h>
#include <linux/bitops.h>
-struct clock_data {
- ktime_t wrap_kt;
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns: sched_clock() value at last update
+ * @epoch_cyc: Clock cycle value at last update.
+ * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
+ * clocks.
+ * @read_sched_clock: Current clock source (or dummy source when suspended).
+ * @mult: Multiplier for scaled math conversion.
+ * @shift: Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
u64 epoch_ns;
u64 epoch_cyc;
- seqcount_t seq;
- unsigned long rate;
+ u64 sched_clock_mask;
+ u64 (*read_sched_clock)(void);
u32 mult;
u32 shift;
- bool suspended;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ * registration of a new clock source)
+ *
+ * @seq: Sequence counter for protecting updates. The lowest
+ * bit is the index for @read_data.
+ * @read_data: Data required to read from sched_clock.
+ * @wrap_kt: Duration for which clock can run before wrapping.
+ * @rate: Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+ seqcount_t seq;
+ struct clock_read_data read_data[2];
+ ktime_t wrap_kt;
+ unsigned long rate;
+
+ u64 (*actual_read_sched_clock)(void);
};
static struct hrtimer sched_clock_timer;
@@ -34,12 +73,6 @@ static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
-static struct clock_data cd = {
- .mult = NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
static u64 notrace jiffy_sched_clock_read(void)
{
/*
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
return (u64)(jiffies - INITIAL_JIFFIES);
}
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd ____cacheline_aligned = {
+ .read_data[0] = { .mult = NSEC_PER_SEC / HZ,
+ .read_sched_clock = jiffy_sched_clock_read, },
+ .actual_read_sched_clock = jiffy_sched_clock_read,
+};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void)
{
- u64 epoch_ns;
- u64 epoch_cyc;
- u64 cyc;
+ u64 cyc, res;
unsigned long seq;
-
- if (cd.suspended)
- return cd.epoch_ns;
+ struct clock_read_data *rd;
do {
- seq = raw_read_seqcount_begin(&cd.seq);
- epoch_cyc = cd.epoch_cyc;
- epoch_ns = cd.epoch_ns;
+ seq = raw_read_seqcount(&cd.seq);
+ rd = cd.read_data + (seq & 1);
+
+ cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
+ rd->sched_clock_mask;
+ res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
} while (read_seqcount_retry(&cd.seq, seq));
- cyc = read_sched_clock();
- cyc = (cyc - epoch_cyc) & sched_clock_mask;
- return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+ return res;
+}
+
+/*
+ * Updating the data required to read the clock.
+ *
+ * sched_clock() will never observe mis-matched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock() to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock() as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ */
+static void update_clock_read_data(struct clock_read_data *rd)
+{
+ /* update the backup (odd) copy with the new data */
+ cd.read_data[1] = *rd;
+
+ /* steer readers towards the odd copy */
+ raw_write_seqcount_latch(&cd.seq);
+
+ /* now it's safe for us to update the normal (even) copy */
+ cd.read_data[0] = *rd;
+
+ /* switch readers back to the even copy */
+ raw_write_seqcount_latch(&cd.seq);
}
/*
- * Atomically update the sched_clock epoch.
+ * Atomically update the sched_clock() epoch.
*/
-static void notrace update_sched_clock(void)
+static void update_sched_clock(void)
{
- unsigned long flags;
u64 cyc;
u64 ns;
+ struct clock_read_data rd;
+
+ rd = cd.read_data[0];
+
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+
+ rd.epoch_ns = ns;
+ rd.epoch_cyc = cyc;
- cyc = read_sched_clock();
- ns = cd.epoch_ns +
- cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
- cd.mult, cd.shift);
-
- raw_local_irq_save(flags);
- raw_write_seqcount_begin(&cd.seq);
- cd.epoch_ns = ns;
- cd.epoch_cyc = cyc;
- raw_write_seqcount_end(&cd.seq);
- raw_local_irq_restore(flags);
+ update_clock_read_data(&rd);
}
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
{
update_sched_clock();
hrtimer_forward_now(hrt, cd.wrap_kt);
+
return HRTIMER_RESTART;
}
-void __init sched_clock_register(u64 (*read)(void), int bits,
- unsigned long rate)
+void __init
+sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
- ktime_t new_wrap_kt;
unsigned long r;
char r_unit;
+ struct clock_read_data rd;
if (cd.rate > rate)
return;
WARN_ON(!irqs_disabled());
- /* calculate the mult/shift to convert counter ticks to ns. */
+ /* Calculate the mult/shift to convert counter ticks to ns. */
clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
new_mask = CLOCKSOURCE_MASK(bits);
+ cd.rate = rate;
+
+ /* Calculate how many nanosecs until we risk wrapping */
+ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
+ cd.wrap_kt = ns_to_ktime(wrap);
- /* calculate how many ns until we wrap */
- wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
- new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+ rd = cd.read_data[0];
- /* update epoch for new counter and update epoch_ns from old counter*/
+ /* Update epoch for new counter and update 'epoch_ns' from old counter*/
new_epoch = read();
- cyc = read_sched_clock();
- ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
- cd.mult, cd.shift);
+ cyc = cd.actual_read_sched_clock();
+ ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+ cd.actual_read_sched_clock = read;
- raw_write_seqcount_begin(&cd.seq);
- read_sched_clock = read;
- sched_clock_mask = new_mask;
- cd.rate = rate;
- cd.wrap_kt = new_wrap_kt;
- cd.mult = new_mult;
- cd.shift = new_shift;
- cd.epoch_cyc = new_epoch;
- cd.epoch_ns = ns;
- raw_write_seqcount_end(&cd.seq);
+ rd.read_sched_clock = read;
+ rd.sched_clock_mask = new_mask;
+ rd.mult = new_mult;
+ rd.shift = new_shift;
+ rd.epoch_cyc = new_epoch;
+ rd.epoch_ns = ns;
+
+ update_clock_read_data(&rd);
r = rate;
if (r >= 4000000) {
r /= 1000000;
r_unit = 'M';
- } else if (r >= 1000) {
- r /= 1000;
- r_unit = 'k';
- } else
- r_unit = ' ';
-
- /* calculate the ns resolution of this counter */
+ } else {
+ if (r >= 1000) {
+ r /= 1000;
+ r_unit = 'k';
+ } else {
+ r_unit = ' ';
+ }
+ }
+
+ /* Calculate the ns resolution of this counter */
res = cyc_to_ns(1ULL, new_mult, new_shift);
pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
bits, r, r_unit, res, wrap);
- /* Enable IRQ time accounting if we have a fast enough sched_clock */
+ /* Enable IRQ time accounting if we have a fast enough sched_clock() */
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
void __init sched_clock_postinit(void)
{
/*
- * If no sched_clock function has been provided at that point,
+ * If no sched_clock() function has been provided at that point,
* make it the final one.
*/
- if (read_sched_clock == jiffy_sched_clock_read)
+ if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock();
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
}
+/*
+ * Clock read function for use when the clock is suspended.
+ *
+ * This function makes it appear to sched_clock() as if the clock
+ * stopped counting at its last update.
+ *
+ * This function must only be called from the critical
+ * section in sched_clock(). It relies on the read_seqcount_retry()
+ * at the end of the critical section to be sure we observe the
+ * correct copy of 'epoch_cyc'.
+ */
+static u64 notrace suspended_sched_clock_read(void)
+{
+ unsigned long seq = raw_read_seqcount(&cd.seq);
+
+ return cd.read_data[seq & 1].epoch_cyc;
+}
+
static int sched_clock_suspend(void)
{
+ struct clock_read_data *rd = &cd.read_data[0];
+
update_sched_clock();
hrtimer_cancel(&sched_clock_timer);
- cd.suspended = true;
+ rd->read_sched_clock = suspended_sched_clock_read;
+
return 0;
}
static void sched_clock_resume(void)
{
- cd.epoch_cyc = read_sched_clock();
+ struct clock_read_data *rd = &cd.read_data[0];
+
+ rd->epoch_cyc = cd.actual_read_sched_clock();
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
- cd.suspended = false;
+ rd->read_sched_clock = cd.actual_read_sched_clock;
}
static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+ .suspend = sched_clock_suspend,
+ .resume = sched_clock_resume,
};
static int __init sched_clock_syscore_init(void)
{
register_syscore_ops(&sched_clock_ops);
+
return 0;
}
device_initcall(sched_clock_syscore_init);
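
For context: the odd/even copy scheme in update_clock_read_data() above is a seqcount-latch pattern. Below is a generic standalone sketch (not part of this patch) of the same idea, with C11 atomics standing in for the kernel's raw_write_seqcount_latch()/raw_read_seqcount() primitives.

#include <stdatomic.h>
#include <stdint.h>

struct snapshot {
	uint64_t epoch_ns;
	uint64_t epoch_cyc;
};

static _Atomic unsigned int seq;
static struct snapshot data[2];

/* Writer: publish into the odd copy, bump the sequence, refresh the even
 * copy, bump again. Readers therefore always find a fully written copy. */
static void latch_write(const struct snapshot *val)
{
	data[1] = *val;
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);
	data[0] = *val;
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);
}

/* Reader: never blocks; picks copy (seq & 1) and retries if seq moved. */
static struct snapshot latch_read(void)
{
	struct snapshot snap;
	unsigned int start;

	do {
		start = atomic_load_explicit(&seq, memory_order_acquire);
		snap = data[start & 1];
	} while (atomic_load_explicit(&seq, memory_order_acquire) != start);

	return snap;
}

int main(void)
{
	struct snapshot s = { .epoch_ns = 1000, .epoch_cyc = 42 };

	latch_write(&s);
	s = latch_read();
	return s.epoch_cyc == 42 ? 0 : 1;
}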
diff --git a/kernel/ti