Merge branch 'perf/timer' into perf/core

This WIP branch is now ready to be merged. Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Ingo Molnar <mingo@kernel.org> 2015-04-02 13:22:35 +0200
committer: Ingo Molnar <mingo@kernel.org> 2015-04-02 13:22:35 +0200
commit: 223aa646d5aa6096230a9481e4c6c9cb67cfc4a6 (patch)
tree: 3e9013b2d6a099fabf4c5148dd9e522e04f8f073 /kernel
parent: aaa9fa387541a30f6f4138bea6e082319de5e3fe (diff)
parent: 34f439278cef7b1177f8ce24f9fc81dfc6221d3b (diff)
8 files changed, 669 insertions, 292 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bb1a7c36e794..c40c2cac2d8e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -327,6 +327,11 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+	return event->clock();
+}
+
 static inline struct perf_cpu_context *
 __get_cpu_context(struct perf_event_context *ctx)
 {
@@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME)
-		data->time = perf_clock();
+		data->time = perf_event_clock(event);
 
 	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
 		data->id = primary_event_id(event);
@@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event,
 	task_event->event_id.tid = perf_event_tid(event, task);
 	task_event->event_id.ptid = perf_event_tid(event, current);
 
+	task_event->event_id.time = perf_event_clock(event);
+
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task,
 			/* .ppid */
 			/* .tid  */
 			/* .ptid */
-			.time = perf_clock(),
+			/* .time */
 		},
 	};
 
@@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time		= perf_clock(),
+		.time		= perf_event_clock(event),
 		.id		= primary_event_id(event),
 		.stream_id	= event->id,
 	};
@@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event)
 static struct pmu perf_swevent = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= perf_swevent_init,
 	.add		= perf_swevent_add,
 	.del		= perf_swevent_del,
@@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 static struct pmu perf_cpu_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= cpu_clock_event_init,
 	.add		= cpu_clock_event_add,
 	.del		= cpu_clock_event_del,
@@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event)
 static struct pmu perf_task_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= task_clock_event_init,
 	.add		= task_clock_event_add,
 	.del		= task_clock_event_del,
@@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		event->hw.target = task;
 	}
 
+	event->clock = &local_clock;
+	if (parent_event)
+		event->clock = parent_event->clock;
+
 	if (!overflow_handler && parent_event) {
 		overflow_handler = parent_event->overflow_handler;
 		context = parent_event->overflow_handler_context;
@@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 		goto out;
 
+	/*
+	 * Mixing clocks in the same buffer is trouble you don't need.
+	 */
+	if (output_event->clock != event->clock)
+		goto out;
+
 set:
 	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
@@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
 }
 
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+	bool nmi_safe = false;
+
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		event->clock = &ktime_get_mono_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_MONOTONIC_RAW:
+		event->clock = &ktime_get_raw_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_REALTIME:
+		event->clock = &ktime_get_real_ns;
+		break;
+
+	case CLOCK_BOOTTIME:
+		event->clock = &ktime_get_boot_ns;
+		break;
+
+	case CLOCK_TAI:
+		event->clock = &ktime_get_tai_ns;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+		return -EINVAL;
+
+	return 0;
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
@@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	 */
 	pmu = event->pmu;
 
+	if (attr.use_clockid) {
+		err = perf_event_set_clock(event, attr.clockid);
+		if (err)
+			goto err_alloc;
+	}
+
 	if (group_leader &&
 	    (is_software_event(event) != is_software_event(group_leader))) {
 		if (is_software_event(event)) {
@@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		 */
 		if (group_leader->group_leader != group_leader)
 			goto err_context;
+
+		/* All events in a group should have the same clock */
+		if (group_leader->clock != event->clock)
+			goto err_context;
+
 		/*
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7ce18f3c097a..bcfe32088b37 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p,
 	/*
 	 * If there were no record hinting faults then either the task is
 	 * completely idle or all activity is areas that are not of interest
-	 * to automatic numa balancing. Scan slower
+	 * to automatic numa balancing. Related to that, if there were failed
+	 * migration then it implies we are migrating too quickly or the local
+	 * node is overloaded. In either case, scan slower
 	 */
-	if (local + shared == 0) {
+	if (local + shared == 0 || p->numa_faults_locality[2]) {
 		p->numa_scan_period = min(p->numa_scan_period_max,
 			p->numa_scan_period << 1);
 
@@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 	if (migrated)
 		p->numa_pages_migrated += pages;
+	if (flags & TNF_MIGRATE_FAIL)
+		p->numa_faults_locality[2] += pages;
 
 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 55449909f114..489642b08d64 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,6 +94,57 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 
+static int __clockevents_set_mode(struct clock_event_device *dev,
+				  enum clock_event_mode mode)
+{
+	/* Transition with legacy set_mode() callback */
+	if (dev->set_mode) {
+		/* Legacy callback doesn't support new modes */
+		if (mode > CLOCK_EVT_MODE_RESUME)
+			return -ENOSYS;
+		dev->set_mode(mode, dev);
+		return 0;
+	}
+
+	if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+		return 0;
+
+	/* Transition with new mode-specific callbacks */
+	switch (mode) {
+	case CLOCK_EVT_MODE_UNUSED:
+		/*
+		 * This is an internal state, which is guaranteed to go from
+		 * SHUTDOWN to UNUSED. No driver interaction required.
+		 */
+		return 0;
+
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		return dev->set_mode_shutdown(dev);
+
+	case CLOCK_EVT_MODE_PERIODIC:
+		/* Core internal bug */
+		if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC))
+			return -ENOSYS;
+		return dev->set_mode_periodic(dev);
+
+	case CLOCK_EVT_MODE_ONESHOT:
+		/* Core internal bug */
+		if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+			return -ENOSYS;
+		return dev->set_mode_oneshot(dev);
+
+	case CLOCK_EVT_MODE_RESUME:
+		/* Optional callback */
+		if (dev->set_mode_resume)
+			return dev->set_mode_resume(dev);
+		else
+			return 0;
+
+	default:
+		return -ENOSYS;
+	}
+}
+
 /**
  * clockevents_set_mode - set the operating mode of a clock event device
  * @dev:	device to modify
@@ -105,7 +156,9 @@ void clockevents_set_mode(struct clock_event_device *dev,
 				 enum clock_event_mode mode)
 {
 	if (dev->mode != mode) {
-		dev->set_mode(mode, dev);
+		if (__clockevents_set_mode(dev, mode))
+			return;
+
 		dev->mode = mode;
 
 		/*
@@ -373,6 +426,35 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
 }
 EXPORT_SYMBOL_GPL(clockevents_unbind);
 
+/* Sanity check of mode transition callbacks */
+static int clockevents_sanity_check(struct clock_event_device *dev)
+{
+	/* Legacy set_mode() callback */
+	if (dev->set_mode) {
+		/* We shouldn't be supporting new modes now */
+		WARN_ON(dev->set_mode_periodic || dev->set_mode_oneshot ||
+			dev->set_mode_shutdown || dev->set_mode_resume);
+		return 0;
+	}
+
+	if (dev->features & CLOCK_EVT_FEAT_DUMMY)
+		return 0;
+
+	/* New mode-specific callbacks */
+	if (!dev->set_mode_shutdown)
+		return -EINVAL;
+
+	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+	    !dev->set_mode_periodic)
+		return -EINVAL;
+
+	if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) &&
+	    !dev->set_mode_oneshot)
+		return -EINVAL;
+
+	return 0;
+}
+
 /**
  * clockevents_register_device - register a clock event device
  * @dev:	device to register
@@ -382,6 +464,8 @@ void clockevents_register_device(struct clock_event_device *dev)
 	unsigned long flags;
 
 	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+	BUG_ON(clockevents_sanity_check(dev));
+
 	if (!dev->cpumask) {
 		WARN_ON(num_possible_cpus() > 1);
 		dev->cpumask = cpumask_of(smp_processor_id());
@@ -449,7 +533,7 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 		return clockevents_program_event(dev, dev->next_event, false);
 
 	if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
-		dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev);
+		return __clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
 
 	return 0;
 }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4892352f0e49..c3be3c71bbad 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs)
 		schedule_work(&watchdog_work);
 }
 
-static void clocksource_unstable(struct clocksource *cs, int64_t delta)
-{
-	printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
-	       cs->name, delta);
-	__clocksource_unstable(cs);
-}
-
 /**
  * clocksource_mark_unstable - mark clocksource unstable via watchdog
  * @cs:		clocksource to be marked unstable
@@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
 static void clocksource_watchdog(unsigned long data)
 {
 	struct clocksource *cs;
-	cycle_t csnow, wdnow, delta;
+	cycle_t csnow, wdnow, cslast, wdlast, delta;
 	int64_t wd_nsec, cs_nsec;
 	int next_cpu, reset_pending;
 
@@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data)
 
 		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
 		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
+		wdlast = cs->wd_last; /* save these in case we print them */
+		cslast = cs->cs_last;
 		cs->cs_last = csnow;
 		cs->wd_last = wdnow;
 
@@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data)
 
 		/* Check the deviation from the watchdog clocksource. */
 		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
-			clocksource_unstable(cs, cs_nsec - wd_nsec);
+			pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
+			pr_warn("	'%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+				watchdog->name, wdnow, wdlast, watchdog->mask);
+			pr_warn("	'%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+				cs->name, csnow, cslast, cs->mask);
+			__clocksource_unstable(cs);
 			continue;
 		}
 
@@ -469,26 +469,22 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
  * @shift:	cycle to nanosecond divisor (power of two)
  * @maxadj:	maximum adjustment value to mult (~11%)
  * @mask:	bitmask for two's complement subtraction of non 64 bit counters
+ * @max_cyc:	maximum cycle value before potential overflow (does not include
+ *		any safety margin)
+ *
+ * NOTE: This function includes a safety margin of 50%, so that bad clock values
+ * can be detected.
  */
-u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
+u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
 {
 	u64 max_nsecs, max_cycles;
 
 	/*
 	 * Calculate the maximum number of cycles that we can pass to the
-	 * cyc2ns function without overflowing a 64-bit signed result. The
-	 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
-	 * which is equivalent to the below.
-	 * max_cycles < (2^63)/(mult + maxadj)
-	 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
-	 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
-	 * max_cycles < 2^(63 - log2(mult + maxadj))
-	 * max_cycles < 1 << (63 - log2(mult + maxadj))
-	 * Please note that we add 1 to the result of the log2 to account for
-	 * any rounding errors, ensure the above inequality is satisfied and
-	 * no overflow will occur.
+	 * cyc2ns() function without overflowing a 64-bit result.
 	 */
-	max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
+	max_cycles = ULLONG_MAX;
+	do_div(max_cycles, mult+maxadj);
 
 	/*
 	 * The actual maximum number of cycles we can defer the clocksource is
@@ -499,27 +495,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
 	max_cycles = min(max_cycles, mask);
 	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
 
+	/* return the max_cycles value as well if requested */
+	if (max_cyc)
+		*max_cyc = max_cycles;
+
+	/* Return 50% of the actual maximum, so we can detect bad values */
+	max_nsecs >>= 1;
+
 	return max_nsecs;
 }
 
 /**
- * clocksource_max_deferment - Returns max time the clocksource can be deferred
- * @cs:         Pointer to clocksource
+ * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
+ * @cs:         Pointer to clocksource to be updated
  *
  */
-static u64 clocksource_max_deferment(struct clocksource *cs)
+static inline void clocksource_update_max_deferment(struct clocksource *cs)
 {
-	u64 max_nsecs;
-
-	max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
-					  cs->mask);
-	/*
-	 * To ensure that the clocksource does not wrap whilst we are idle,
-	 * limit the time the clocksource can be deferred by 12.5%. Please
-	 * note a margin of 12.5% is used because this can be computed with
-	 * a shift, versus say 10% which would require division.
-	 */
-	return max_nsecs - (max_nsecs >> 3);
+	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
+						cs->maxadj, cs->mask,
+						&cs->max_cycles);
 }
 
 #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -648,7 +643,7 @@ static void clocksource_enqueue(struct clocksource *cs)
 }
 
 /**
- * __clocksource_updatefreq_scale - Used update clocksource with new freq
+ * __clocksource_update_freq_scale - Used update clocksource with new freq
  * @cs:		clocksource to be registered
  * @scale:	Scale factor multiplied against freq to get clocksource hz
  * @freq:	clocksource frequency (cycles per second) divided by scale
@@ -656,48 +651,64 @@ static void clocksource_enqueue(struct clocksource *cs)
  * This should only be called from the clocksource->enable() method.
  *
  * This *SHOULD NOT* be called directly! Please use the
- * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
+ * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
+ * functions.
  */
-void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
+void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
 	u64 sec;
+
 	/*
-	 * Calc the maximum number of seconds which we can run before
-	 * wrapping around. For clocksources which have a mask > 32bit
-	 * we need to limit the max sleep time to have a good
-	 * conversion precision. 10 minutes is still a reasonable
-	 * amount. That results in a shift value of 24 for a
-	 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
-	 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
-	 * margin as we do in clocksource_max_deferment()
+	 * Default clocksources are *special* and self-define their mult/shift.
+	 * But, you're not special, so you should specify a freq value.
 	 */
-	sec = (cs->mask - (cs->mask >> 3));
-	do_div(sec, freq);
-	do_div(sec, scale);
-	if (!sec)
-		sec = 1;
-	else if (sec > 600 && cs->mask > UINT_MAX)
-		sec = 600;
-
-	clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
-			       NSEC_PER_SEC / scale, sec * scale);
-
+	if (freq) {
+		/*
+		 * Calc the maximum number of seconds which we can run before
+		 * wrapping around. For clocksources which have a mask > 32-bit
+		 * we need to limit the max sleep time to have a good
+		 * conversion precision. 10 minutes is still a reasonable
+		 * amount. That results in a shift value of 24 for a
+		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
+		 * ~ 0.06ppm granularity for NTP.
+		 */
+		sec = cs->mask;
+		do_div(sec, freq);
+		do_div(sec, scale);
+		if (!sec)
+			sec = 1;
+		else if (sec > 600 && cs->mask > UINT_MAX)
+			sec = 600;
+
+		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+				       NSEC_PER_SEC / scale, sec * scale);
+	}
 	/*
-	 * for clocksources that have large mults, to avoid overflow.
-	 * Since mult may be adjusted by ntp, add an safety extra margin
-	 *
+	 * Ensure clocksources that have large 'mult' values don't overflow
+	 * when adjusted.
 	 */
 	cs->maxadj = clocksource_max_adjustment(cs);
-	while ((cs->mult + cs->maxadj < cs->mult)
-		|| (cs->mult - cs->maxadj > cs->mult)) {
+	while (freq && ((cs->mult + cs->maxadj < cs->mult)
+		|| (cs->mult - cs->maxadj > cs->mult))) {
 		cs->mult >>= 1;
 		cs->shift--;
 		cs->maxadj = clocksource_max_adjustment(cs);
 	}
 
-	cs->max_idle_ns = clocksource_max_deferment(cs);
+	/*
+	 * Only warn for *special* clocksources that self-define
+	 * their mult/shift values and don't specify a freq.
+	 */
+	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+		"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
+		cs->name);
+
+	clocksource_update_max_deferment(cs);
+
+	pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+			cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
 }
-EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
 
 /**
  * __clocksource_register_scale - Used to install new clocksources
@@ -714,7 +725,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
 
 	/* Initialize mult/shift and max_idle_ns */
-	__clocksource_updatefreq_scale(cs, scale, freq);
+	__clocksource_update_freq_scale(cs, scale, freq);
 
 	/* Add clocksource to the clocksource list */
 	mutex_lock(&clocksource_mutex);
@@ -726,33 +737,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 }
 EXPORT_SYMBOL_GPL(__clocksource_register_scale);
 
-
-/**
- * clocksource_register - Used to install new clocksources
- * @cs:		clocksource to be registered
- *
- * Returns -EBUSY if registration fails, zero otherwise.
- */
-int clocksource_register(struct clocksource *cs)
-{
-	/* calculate max adjustment for given mult/shift */
-	cs->maxadj = clocksource_max_adjustment(cs);
-	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
-		"Clocksource %s might overflow on 11%% adjustment\n",
-		cs->name);
-
-	/* calculate max idle time permitted for this clocksource */
-	cs->max_idle_ns = clocksource_max_deferment(cs);
-
-	mutex_lock(&clocksource_mutex);
-	clocksource_enqueue(cs);
-	clocksource_enqueue_watchdog(cs);
-	clocksource_select();
-	mutex_unlock(&clocksource_mutex);
-	return 0;
-}
-EXPORT_SYMBOL(clocksource_register);
-
 static void __clocksource_change_rating(struct clocksource *cs, int rating)
 {
 	list_del(&cs->list);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a6a5bf53e86d..c4bb518725b5 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = {
 	.mask		= 0xffffffff, /*32bits*/
 	.mult		= NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
 	.shift		= JIFFIES_SHIFT,
+	.max_cycles	= 10,
 };
 
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
@@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies);
 
 static int __init init_jiffies_clocksource(void)
 {
-	return clocksource_register(&clocksource_jiffies);
+	return __clocksource_register(&clocksource_jiffies);
 }
 
 core_initcall(init_jiffies_clocksource);
@@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second)
 
 	refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
 
-	clocksource_register(&refined_jiffies);
+	__clocksource_register(&refined_jiffies);
 	return 0;
 }
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a26036d37a38 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -1,5 +1,6 @@
 /*
- * sched_clock.c: support for extending counters to full 64-bit ns counter
+ * sched_clock.c: Generic sched_clock() support, to extend low level
+ *                hardware time counters to full 64-bit ns values.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -18,15 +19,53 @@
 #include <linux/seqlock.h>
 #include <linux/bitops.h>
 
-struct clock_data {
-	ktime_t wrap_kt;
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns:		sched_clock() value at last update
+ * @epoch_cyc:		Clock cycle value at last update.
+ * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
+ *			clocks.
+ * @read_sched_clock:	Current clock source (or dummy source when suspended).
+ * @mult:		Multipler for scaled math conversion.
+ * @shift:		Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
 	u64 epoch_ns;
 	u64 epoch_cyc;
-	seqcount_t seq;
-	unsigned long rate;
+	u64 sched_clock_mask;
+	u64 (*read_sched_clock)(void);
 	u32 mult;
 	u32 shift;
-	bool suspended;
+};
+
+/**
+ * struct clock_data - all data needed for sched_clock() (including
+ *                     registration of a new clock source)
+ *
+ * @seq:		Sequence counter for protecting updates. The lowest
+ *			bit is the index for @read_data.
+ * @read_data:		Data required to read from sched_clock.
+ * @wrap_kt:		Duration for which clock can run before wrapping.
+ * @rate:		Tick rate of the registered clock.
+ * @actual_read_sched_clock: Registered hardware level clock read function.
+ *
+ * The ordering of this structure has been chosen to optimize cache
+ * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
+ * into a single 64-byte cache line.
+ */
+struct clock_data {
+	seqcount_t		seq;
+	struct clock_read_data	read_data[2];
+	ktime_t			wrap_kt;
+	unsigned long		rate;
+
+	u64 (*actual_read_sched_clock)(void);
 };
 
 static struct hrtimer sched_clock_timer;
@@ -34,12 +73,6 @@ static int irqtime = -1;
 
 core_param(irqtime, irqtime, int, 0400);
 
-static struct clock_data cd = {
-	.mult	= NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
 static u64 notrace jiffy_sched_clock_read(void)
 {
 	/*
@@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void)
 	return (u64)(jiffies - INITIAL_JIFFIES);
 }
 
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd ____cacheline_aligned = {
+	.read_data[0] = { .mult = NSEC_PER_SEC / HZ,
+			  .read_sched_clock = jiffy_sched_clock_read, },
+	.actual_read_sched_clock = jiffy_sched_clock_read,
+};
 
 static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 {
@@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 
 unsigned long long notrace sched_clock(void)
 {
-	u64 epoch_ns;
-	u64 epoch_cyc;
-	u64 cyc;
+	u64 cyc, res;
 	unsigned long seq;
-
-	if (cd.suspended)
-		return cd.epoch_ns;
+	struct clock_read_data *rd;
 
 	do {
-		seq = raw_read_seqcount_begin(&cd.seq);
-		epoch_cyc = cd.epoch_cyc;
-		epoch_ns = cd.epoch_ns;
+		seq = raw_read_seqcount(&cd.seq);
+		rd = cd.read_data + (seq & 1);
+
+		cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
+		      rd->sched_clock_mask;
+		res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
 	} while (read_seqcount_retry(&cd.seq, seq));
 
-	cyc = read_sched_clock();
-	cyc = (cyc - epoch_cyc) & sched_clock_mask;
-	return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+	return res;
+}
+
+/*
+ * Updating the data required to read the clock.
+ *
+ * sched_clock() will never observe mis-matched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock() to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock() as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ */
+static void update_clock_read_data(struct clock_read_data *rd)
+{
+	/* update the backup (odd) copy with the new data */
+	cd.read_data[1] = *rd;
+
+	/* steer readers towards the odd copy */
+	raw_write_seqcount_latch(&cd.seq);
+
+	/* now its safe for us to update the normal (even) copy */
+	cd.read_data[0] = *rd;
+
+	/* switch readers back to the even copy */
+	raw_write_seqcount_latch(&cd.seq);
 }
 
 /*
- * Atomically update the sched_clock epoch.
+ * Atomically update the sched_clock() epoch.
  */
-static void notrace update_sched_clock(void)
+static void update_sched_clock(void)
 {
-	unsigned long flags;
 	u64 cyc;
 	u64 ns;
+	struct clock_read_data rd;
+
+	rd = cd.read_data[0];
+
+	cyc = cd.actual_read_sched_clock();
+	ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+
+	rd.epoch_ns = ns;
+	rd.epoch_cyc = cyc;
 
-	cyc = read_sched_clock();
-	ns = cd.epoch_ns +
-		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-			  cd.mult, cd.shift);
-
-	raw_local_irq_save(flags);
-	raw_write_seqcount_begin(&cd.seq);
-	cd.epoch_ns = ns;
-	cd.epoch_cyc = cyc;
-	raw_write_seqcount_end(&cd.seq);
-	raw_local_irq_restore(flags);
+	update_clock_read_data(&rd);
 }
 
 static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
 {
 	update_sched_clock();
 	hrtimer_forward_now(hrt, cd.wrap_kt);
+
 	return HRTIMER_RESTART;
 }
 
-void __init sched_clock_register(u64 (*read)(void), int bits,
-				 unsigned long rate)
+void __init
+sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
 {
 	u64 res, wrap, new_mask, new_epoch, cyc, ns;
 	u32 new_mult, new_shift;
-	ktime_t new_wrap_kt;
 	unsigned long r;
 	char r_unit;
+	struct clock_read_data rd;
 
 	if (cd.rate > rate)
 		return;
 
 	WARN_ON(!irqs_disabled());
 
-	/* calculate the mult/shift to convert counter ticks to ns. */
+	/* Calculate the mult/shift to convert counter ticks to ns. */
 	clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
 
 	new_mask = CLOCKSOURCE_MASK(bits);
+	cd.rate = rate;
+
+	/* Calculate how many nanosecs until we risk wrapping */
+	wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
+	cd.wrap_kt = ns_to_ktime(wrap);
 
-	/* calculate how many ns until we wrap */
-	wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
-	new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+	rd = cd.read_data[0];
 
-	/* update epoch for new counter and update epoch_ns from old counter*/
+	/* Update epoch for new counter and update 'epoch_ns' from old counter*/
 	new_epoch = read();
-	cyc = read_sched_clock();
-	ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-			  cd.mult, cd.shift);
+	cyc = cd.actual_read_sched_clock();
+	ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
+	cd.actual_read_sched_clock = read;
 
-	raw_write_seqcount_begin(&cd.seq);
-	read_sched_clock = read;
-	sched_clock_mask = new_mask;
-	cd.rate = rate;
-	cd.wrap_kt = new_wrap_kt;
-	cd.mult = new_mult;
-	cd.shift = new_shift;
-	cd.epoch_cyc = new_epoch;
-	cd.epoch_ns = ns;
-	raw_write_seqcount_end(&cd.seq);
+	rd.read_sched_clock	= read;
+	rd.sched_clock_mask	= new_mask;
+	rd.mult			= new_mult;
+	rd.shift		= new_shift;
+	rd.epoch_cyc		= new_epoch;
+	rd.epoch_ns		= ns;
+
+	update_clock_read_data(&rd);
 
 	r = rate;
 	if (r >= 4000000) {
 		r /= 1000000;
 		r_unit = 'M';
-	} else if (r >= 1000) {
-		r /= 1000;
-		r_unit = 'k';
-	} else
-		r_unit = ' ';
-
-	/* calculate the ns resolution of this counter */
+	} else {
+		if (r >= 1000) {
+			r /= 1000;
+			r_unit = 'k';
+		} else {
+			r_unit = ' ';
+		}
+	}
+
+	/* Calculate the ns resolution of this counter */
 	res = cyc_to_ns(1ULL, new_mult, new_shift);
 
 	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
 		bits, r, r_unit, res, wrap);
 
-	/* Enable IRQ time accounting if we have a fast enough sched_clock */
+	/* Enable IRQ time accounting if we have a fast enough sched_clock() */
 	if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
 		enable_sched_clock_irqtime();
 
@@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
 void __init sched_clock_postinit(void)
 {
 	/*
-	 * If no sched_clock function has been provided at that point,
+	 * If no sched_clock() function has been provided at that point,
 	 * make it the final one one.
 	 */
-	if (read_sched_clock == jiffy_sched_clock_read)
+	if (cd.actual_read_sched_clock == jiffy_sched_clock_read)
 		sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
 
 	update_sched_clock();
@@ -189,29 +251,53 @@ void __init sched_clock_postinit(void)
 	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
 }
 
+/*
+ * Clock read function for use when the clock is suspended.
+ *
+ * This function makes it appear to sched_clock() as if the clock
+ * stopped counting at its last update.
+ *
+ * This function must only be called from the critical
+ * section in sched_clock(). It relies on the read_seqcount_retry()
+ * at the end of the critical section to be sure we observe the
+ * correct copy of 'epoch_cyc'.
+ */
+static u64 notrace suspended_sched_clock_read(void)
+{
+	unsigned long seq = raw_read_seqcount(&cd.seq);
+
+	return cd.read_data[seq & 1].epoch_cyc;
+}
+
 static int sched_clock_suspend(void)
 {
+	struct clock_read_data *rd = &cd.read_data[0];
+
 	update_sched_clock();
 	hrtimer_cancel(&sched_clock_timer);
-	cd.suspended = true;
+	rd->read_sched_clock = suspended_sched_clock_read;
+
 	return 0;
 }
 
 static void sched_clock_resume(void)
 {
-	cd.epoch_cyc = read_sched_clock();
+	struct clock_read_data *rd = &cd.read_data[0];
+
+	rd->epoch_cyc = cd.actual_read_sched_clock();
 	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
-	cd.suspended = false;
+	rd->read_sched_clock = cd.actual_read_sched_clock;
 }
 
 static struct syscore_ops sched_clock_ops = {
-	.suspend = sched_clock_suspend,
-	.resume = sched_clock_resume,
+	.suspend	= sched_clock_suspend,
+	.resume		= sched_clock_resume,
 };
 
 static int __init sched_clock_syscore_init(void)
 {
 	register_syscore_ops(&sched_clock_ops);
+
 	return 0;
 }
 device_initcall(sched_clock_syscore_init);
diff --git a/kernel/ti
author	Ingo Molnar <mingo@kernel.org>	2015-04-02 13:22:35 +0200
committer	Ingo Molnar <mingo@kernel.org>	2015-04-02 13:22:35 +0200
commit	223aa646d5aa6096230a9481e4c6c9cb67cfc4a6 (patch)
tree	3e9013b2d6a099fabf4c5148dd9e522e04f8f073 /kernel
parent	aaa9fa387541a30f6f4138bea6e082319de5e3fe (diff)
parent	34f439278cef7b1177f8ce24f9fc81dfc6221d3b (diff)