From 8a8cde163ea724baf74e7752a31a69d3121a240e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:28 +0200 Subject: sched: rt: dont stop the period timer when there are tasks wanting to run So if the group ever gets throttled, it will never wake up again. Reported-by: "Daniel K." Signed-off-by: Peter Zijlstra Tested-by: Daniel K. Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1dad5bbb59b6..0f3c19197fa4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; spin_unlock(&rt_rq->rt_runtime_lock); - } + } else if (rt_rq->rt_nr_running) + idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); -- cgit v1.2.3 From bb10ed0994927d433f6dbdf274fdb26cfcf516b7 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 19 Jun 2008 15:04:07 -0700 Subject: sched: fix wait_for_completion_timeout() spurious failure under heavy load It seems that the current implementaton of wait_for_completion_timeout() has a small problem under very high load for the common pattern: if (!wait_for_completion_timeout(&done, timeout)) /* handle failure */ because the implementation very roughly does (lots of code deleted to show the basic flow): static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { if (x->done) return timeout; do { timeout = schedule_timeout(timeout); if (!timeout) return timeout; } while (!x->done); return timeout; } so if the system is very busy and x->done is not set when do_wait_for_common() is entered, it is possible that the first call to schedule_timeout() returns 0 because the task doing wait_for_completion doesn't get rescheduled for a long time, even if it is woken up early enough. In this case, wait_for_completion_timeout() returns 0 without even checking x->done again, and the code above falls into its failure case purely for scheduler reasons, even if the hardware event or whatever was being waited for happened early enough. It would make sense to add an extra test to do_wait_for() in the timeout case and return 1 if x->done is actually set. A quick audit (not exhaustive) of wait_for_completion_timeout() callers seems to indicate that no one actually cares about the return value in the success case -- they just test for 0 (timed out) versus non-zero (wait succeeded). Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 4a3cb0614158..577f160131bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4405,6 +4405,16 @@ do_wait_for_common(struct completion *x, long timeout, int state) spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); + + /* + * If the completion has arrived meanwhile + * then return 1 jiffy time left: + */ + if (x->done && !timeout) { + timeout = 1; + break; + } + if (!timeout) { __remove_wait_queue(&x->wait, &wait); return timeout; -- cgit v1.2.3 From 54481cf88bc59923ea30f2ca345a73c60155e901 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Jun 2008 09:41:22 -0700 Subject: x86: fix NULL pointer deref in __switch_to MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I am able to reproduce the oops reported by Simon in __switch_to() with lguest. My debug showed that there is at least one lguest specific issue (which should be present in 2.6.25 and before aswell) and it got exposed with a kernel oops with the recent fpu dynamic allocation patches. In addition to the previous possible scenario (with fpu_counter), in the presence of lguest, it is possible that the cpu's TS bit it still set and the lguest launcher task's thread_info has TS_USEDFPU still set. This is because of the way the lguest launcher handling the guest's TS bit. (look at lguest_set_ts() in lguest_arch_run_guest()). This can result in a DNA fault while doing unlazy_fpu() in __switch_to(). This will end up causing a DNA fault in the context of new process thats getting context switched in (as opossed to handling DNA fault in the context of lguest launcher/helper process). This is wrong in both pre and post 2.6.25 kernels. In the recent 2.6.26-rc series, this is showing up as NULL pointer dereferences or sleeping function called from atomic context(__switch_to()), as we free and dynamically allocate the FPU context for the newly created threads. Older kernels might show some FPU corruption for processes running inside of lguest. With the appended patch, my test system is running for more than 50 mins now. So atleast some of your oops (hopefully all!) should get fixed. Please give it a try. I will spend more time with this fix tomorrow. Reported-by: Simon Holm Thøgersen Reported-by: Patrick McHardy Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- drivers/lguest/x86/core.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 5126d5d9ea0e..2e554a4ab337 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -176,7 +176,7 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * we set it now, so we can trap and pass that trap to the Guest if it * uses the FPU. */ if (cpu->ts) - lguest_set_ts(); + unlazy_fpu(current); /* SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest @@ -196,6 +196,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * trap made the switcher code come back, and an error code which some * traps set. */ + /* Restore SYSENTER if it's supposed to be on. */ + if (boot_cpu_has(X86_FEATURE_SEP)) + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + /* If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite @@ -203,13 +207,12 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); /* Similarly, if we took a trap because the Guest used the FPU, - * we have to restore the FPU it expects to see. */ + * we have to restore the FPU it expects to see. + * math_state_restore() may sleep and we may even move off to + * a different CPU. So all the critical stuff should be done + * before this. */ else if (cpu->regs->trapnum == 7) math_state_restore(); - - /* Restore SYSENTER if it's supposed to be on. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); } /*H:130 Now we've examined the hypercall code; our Guest can make requests. -- cgit v1.2.3 From 46539383791a0e59a4af7412056dfbfc5240af0a Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Mon, 16 Jun 2008 14:58:13 -0700 Subject: xen: Use wmb instead of rmb in xen_evtchn_do_upcall(). This patch is ported one from 534:77db69c38249 of linux-2.6.18-xen.hg. Use wmb instead of rmb to enforce ordering between evtchn_upcall_pending and evtchn_pending_sel stores in xen_evtchn_do_upcall(). Cc: Samuel Thibault Signed-off-by: Isaku Yamahata Cc: Nick Piggin Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4f0f22b020ea..76e5b7386af9 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -529,7 +529,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ /* Clear master flag /before/ clearing selector flag. */ - rmb(); + wmb(); #endif pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); while (pending_words != 0) { -- cgit v1.2.3 From 05345b0f006ac226d0d25d48fcb2d792ac44a071 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 15:01:53 -0700 Subject: xen: mask unwanted pte bits in __supported_pte_mask [ Stable: this isn't a bugfix in itself, but it's a pre-requiste for "xen: don't drop NX bit" ] Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 +++++ arch/x86/xen/mmu.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c8a56e457d61..c048de34d6a1 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1228,6 +1228,11 @@ asmlinkage void __init xen_start_kernel(void) if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + /* set the limit of our address space */ xen_reserve_top(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3525ef523a74..3f2a67fe6ad6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -199,10 +199,8 @@ pgdval_t xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) { + if (pte & _PAGE_PRESENT) pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } return (pte_t){ .pte = pte }; } -- cgit v1.2.3 From ebb9cfe20fe167f29960a5e913193a684fac50bf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Jun 2008 15:01:56 -0700 Subject: xen: don't drop NX bit Because NX is now enforced properly, we must put the hypercall page into the .text segment so that it is executable. Signed-off-by: Jeremy Fitzhardinge Cc: Stable Kernel Cc: the arch/x86 maintainers Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 54 +++++++++++++++++++++++++++---------------------- arch/x86/xen/xen-head.S | 2 +- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f2a67fe6ad6..265601d5a6ae 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -179,46 +179,54 @@ out: preempt_enable(); } -pteval_t xen_pte_val(pte_t pte) +/* Assume pteval_t is equivalent to all the other *val_t types. */ +static pteval_t pte_mfn_to_pfn(pteval_t val) { - pteval_t ret = pte.pte; + if (val & _PAGE_PRESENT) { + unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; + } - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return val; +} - return ret; +static pteval_t pte_pfn_to_mfn(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_MASK; + val = (pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + } + + return val; +} + +pteval_t xen_pte_val(pte_t pte) +{ + return pte_mfn_to_pfn(pte.pte); } pgdval_t xen_pgd_val(pgd_t pgd) { - pgdval_t ret = pgd.pgd; - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pgd.pgd); } pte_t xen_make_pte(pteval_t pte) { - if (pte & _PAGE_PRESENT) - pte = phys_to_machine(XPADDR(pte)).maddr; - - return (pte_t){ .pte = pte }; + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } pgd_t xen_make_pgd(pgdval_t pgd) { - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + pgd = pte_pfn_to_mfn(pgd); + return native_make_pgd(pgd); } pmdval_t xen_pmd_val(pmd_t pmd) { - pmdval_t ret = native_pmd_val(pmd); - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; - return ret; + return pte_mfn_to_pfn(pmd.pmd); } #ifdef CONFIG_X86_PAE void xen_set_pud(pud_t *ptr, pud_t val) @@ -265,9 +273,7 @@ void xen_pmd_clear(pmd_t *pmdp) pmd_t xen_make_pmd(pmdval_t pmd) { - if (pmd & _PAGE_PRESENT) - pmd = phys_to_machine(XPADDR(pmd)).maddr; - + pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } #else /* !PAE */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 288d587ce73c..3175e973fd0d 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -17,7 +17,7 @@ ENTRY(startup_xen) __FINIT -.pushsection .bss.page_aligned +.pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) .skip 0x1000 -- cgit v1.2.3 From ea71a546706dfdad72462624394e1e472c6bf34f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 20 Jun 2008 18:32:20 +0400 Subject: sched: refactor wait_for_completion_timeout() Simplify the code and fix the boundary condition of wait_for_completion_timeout(,0). We can kill the first __remove_wait_queue() as well. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra --- kernel/sched.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 577f160131bd..bebf9788f45e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4398,32 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) signal_pending(current)) || (state == TASK_KILLABLE && fatal_signal_pending(current))) { - __remove_wait_queue(&x->wait, &wait); - return -ERESTARTSYS; + timeout = -ERESTARTSYS; + break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); - - /* - * If the completion has arrived meanwhile - * then return 1 jiffy time left: - */ - if (x->done && !timeout) { - timeout = 1; - break; - } - - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - return timeout; - } - } while (!x->done); + } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; } x->done--; - return timeout; + return timeout ?: 1; } static long __sched -- cgit v1.2.3 From 55d8538498f62ec72b5ba67aa386c7726f630475 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 Jun 2008 12:23:15 -0700 Subject: Fix performance regression on lmbench select benchmark Christian Borntraeger reported that reinstating cond_resched() with CONFIG_PREEMPT caused a performance regression on lmbench: For example select file 500: 23 microseconds 32 microseconds and that's really because we totally unnecessarily do the cond_resched() in the innermost loop of select(), which is just silly. This moves it out from the innermost loop (which only ever loops ove the bits in a single "unsigned long" anyway), which makes the performance regression go away. Reported-and-tested-by: Christian Borntraeger Signed-off-by: Linus Torvalds --- fs/select.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/select.c b/fs/select.c index 8dda969614a9..da0e88201c3a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) retval++; } } - cond_resched(); } if (res_in) *rinp = res_in; @@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) *routp = res_out; if (res_ex) *rexp = res_ex; + cond_resched(); } wait = NULL; if (retval || !*timeout || signal_pending(current)) -- cgit v1.2.3 From 44e051773da465f8c92127914bc784770e0e2a28 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 23 Jun 2008 11:54:05 +0200 Subject: ALSA: aw2 - Fix Oops at initialization The irq handler may be called before the proper initialization of hardware. Call snd_aw2_saa7146_setup() before the irq handler registration. Signed-off-by: Takashi Iwai --- sound/pci/aw2/aw2-alsa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/aw2/aw2-alsa.c b/sound/pci/aw2/aw2-alsa.c index 56f87cd33c19..3f00ddf450f8 100644 --- a/sound/pci/aw2/aw2-alsa.c +++ b/sound/pci/aw2/aw2-alsa.c @@ -316,6 +316,8 @@ static int __devinit snd_aw2_create(struct snd_card *card, return -ENOMEM; } + /* (2) initialization of the chip hardware */ + snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt); if (request_irq(pci->irq, snd_aw2_saa7146_interrupt, IRQF_SHARED, "Audiowerk2", chip)) { @@ -329,8 +331,6 @@ static int __devinit snd_aw2_create(struct snd_card *card, } chip->irq = pci->irq; - /* (2) initialization of the chip hardware */ - snd_aw2_saa7146_setup(&chip->saa7146, chip->iobase_virt); err = snd_device_new(card, SNDRV_DEV_LOWLEVEL, chip, &ops); if (err < 0) { free_irq(chip->irq, (void *)chip); -- cgit v1.2.3 From 3e14b50dd4a3178f4f635267a2706b5d4f8c61ee Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 23 Jun 2008 11:58:06 +0200 Subject: ALSA: sb - Fix wrong assertions snd_assert() in save_mixer() and restore_mixer() in sb_mixer.c is just wrong. The debug code wasn't tested at all, obviously... Signed-off-by: Takashi Iwai --- sound/isa/sb/sb_mixer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/isa/sb/sb_mixer.c b/sound/isa/sb/sb_mixer.c index 91d14224f6b3..73d4572d136b 100644 --- a/sound/isa/sb/sb_mixer.c +++ b/sound/isa/sb/sb_mixer.c @@ -925,7 +925,7 @@ static unsigned char als4000_saved_regs[] = { static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) { unsigned char *val = chip->saved_regs; - snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); + snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return); for (; num_regs; num_regs--) *val++ = snd_sbmixer_read(chip, *regs++); } @@ -933,7 +933,7 @@ static void save_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) static void restore_mixer(struct snd_sb *chip, unsigned char *regs, int num_regs) { unsigned char *val = chip->saved_regs; - snd_assert(num_regs > ARRAY_SIZE(chip->saved_regs), return); + snd_assert(num_regs <= ARRAY_SIZE(chip->saved_regs), return); for (; num_regs; num_regs--) snd_sbmixer_write(chip, *regs++, *val++); } -- cgit v1.2.3 From 1b7558e457ed0de61023cfc913d2c342c7c3d9f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 23 Jun 2008 11:21:58 +0200 Subject: futexes: fix fault handling in futex_lock_pi This patch addresses a very sporadic pi-futex related failure in highly threaded java apps on large SMP systems. David Holmes reported that the pi_state consistency check in lookup_pi_state triggered with his test application. This means that the kernel internal pi_state and the user space futex variable are out of sync. First we assumed that this is a user space data corruption, but deeper investigation revieled that the problem happend because the pi-futex code is not handling a fault in the futex_lock_pi path when the user space variable needs to be fixed up. The fault happens when a fork mapped the anon memory which contains the futex readonly for COW or the page got swapped out exactly between the unlock of the futex and the return of either the new futex owner or the task which was the expected owner but failed to acquire the kernel internal rtmutex. The current futex_lock_pi() code drops out with an inconsistent in case it faults and returns -EFAULT to user space. User space has no way to fixup that state. When we wrote this code we thought that we could not drop the hash bucket lock at this point to handle the fault. After analysing the code again it turned out to be wrong because there are only two tasks involved which might modify the pi_state and the user space variable: - the task which acquired the rtmutex - the pending owner of the pi_state which did not get the rtmutex Both tasks drop into the fixup_pi_state() function before returning to user space. The first task which acquired the hash bucket lock faults in the fixup of the user space variable, drops the spinlock and calls futex_handle_fault() to fault in the page. Now the second task could acquire the hash bucket lock and tries to fixup the user space variable as well. It either faults as well or it succeeds because the first task already faulted the page in. One caveat is to avoid a double fixup. After returning from the fault handling we reacquire the hash bucket lock and check whether the pi_state owner has been modified already. Reported-by: David Holmes Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: David Holmes Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar kernel/futex.c | 93 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 20 deletions(-) --- kernel/futex.c | 93 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 20 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 449def8074fe..7d1136e97c14 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *newowner, + struct rw_semaphore *fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. This might be observed in the PID check + * in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + while (1) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (curval == -EFAULT) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ if (pi_state->owner != NULL) { spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); spin_unlock_irq(&pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; + } pi_state->owner = newowner; @@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); spin_unlock_irq(&newowner->pi_lock); + return 0; /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. */ - ret = get_futex_value_locked(&uval, uaddr); +handle_fault: + spin_unlock(q->lock_ptr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + spin_lock(q->lock_ptr); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - return ret; + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* @@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released @@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, int res; owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); /* propagate -EFAULT, if the fixup failed */ if (res) -- cgit v1.2.3 From 87afd448b186c885d67a08b7417cd46253b6a9d6 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Mon, 23 Jun 2008 09:29:58 -0700 Subject: IB/mthca: Clear ICM pages before handing to FW Current memfree FW has a bug which in some cases, assumes that ICM pages passed to it are cleared. This patch uses __GFP_ZERO to allocate all ICM pages passed to the FW. Once firmware with a fix is released, we can make the workaround conditional on firmware version. This fixes the bug reported by Arthur Kepner here: http://lists.openfabrics.org/pipermail/general/2008-May/050026.html Cc: Signed-off-by: Eli Cohen [ Rewritten to be a one-liner using __GFP_ZERO instead of vmap()ing ICM memory and memset()ing it to 0. - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mthca/mthca_memfree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index b224079d4e1f..d5862e5d99a0 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -109,7 +109,11 @@ static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_m { struct page *page; - page = alloc_pages(gfp_mask, order); + /* + * Use __GFP_ZERO because buggy firmware assumes ICM pages are + * cleared, and subtle failures are seen if they aren't. + */ + page = alloc_pages(gfp_mask | __GFP_ZERO, order); if (!page) return -ENOMEM; -- cgit v1.2.3 From 36c7343b4ecac2432430f5393314f1bdc2c219a5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 23 Jun 2008 12:06:52 +0100 Subject: tty_driver: Update required method documentation Some of the requirement rules are now more relaxed. Also correct a contradiction in the previous update Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- include/linux/tty_driver.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 59f1c0bd8f9c..d2a003586761 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -27,8 +27,7 @@ * This routine is called by the kernel to write a series of * characters to the tty device. The characters may come from * user space or kernel space. This routine will return the - * number of characters actually accepted for writing. This - * routine is mandatory. + * number of characters actually accepted for writing. * * Optional: Required for writable devices. * @@ -134,7 +133,7 @@ * This routine notifies the tty driver that it should hangup the * tty device. * - * Required: + * Optional: * * void (*break_ctl)(struct tty_stuct *tty, int state); * -- cgit v1.2.3 From 96a331b1d6426726c37242ddbe939ee14b255790 Mon Sep 17 00:00:00 2001 From: Gustavo Fernando Padovan Date: Mon, 23 Jun 2008 12:07:06 +0100 Subject: removed unused var real_tty on n_tty_ioctl() I noted that the 'struct tty_struct *real_tty' is not used in this function, so I removed the code about 'real_tty'. Signed-off-by: Gustavo Fernando Padovan Acked-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/tty_ioctl.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index b1a757a5ee27..8f81139d6194 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -981,16 +981,9 @@ EXPORT_SYMBOL_GPL(tty_perform_flush); int n_tty_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) { - struct tty_struct *real_tty; unsigned long flags; int retval; - if (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER) - real_tty = tty->link; - else - real_tty = tty; - switch (cmd) { case TCXONC: retval = tty_check_change(tty); -- cgit v1.2.3 From 672ca28e300c17bf8d792a2a7a8631193e580c74 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 23 Jun 2008 11:21:37 -0700 Subject: Fix ZERO_PAGE breakage with vmware Commit 89f5b7da2a6bad2e84670422ab8192382a5aeb9f ("Reinstate ZERO_PAGE optimization in 'get_user_pages()' and fix XIP") broke vmware, as reported by Jeff Chua: "This broke vmware 6.0.4. Jun 22 14:53:03.845: vmx| NOT_IMPLEMENTED /build/mts/release/bora-93057/bora/vmx/main/vmmonPosix.c:774" and the reason seems to be that there's an old bug in how we handle do FOLL_ANON on VM_SHARED areas in get_user_pages(), but since it only triggered if the whole page table was missing, nobody had apparently hit it before. The recent changes to 'follow_page()' made the FOLL_ANON logic trigger not just for whole missing page tables, but for individual pages as well, and exposed this problem. This fixes it by making the test for when FOLL_ANON is used more careful, and also makes the code easier to read and understand by moving the logic to a separate inline function. Reported-and-tested-by: Jeff Chua Signed-off-by: Linus Torvalds --- mm/memory.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 9aefaae46858..423e0e7c2f73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1045,6 +1045,26 @@ no_page_table: return page; } +/* Can we do the FOLL_ANON optimization? */ +static inline int use_zero_page(struct vm_area_struct *vma) +{ + /* + * We don't want to optimize FOLL_ANON for make_pages_present() + * when it tries to page in a VM_LOCKED region. As to VM_SHARED, + * we want to get the page from the page tables to make sure + * that we serialize and update with any other user of that + * mapping. + */ + if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) + return 0; + /* + * And if we have a fault or a nopfn routine, it's not an + * anonymous region. + */ + return !vma->vm_ops || + (!vma->vm_ops->fault && !vma->vm_ops->nopfn); +} + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -1119,8 +1139,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags = FOLL_TOUCH; if (pages) foll_flags |= FOLL_GET; - if (!write && !(vma->vm_flags & VM_LOCKED) && - (!vma->vm_ops || !vma->vm_ops->fault)) + if (!write && use_zero_page(vma)) foll_flags |= FOLL_ANON; do { -- cgit v1.2.3 From 945754a1754f9d4c2974a8241ad4f92fad7f3a6a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 23 Jun 2008 14:30:30 +0200 Subject: mm: fix race in COW logic There is a race in the COW logic. It contains a shortcut to avoid the COW and reuse the page if we have the sole reference on the page, however it is possible to have two racing do_wp_page()ers with one causing the other to mistakenly believe it is safe to take the shortcut when it is not. This could lead to data corruption. Process 1 and process2 each have a wp pte of the same anon page (ie. one forked the other). The page's mapcount is 2. Then they both attempt to write to it around the same time... proc1 proc2 thr1 proc2 thr2 CPU0 CPU1 CPU3 do_wp_page() do_wp_page() trylock_page() can_share_swap_page() load page mapcount (==2) reuse = 0 pte unlock copy page to new_page pte lock page_remove_rmap(page); trylock_page() can_share_swap_page() load page mapcount (==1) reuse = 1 ptep_set_access_flags (allow W) write private key into page read from page ptep_clear_flush() set_pte_at(pte of new_page) Fix this by moving the page_remove_rmap of the old page after the pte clear and flush. Potentially the entire branch could be moved down here, but in order to stay consistent, I won't (should probably move all the *_mm_counter stuff with one patch). Signed-off-by: Nick Piggin Acked-by: Hugh Dickins Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 423e0e7c2f73..d14b251a25a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1785,7 +1785,6 @@ gotten: page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { - page_remove_rmap(old_page, vma); if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); @@ -1807,6 +1806,32 @@ gotten: lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); + if (old_page) { + /* + * Only after switching the pte to the new page may + * we remove the mapcount here. Otherwise another + * process may come and find the rmap count decremented + * before the pte is switched to the new page, and + * "reuse" the old page writing into it while our pte + * here still points into it and can be read by other + * threads. + * + * The critical issue is to order this + * page_remove_rmap with the ptp_clear_flush above. + * Those stores are ordered by (if nothing else,) + * the barrier present in the atomic_add_negative + * in page_remove_rmap. + * + * Then the TLB flush in ptep_clear_flush ensures that + * no process can access the old page before the + * decremented mapcount is visible. And the old page + * cannot be reused until after the decremented + * mapcount is visible. So transitively, TLBs to + * old page will be flushed before it can be reused. + */ + page_remove_rmap(old_page, vma); + } + /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; -- cgit v1.2.3 From 33852a1f2bb014e4047a844556c0d76a2f790c37 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Jun 2008 14:20:11 -0400 Subject: NFS: Reduce the NFS mount code stack usage. This appears to fix the Oops reported in http://bugzilla.kernel.org/show_bug.cgi?id=10826 Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 68 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2a4a024a4e7b..dac663dc5611 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options, { struct nfs_mount_data *data = (struct nfs_mount_data *)options; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type, { struct nfs_server *server = NULL; struct super_block *s; - struct nfs_fh mntfh; - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; - security_init_mnt_opts(&data.lsm_opts); + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); + error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs_create_server(&data, &mntfh); + server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (!s->s_root) { /* initial superblock/root creation */ - nfs_fill_super(s, &data); + nfs_fill_super(s, data); } - mntroot = nfs_get_root(s, &mntfh); + mntroot = nfs_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.nfs_server.hostname); - kfree(data.mount_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->nfs_server.hostname); + kfree(data->mount_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_err_nosb: @@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options, struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; - memset(args, 0, sizeof(*args)); - if (data == NULL) goto out_no_data; @@ -1959,26 +1963,31 @@ out_no_client_address: static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { - struct nfs_parsed_mount_data data; + struct nfs_parsed_mount_data *data; struct super_block *s; struct nfs_server *server; - struct nfs_fh mntfh; + struct nfs_fh *mntfh; struct dentry *mntroot; int (*compare_super)(struct super_block *, void *) = nfs_compare_super; struct nfs_sb_mountdata sb_mntdata = { .mntflags = flags, }; - int error; + int error = -ENOMEM; - security_init_mnt_opts(&data.lsm_opts); + data = kzalloc(sizeof(*data), GFP_KERNEL); + mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); + if (data == NULL || mntfh == NULL) + goto out_free_fh; + + security_init_mnt_opts(&data->lsm_opts); /* Validate the mount data */ - error = nfs4_validate_mount_data(raw_data, &data, dev_name); + error = nfs4_validate_mount_data(raw_data, data, dev_name); if (error < 0) goto out; /* Get a volume representation */ - server = nfs4_create_server(&data, &mntfh); + server = nfs4_create_server(data, mntfh); if (IS_ERR(server)) { error = PTR_ERR(server); goto out; @@ -2009,13 +2018,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, nfs4_fill_super(s); } - mntroot = nfs4_get_root(s, &mntfh); + mntroot = nfs4_get_root(s, mntfh); if (IS_ERR(mntroot)) { error = PTR_ERR(mntroot); goto error_splat_super; } - error = security_sb_set_mnt_opts(s, &data.lsm_opts); + error = security_sb_set_mnt_opts(s, &data->lsm_opts); if (error) goto error_splat_root; @@ -2025,10 +2034,13 @@ static int nfs4_get_sb(struct file_system_type *fs_type, error = 0; out: - kfree(data.client_address); - kfree(data.nfs_server.export_path); - kfree(data.nfs_server.hostname); - security_free_mnt_opts(&data.lsm_opts); + kfree(data->client_address); + kfree(data->nfs_server.export_path); + kfree(data->nfs_server.hostname); + security_free_mnt_opts(&data->lsm_opts); +out_free_fh: + kfree(mntfh); + kfree(data); return error; out_free: -- cgit v1.2.3 From b7e2445737ff69cef892b6fd9cd71cae2c9e9515 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 19 Jun 2008 15:21:11 -0400 Subject: NFS: Fix filehandle size comparisons in the mount code Fix a sign issue in xdr_decode_fhstatus3() Fix incorrect comparison in nfs_validate_mount_data() Signed-off-by: Trond Myklebust --- fs/nfs/mount_clnt.c | 5 +++-- fs/nfs/super.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 49c7cd0502cc..779d2eb649c5 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res) { struct nfs_fh *fh = res->fh; + unsigned size; if ((res->status = ntohl(*p++)) == 0) { - int size = ntohl(*p++); - if (size <= NFS3_FHSIZE) { + size = ntohl(*p++); + if (size <= NFS3_FHSIZE && size != 0) { fh->size = size; memcpy(fh->data, p, size); } else diff --git a/fs/nfs/super.c b/fs/nfs/super.c index dac663dc5611..614efeed5437 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1249,13 +1249,13 @@ static int nfs_validate_mount_data(void *options, case 5: memset(data->context, 0, sizeof(data->context)); case 6: - if (data->flags & NFS_MOUNT_VER3) + if (data->flags & NFS_MOUNT_VER3) { + if (data->root.size > NFS3_FHSIZE || data->root.size == 0) + goto out_invalid_fh; mntfh->size = data->root.size; - else + } else mntfh->size = NFS2_FHSIZE; - if (mntfh->size > sizeof(mntfh->data)) - goto out_invalid_fh; memcpy(mntfh->data, data->root.data, mntfh->size); if (mntfh->size < sizeof(mntfh->data)) -- cgit v1.2.3 From 03fa9e84e5dc10aeacb0e4eb2f708cd9fc36a5b8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 5 Jun 2008 16:02:35 -0400 Subject: NFS: nfs_updatepage(): don't mark page as dirty if an error occurred Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6d8ace3e3259..f333848fd3be 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page, } status = nfs_writepage_setup(ctx, page, offset, count); - __set_page_dirty_nobuffers(page); + if (status < 0) + nfs_set_pageerror(page); + else + __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", status, (long long)i_size_read(inode)); - if (status < 0) - nfs_set_pageerror(page); return status; } -- cgit v1.2.3 From 72c6e251ed84b3a9cdfde6711191155c47bb2b9c Mon Sep 17 00:00:00 2001 From: Thorsten Kranzkowski Date: Mon, 23 Jun 2008 20:57:22 +0000 Subject: alpha: fix compile error in arch/alpha/mm/init.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9267b4b3880d00dc2dab90f1d817c856939114f7 ("alpha: fix module load failures on smp (bug #10926)") causes a regression for my ev4 uniprocessor build: CC arch/alpha/mm/init.o /export/data/repositories/linux-2.6/arch/alpha/mm/init.c:34: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘typeof’ make[2]: *** [arch/alpha/mm/init.o] Error 1 make[1]: *** [arch/alpha/mm] Error 2 make: *** [sub-make] Error 2 This fixes it for me (compile and boot tested): Signed-off-by: Thorsten Kranzkowski Acked-by: Ivan Kokshaysky Signed-off-by: Linus Torvalds --- include/asm-alpha/percpu.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-alpha/percpu.h b/include/asm-alpha/percpu.h index 82e8a94b4b2f..3495e8e00d70 100644 --- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -69,6 +69,8 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) +#define PER_CPU_ATTRIBUTES + #endif /* SMP */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu_var(name) -- cgit v1.2.3 From d4acf7e7abe45457e751525a2a4d5b693dfdd597 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 6 Jun 2008 16:37:35 -0300 Subject: KVM: Fix race between timer migration and vcpu migration A guest vcpu instance can be scheduled to a different physical CPU between the test for KVM_REQ_MIGRATE_TIMER and local_irq_disable(). If that happens, the timer will only be migrated to the current pCPU on the next exit, meaning that guest LAPIC timer event can be delayed until a host interrupt is triggered. Fix it by cancelling guest entry if any vcpu request is pending. This has the side effect of nicely consolidating vcpu->requests checks. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00acf1301a15..b90744a1dc3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2759,6 +2759,8 @@ again: if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -2781,21 +2783,13 @@ again: local_irq_disable(); - if (need_resched()) { + if (vcpu->requests || need_resched()) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (vcpu->requests) - if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { - local_irq_enable(); - preempt_enable(); - r = 1; - goto out; - } - if (signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -2825,9 +2819,6 @@ again: kvm_guest_enter(); - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) - kvm_x86_ops->tlb_flush(vcpu); KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); -- cgit v1.2.3 From 06e05645661211b9eaadaf6344c335d2e80f0ba2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 6 Jun 2008 16:37:36 -0300 Subject: KVM: close timer injection race window in __vcpu_run If a timer fires after kvm_inject_pending_timer_irqs() but before local_irq_disable() the code will enter guest mode and only inject such timer interrupt the next time an unrelated event causes an exit. It would be simpler if the timer->pending irq conversion could be done with IRQ's disabled, so that the above problem cannot happen. For now introduce a new vcpu requests bit to cancel guest entry. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 9 ++++++--- arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 1 + 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f2f5d260874e..3829aa7b663f 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,9 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps) atomic_inc(&pt->pending); smp_mb__after_atomic_inc(); - if (vcpu0 && waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; - wake_up_interruptible(&vcpu0->wq); + if (vcpu0) { + set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); + if (waitqueue_active(&vcpu0->wq)) { + vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + wake_up_interruptible(&vcpu0->wq); + } } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c297c50eba63..ebc03f5ae162 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) wait_queue_head_t *q = &apic->vcpu->wq; atomic_inc(&apic->timer.pending); + set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); if (waitqueue_active(q)) { apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(q); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b90744a1dc3a..b08812d6b34c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2774,6 +2774,7 @@ again: } } + clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); kvm_inject_pending_timer_irqs(vcpu); preempt_disable(); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 092b1b25291d..de9d1df4bba2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -33,6 +33,7 @@ #define KVM_REQ_REPORT_TPR_ACCESS 2 #define KVM_REQ_MMU_RELOAD 3 #define KVM_REQ_TRIPLE_FAULT 4 +#define KVM_REQ_PENDING_TIMER 5 struct kvm_vcpu; extern struct kmem_cache *kvm_vcpu_cache; -- cgit v1.2.3 From 6597ca09e6c0e5aec7ffd2b8ab48c671d3c28414 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sun, 8 Jun 2008 01:48:53 -0300 Subject: KVM: MMU: Fix rmap_write_protect() hugepage iteration bug rmap_next() does not work correctly after rmap_remove(), as it expects the rmap chains not to change during iteration. Fix (for now) by restarting iteration from the beginning. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee3f53098f0c..9628091c574d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) rmap_remove(kvm, spte); --kvm->stat.lpages; set_shadow_pte(spte, shadow_trap_nonpresent_pte); + spte = NULL; write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); -- cgit v1.2.3 From 3094538739415a9225afd2a6c78cb0fe1c1f641b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 11 Jun 2008 20:32:40 -0300 Subject: KVM: MMU: large page update_pte issue with non-PAE 32-bit guests (resend) kvm_mmu_pte_write() does not handle 32-bit non-PAE large page backed guests properly. It will instantiate two 2MB sptes pointing to the same physical 2MB page when a guest large pte update is trapped. Instead of duplicating code to handle this, disallow directory level updates to happen through kvm_mmu_pte_write(), so the two 2MB sptes emulating one guest 4MB pte can be correctly created by the page fault handling path. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9628091c574d..baa6503894d3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1581,11 +1581,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, u64 *spte, const void *new) { - if ((sp->role.level != PT_PAGE_TABLE_LEVEL) - && !vcpu->arch.update_pte.largepage) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (!vcpu->arch.update_pte.largepage || + sp->role.glevels == PT32_ROOT_LEVEL) { + ++vcpu->kvm->stat.mmu_pde_zapped; + return; + } + } ++vcpu->kvm->stat.mmu_pte_updated; if (sp->role.glevels == PT32_ROOT_LEVEL) -- cgit v1.2.3 From 6bf6a9532fd03ad719f0c86654f16ef777b78fc6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 12 Jun 2008 16:54:41 +0300 Subject: KVM: MMU: Fix oops on guest userspace access to guest pagetable KVM has a heuristic to unshadow guest pagetables when userspace accesses them, on the assumption that most guests do not allow userspace to access pagetables directly. Unfortunately, in addition to unshadowing the pagetables, it also oopses. This never triggers on ordinary guests since sane OSes will clear the pagetables before assigning them to userspace, which will trigger the flood heuristic, unshadowing the pagetables before the first userspace access. One particular guest, though (Xenner) will run the kernel in userspace, triggering the oops. Since the heuristic is incorrect in this case, we can simply remove it. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index baa6503894d3..7e7c3969f7a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1083,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, struct kvm_mmu_page *shadow; spte |= PT_WRITABLE_MASK; - if (user_fault) { - mmu_unshadow(vcpu->kvm, gfn); - goto unshadowed; - } shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); if (shadow || @@ -1103,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } -unshadowed: - if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); -- cgit v1.2.3 From 4fa6b9c5dc4134bdeac341d731a87783cc11ca10 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Jun 2008 15:36:36 -0700 Subject: KVM: ioapic: fix lost interrupt when changing a device's irq The ioapic acknowledge path translates interrupt vectors to irqs. It currently uses a first match algorithm, stopping when it finds the first redirection table entry containing the vector. That fails however if the guest changes the irq to a different line, leaving the old redirection table entry in place (though masked). Result is interrupts not making it to the guest. Fix by always scanning the entire redirection table. Signed-off-by: Avi Kivity --- virt/kvm/ioapic.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 98778cb69c6e..1dcf9f3d1107 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -269,28 +269,9 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) } } -static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector) +static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi) { - int i; - - for (i = 0; i < IOAPIC_NUM_PINS; i++) - if (ioapic->redirtbl[i].fields.vector == vector) - return i; - return -1; -} - -void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; union ioapic_redir_entry *ent; - int gsi; - - gsi = get_eoi_gsi(ioapic, vector); - if (gsi == -1) { - printk(KERN_WARNING "Can't find redir item for %d EOI\n", - vector); - return; - } ent = &ioapic->redirtbl[gsi]; ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); @@ -300,6 +281,16 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) ioapic_deliver(ioapic, gsi); } +void kvm_ioapic_update_eoi(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + int i; + + for (i = 0; i < IOAPIC_NUM_PINS; i++) + if (ioapic->redirtbl[i].fields.vector == vector) + __kvm_ioapic_update_eoi(ioapic, i); +} + static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr) { struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private; -- cgit v1.2.3 From a9b21b622958afc3f3bc5a23d266dd9ed1171fd3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 24 Jun 2008 11:48:49 +0300 Subject: KVM: VMX: Fix host msr corruption with preemption enabled Switching msrs can occur either synchronously as a result of calls to the msr management functions (usually in response to the guest touching virtualized msrs), or asynchronously when preempting a kvm thread that has guest state loaded. If we're unlucky enough to have the two at the same time, host msrs are corrupted and the machine goes kaput on the next syscall. Most easily triggered by Windows Server 2008, as it does a lot of msr switching during bootup. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 02efbe75f317..540e95179074 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) load_transition_efer(vmx); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +static void __vmx_load_host_state(struct vcpu_vmx *vmx) { unsigned long flags; @@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) reload_host_efer(vmx); } +static void vmx_load_host_state(struct vcpu_vmx *vmx) +{ + preempt_disable(); + __vmx_load_host_state(vmx); + preempt_enable(); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - vmx_load_host_state(to_vmx(vcpu)); + __vmx_load_host_state(to_vmx(vcpu)); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) switch (msr_index) { #ifdef CONFIG_X86_64 case MSR_EFER: + vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); - if (vmx->host_state.loaded) { - reload_host_efer(vmx); - load_transition_efer(vmx); - } break; case MSR_FS_BASE: vmcs_writel(GUEST_FS_BASE, data); @@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) guest_write_tsc(data); break; default: + vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; - if (vmx->host_state.loaded) - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); -- cgit v1.2.3 From 63842cccb285259345f52025ef57bdfd79657a2d Mon Sep 17 00:00:00 2001 From: Wim Van Sebroeck Date: Tue, 24 Jun 2008 13:09:26 +0000 Subject: Revert "[WATCHDOG] hpwdt: Add CFLAGS to get driver working" After Linus fixed the inline assembly, the CFLAGS option is not needed anymore. Signed-off-by: Thomas Mingarelli Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 8662a6b7a30b..25b352b664d9 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -68,7 +68,6 @@ obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o obj-$(CONFIG_I6300ESB_WDT) += i6300esb.o obj-$(CONFIG_ITCO_WDT) += iTCO_wdt.o iTCO_vendor_support.o obj-$(CONFIG_IT8712F_WDT) += it8712f_wdt.o -CFLAGS_hpwdt.o += -O obj-$(CONFIG_HP_WATCHDOG) += hpwdt.o obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o -- cgit v1.2.3 From 17c15da00c0e7289375ad57e8fea0c7892b74aa0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 18 Jun 2008 11:30:40 -0500 Subject: [GFS2] BUG: unable to handle kernel paging request at ffff81002690e000 This patch fixes bugzilla bug bz448866: gfs2: BUG: unable to handle kernel paging request at ffff81002690e000. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6387523a3153..3401628d742b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -195,7 +195,7 @@ ulong_aligned: depending on architecture. I've experimented with several ways of writing this section such as using an else before the goto but this one seems to be the fastest. */ - while ((unsigned char *)plong < end - 1) { + while ((unsigned char *)plong < end - sizeof(unsigned long)) { prefetch(plong + 1); if (((*plong) & LBITMASK) != lskipval) break; -- cgit v1.2.3 From 28499143933f19b28008a556ed59255d6009391a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 9 May 2008 12:05:57 +0100 Subject: xen: remove support for non-PAE 32-bit Non-PAE operation has been deprecated in Xen for a while, and is rarely tested or used. xen-unstable has now officially dropped non-PAE support. Since Xen/pvops' non-PAE support has also been broken for a while, we may as well completely drop it altogether. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/enlighten.c | 51 ++++++++++++++++------------------------------ arch/x86/xen/mmu.c | 19 ++--------------- arch/x86/xen/mmu.h | 24 ++++++---------------- arch/x86/xen/xen-head.S | 4 ---- include/asm-x86/xen/page.h | 4 ---- 6 files changed, 27 insertions(+), 77 deletions(-) diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 2e641be2737e..525b108411bd 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c048de34d6a1..f09c1c69c37a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + int i; /* special set_pte for pagetable initialization */ pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. + * copy top-level of Xen-supplied pagetable into place. This + * is a stand-in while we copy the pmd pages. */ memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); - make_lowmem_page_readonly(pmd); + make_lowme