From 70b50f94f1644e2aa7cb374819cfd93f3c28d725 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:36:59 -0700 Subject: mm: thp: tail page refcounting fix Michel while working on the working set estimation code, noticed that calling get_page_unless_zero() on a random pfn_to_page(random_pfn) wasn't safe, if the pfn ended up being a tail page of a transparent hugepage under splitting by __split_huge_page_refcount(). He then found the problem could also theoretically materialize with page_cache_get_speculative() during the speculative radix tree lookups that uses get_page_unless_zero() in SMP if the radix tree page is freed and reallocated and get_user_pages is called on it before page_cache_get_speculative has a chance to call get_page_unless_zero(). So the best way to fix the problem is to keep page_tail->_count zero at all times. This will guarantee that get_page_unless_zero() can never succeed on any tail page. page_tail->_mapcount is guaranteed zero and is unused for all tail pages of a compound page, so we can simply account the tail page references there and transfer them to tail_page->_count in __split_huge_page_refcount() (in addition to the head_page->_mapcount). While debugging this s/_count/_mapcount/ change I also noticed get_page is called by direct-io.c on pages returned by get_user_pages. That wasn't entirely safe because the two atomic_inc in get_page weren't atomic. As opposed to other get_user_page users like secondary-MMU page fault to establish the shadow pagetables would never call any superflous get_page after get_user_page returns. It's safer to make get_page universally safe for tail pages and to use get_page_foll() within follow_page (inside get_user_pages()). get_page_foll() is safe to do the refcounting for tail pages without taking any locks because it is run within PT lock protected critical sections (PT lock for pte and page_table_lock for pmd_trans_huge). The standard get_page() as invoked by direct-io instead will now take the compound_lock but still only for tail pages. The direct-io paths are usually I/O bound and the compound_lock is per THP so very finegrined, so there's no risk of scalability issues with it. A simple direct-io benchmarks with all lockdep prove locking and spinlock debugging infrastructure enabled shows identical performance and no overhead. So it's worth it. Ideally direct-io should stop calling get_page() on pages returned by get_user_pages(). The spinlock in get_page() is already optimized away for no-THP builds but doing get_page() on tail pages returned by GUP is generally a rare operation and usually only run in I/O paths. This new refcounting on page_tail->_mapcount in addition to avoiding new RCU critical sections will also allow the working set estimation code to work without any further complexity associated to the tail page refcounting with THP. Signed-off-by: Andrea Arcangeli Reported-by: Michel Lespinasse Reviewed-by: Michel Lespinasse Reviewed-by: Minchan Kim Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/gup.c | 5 +-- arch/x86/mm/gup.c | 5 +-- include/linux/mm.h | 56 +++++++++++++------------------- include/linux/mm_types.h | 21 +++++++++--- mm/huge_memory.c | 37 ++++++++++++++------- mm/internal.h | 46 +++++++++++++++++++++++++++ mm/memory.c | 2 +- mm/swap.c | 83 +++++++++++++++++++++++++++++++----------------- 8 files changed, 171 insertions(+), 84 deletions(-) diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index fec13200868f..b9e1c7ff5f6d 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -22,8 +22,9 @@ static inline void get_huge_page_tail(struct page *page) * __split_huge_page_refcount() cannot run * from under us. */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); } /* diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dbe34b931374..3b5032a62b0f 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -114,8 +114,9 @@ static inline void get_huge_page_tail(struct page *page) * __split_huge_page_refcount() cannot run * from under us. */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); } static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b3e3b8bb706..f81b7b41930c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -356,36 +356,39 @@ static inline struct page *compound_head(struct page *page) return page; } +/* + * The atomic page->_mapcount, starts from -1: so that transitions + * both from it and to it can be tracked, using atomic_inc_and_test + * and atomic_add_negative(-1). + */ +static inline void reset_page_mapcount(struct page *page) +{ + atomic_set(&(page)->_mapcount, -1); +} + +static inline int page_mapcount(struct page *page) +{ + return atomic_read(&(page)->_mapcount) + 1; +} + static inline int page_count(struct page *page) { return atomic_read(&compound_head(page)->_count); } +extern bool __get_page_tail(struct page *page); + static inline void get_page(struct page *page) { + if (unlikely(PageTail(page))) + if (likely(__get_page_tail(page))) + return; /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. Only if - * we're getting a tail page, the elevated page->_count is - * required only in the head page, so for tail pages the - * bugcheck only verifies that the page->_count isn't - * negative. + * requires to already have an elevated page->_count. */ - VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); + VM_BUG_ON(atomic_read(&page->_count) <= 0); atomic_inc(&page->_count); - /* - * Getting a tail page will elevate both the head and tail - * page->_count(s). - */ - if (unlikely(PageTail(page))) { - /* - * This is safe only because - * __split_huge_page_refcount can't run under - * get_page(). - */ - VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); - atomic_inc(&page->first_page->_count); - } } static inline struct page *virt_to_head_page(const void *x) @@ -803,21 +806,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -/* - * The atomic page->_mapcount, like _count, starts from -1: - * so that transitions both from it and to it can be tracked, - * using atomic_inc_and_test and atomic_add_negative(-1). - */ -static inline void reset_page_mapcount(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); -} - -static inline int page_mapcount(struct page *page) -{ - return atomic_read(&(page)->_mapcount) + 1; -} - /* * Return true if this page is mapped into pagetables. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e01a19a91e8..5b42f1b34eb7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -62,10 +62,23 @@ struct page { struct { union { - atomic_t _mapcount; /* Count of ptes mapped in mms, - * to show when page is mapped - * & limit reverse map searches. - */ + /* + * Count of ptes mapped in + * mms, to show when page is + * mapped & limit reverse map + * searches. + * + * Used also for tail pages + * refcounting instead of + * _count. Tail pages cannot + * be mapped and keeping the + * tail page _count zero at + * all times guarantees + * get_page_unless_zero() will + * never succeed on tail + * pages. + */ + atomic_t _mapcount; struct { unsigned inuse:16; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 860ec211ddd6..4298abaae153 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -990,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); out: return page; @@ -1202,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page) unsigned long head_index = page->index; struct zone *zone = page_zone(page); int zonestat; + int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1210,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page) for (i = 1; i < HPAGE_PMD_NR; i++) { struct page *page_tail = page + i; - /* tail_page->_count cannot change */ - atomic_sub(atomic_read(&page_tail->_count), &page->_count); - BUG_ON(page_count(page) <= 0); - atomic_add(page_mapcount(page) + 1, &page_tail->_count); - BUG_ON(atomic_read(&page_tail->_count) <= 0); + /* tail_page->_mapcount cannot change */ + BUG_ON(page_mapcount(page_tail) < 0); + tail_count += page_mapcount(page_tail); + /* check for overflow */ + BUG_ON(tail_count < 0); + BUG_ON(atomic_read(&page_tail->_count) != 0); + /* + * tail_page->_count is zero and not changing from + * under us. But get_page_unless_zero() may be running + * from under us on the tail_page. If we used + * atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is + * implemented in C not using locked ops. spin_unlock + * on x86 sometime uses locked ops because of PPro + * errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and + * not only on x86), it's safer to use atomic_add(). + */ + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, + &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ smp_mb(); @@ -1232,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page) (1L << PG_uptodate))); page_tail->flags |= (1L << PG_dirty); - /* - * 1) clear PageTail before overwriting first_page - * 2) clear PageTail before clearing PageHead for VM_BUG_ON - */ + /* clear PageTail before overwriting first_page */ smp_wmb(); /* @@ -1252,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page) * status is achieved setting a reserved bit in the * pmd, not by clearing the present bit. */ - BUG_ON(page_mapcount(page_tail)); page_tail->_mapcount = page->_mapcount; BUG_ON(page_tail->mapping); @@ -1269,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + atomic_sub(tail_count, &page->_count); + BUG_ON(atomic_read(&page->_count) <= 0); __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); diff --git a/mm/internal.h b/mm/internal.h index d071d380fb49..2189af491783 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +static inline void __get_page_tail_foll(struct page *page, + bool get_page_head) +{ + /* + * If we're getting a tail page, the elevated page->_count is + * required only in the head page and we will elevate the head + * page->_count and tail page->_mapcount. + * + * We elevate page_tail->_mapcount for tail pages to force + * page_tail->_count to be zero at all times to avoid getting + * false positives from get_page_unless_zero() with + * speculative page access (like in + * page_cache_get_speculative()) on tail pages. + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + VM_BUG_ON(page_mapcount(page) < 0); + if (get_page_head) + atomic_inc(&page->first_page->_count); + atomic_inc(&page->_mapcount); +} + +/* + * This is meant to be called as the FOLL_GET operation of + * follow_page() and it must be called while holding the proper PT + * lock while the pte (or pmd_trans_huge) is still mapping the page. + */ +static inline void get_page_foll(struct page *page) +{ + if (unlikely(PageTail(page))) + /* + * This is safe only because + * __split_huge_page_refcount() can't run under + * get_page_foll() because we hold the proper PT lock. + */ + __get_page_tail_foll(page, true); + else { + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_inc(&page->_count); + } +} + extern unsigned long highest_memmap_pfn; /* diff --git a/mm/memory.c b/mm/memory.c index a56e3ba816b2..b2b87315cdc6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1503,7 +1503,7 @@ split_fallthrough: } if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) diff --git a/mm/swap.c b/mm/swap.c index 3a442f18b0b3..87627f181c3f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) { if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ - struct page *page_head = page->first_page; - smp_rmb(); - /* - * If PageTail is still set after smp_rmb() we can be sure - * that the page->first_page we read wasn't a dangling pointer. - * See __split_huge_page_refcount() smp_wmb(). - */ - if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && + get_page_unless_zero(page_head))) { unsigned long flags; /* - * Verify that our page_head wasn't converted - * to a a regular page before we got a - * reference on it. + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. */ - if (unlikely(!PageHead(page_head))) { - /* PageHead is cleared after PageTail */ - smp_rmb(); - VM_BUG_ON(PageTail(page)); - goto out_put_head; - } - /* - * Only run compound_lock on a valid PageHead, - * after having it pinned with - * get_page_unless_zero() above. - */ - smp_mb(); - /* page_head wasn't a dangling pointer */ flags = compound_lock_irqsave(page_head); if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); VM_BUG_ON(PageHead(page_head)); - out_put_head: if (put_page_testzero(page_head)) __put_single_page(page_head); out_put_single: @@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) VM_BUG_ON(page_head != page->first_page); /* * We can release the refcount taken by - * get_page_unless_zero now that - * split_huge_page_refcount is blocked on the - * compound_lock. + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on + * the compound_lock. */ if (put_page_testzero(page_head)) VM_BUG_ON(1); /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(atomic_read(&page->_count) <= 0); - atomic_dec(&page->_count); + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { if (PageHead(page_head)) @@ -160,6 +144,45 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/* + * This function is exported but must not be called by anything other + * than get_page(). It implements the slow path of get_page(). + */ +bool __get_page_tail(struct page *page) +{ + /* + * This takes care of get_page() if run on a tail page + * returned by one of the get_user_pages/follow_page variants. + * get_user_pages/follow_page itself doesn't need the compound + * lock because it runs __get_page_tail_foll() under the + * proper PT lock that already serializes against + * split_huge_page(). + */ + unsigned long flags; + bool got = false; + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; + } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); + } + return got; +} +EXPORT_SYMBOL(__get_page_tail); + /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru -- cgit v1.2.3 From 2839bdc1bfc0af76a2f0f11eca011590520a04fa Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:03 -0700 Subject: powerpc: remove superfluous PageTail checks on the pte gup_fast This part of gup_fast doesn't seem capable of handling hugetlbfs ptes, those should be handled by gup_hugepd only, so these checks are superfluous. Plus if this wasn't a noop, it would have oopsed because, the insistence of using the speculative refcounting would trigger a VM_BUG_ON if a tail page was encountered in the page_cache_get_speculative(). Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/gup.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index b9e1c7ff5f6d..d7efdbf640c7 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,17 +16,6 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -58,8 +47,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } - if (PageTail(page)) - get_huge_page_tail(page); pages[*nr] = page; (*nr)++; -- cgit v1.2.3 From 405e44f2e312dd5dd63e5a9f459bffcbcd4368ef Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:08 -0700 Subject: powerpc: get_hugepte() don't put_page() the wrong page "page" may have changed to point to the next hugepage after the loop completed, The references have been taken on the head page, so the put_page must happen there too. This is a longstanding issue pre-thp inclusion. It's totally unclear how these page_cache_add_speculative and pte_val(pte) != pte_val(*ptep) checks are necessary across all the powerpc gup_fast code, when x86 doesn't need any of that: there's no way the page can be freed with irq disabled so we're guaranteed the atomic_inc will happen on a page with page_count > 0 (so not needing the speculative check). The pte check is also meaningless on x86: no need to rollback on x86 if the pte changed, because the pte can still change a CPU tick after the check succeeded and it won't be rolled back in that case. The important thing is we got a reference on a valid page that was mapped there a CPU tick ago. So not knowing the soft tlb refill code of ppc64 in great detail I'm not removing the "speculative" page_count increase and the pte checks across all the code, but unless there's a strong reason for it they should be later cleaned up too. If a pte can change from huge to non-huge (like it could happen with THP) passing a pte_t *ptep to gup_hugepte() would also require to repeat the is_hugepd in gup_hugepte(), but that shouldn't happen with hugetlbfs only so I'm not altering that. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0b9a5c1901b9..b649c288af90 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -429,7 +429,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ while (*nr) { - put_page(page); + put_page(head); (*nr)--; } } -- cgit v1.2.3 From 8596468487e2062cae2aad56e973784e03959245 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:11 -0700 Subject: powerpc: gup_hugepte() avoid freeing the head page too many times We only taken "refs" pins on the head page not "*nr" pins. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b649c288af90..78b14abded65 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -428,10 +428,9 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ - while (*nr) { + *nr -= refs; + while (refs--) put_page(head); - (*nr)--; - } } return 1; -- cgit v1.2.3 From 3526741f0964c88bc2ce511e1078359052bf225b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:15 -0700 Subject: powerpc: gup_hugepte() support THP based tail recounting Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 78b14abded65..a618ef01bfad 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -385,12 +385,23 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask; unsigned long pte_end; - struct page *head, *page; + struct page *head, *page, *tail; pte_t pte; int refs; @@ -413,6 +424,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add head = pte_page(pte); page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -431,6 +443,16 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add *nr -= refs; while (refs--) put_page(head); + } else { + /* + * Any tail page need their mapcount reference taken + * before we return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } } return 1; -- cgit v1.2.3 From cf592bf768c4fa40282b8fce58a80820065de2cb Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:19 -0700 Subject: powerpc: gup_huge_pmd() return 0 if pte changes powerpc didn't return 0 in that case, if it's rolling back the *nr pointer it should also return zero to avoid adding pages to the array at the wrong offset. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Acked-by: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a618ef01bfad..1c59d94f5942 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -443,16 +443,17 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add *nr -= refs; while (refs--) put_page(head); - } else { - /* - * Any tail page need their mapcount reference taken - * before we return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; -- cgit v1.2.3 From 220a2eb228d032acde60e9fd044ca802706ff583 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:25 -0700 Subject: s390: gup_huge_pmd() support THP tail recounting Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gup.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 45b405ca2567..668dda964f20 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -48,11 +48,22 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 1; } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask, result; - struct page *head, *page; + struct page *head, *page, *tail; int refs; result = write ? 0 : _SEGMENT_ENTRY_RO; @@ -64,6 +75,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, refs = 0; head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -81,6 +93,16 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, *nr -= refs; while (refs--) put_page(head); + } else { + /* + * Any tail page need their mapcount reference taken + * before we return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } } return 1; -- cgit v1.2.3 From 0693bc9ce2cc4f6a1b9c3c05790fc149a74c0b87 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:28 -0700 Subject: s390: gup_huge_pmd() return 0 if pte changes s390 didn't return 0 in that case, if it's rolling back the *nr pointer it should also return zero to avoid adding pages to the array at the wrong offset. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gup.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 668dda964f20..da33a0281d9d 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -93,16 +93,17 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, *nr -= refs; while (refs--) put_page(head); - } else { - /* - * Any tail page need their mapcount reference taken - * before we return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; -- cgit v1.2.3 From e0d85a366c2300efd230ef82a9b22110b0658331 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:31 -0700 Subject: sparc: gup_pte_range() support THP based tail recounting Up to this point the code assumed old refcounting for hugepages (pre-thp). This updates the code directly to the thp mapcount tail page refcounting. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Acked-by: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/gup.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index a986b5d05712..afcebac144fb 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -12,6 +12,17 @@ #include #include +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -56,6 +67,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(head); return 0; } + if (head != page) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; -- cgit v1.2.3 From b35a35b556f5e6b7993ad0baf20173e75c09ce8c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 2 Nov 2011 13:37:36 -0700 Subject: thp: share get_huge_page_tail() This avoids duplicating the function in every arch gup_fast. Signed-off-by: Andrea Arcangeli Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 11 ----------- arch/s390/mm/gup.c | 11 ----------- arch/sparc/mm/gup.c | 11 ----------- arch/x86/mm/gup.c | 11 ----------- include/linux/mm.h | 11 +++++++++++ 5 files changed, 11 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1c59d94f5942..da5eb3885702 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -385,17 +385,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index da33a0281d9d..65cb06e2af4e 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -48,17 +48,6 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 1; } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index afcebac144fb..42c55df3aec3 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -12,17 +12,6 @@ #include #include -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 3b5032a62b0f..ea305856151c 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -108,17 +108,6 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(page_mapcount(page) < 0); - VM_BUG_ON(atomic_read(&page->_count) != 0); - atomic_inc(&page->_mapcount); -} - static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/include/linux/mm.h b/include/linux/mm.h index f81b7b41930c..3dc3a8c2c485 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -376,6 +376,17 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(page_mapcount(page) < 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + atomic_inc(&page->_mapcount); +} + extern bool __get_page_tail(struct page *page); static inline void get_page(struct page *page) -- cgit v1.2.3 From a3defbe5c337dbc6da911f8cc49ae3cc3b49b453 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 2 Nov 2011 13:37:41 -0700 Subject: binfmt_elf: fix PIE execution with randomization disabled The case of address space randomization being disabled in runtime through randomize_va_space sysctl is not treated properly in load_elf_binary(), resulting in SIGKILL coming at exec() time for certain PIE-linked binaries in case the randomization has been disabled at runtime prior to calling exec(). Handle the randomize_va_space == 0 case the same way as if we were not supporting .text randomization at all. Based on original patch by H.J. Lu and Josh Boyer. Signed-off-by: Jiri Kosina Cc: Ingo Molnar Cc: Russell King Cc: H.J. Lu Cc: Tested-by: Josh Boyer Acked-by: Nicolas Pitre Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index dd0fdfc56d38..21ac5ee4b43f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -795,7 +795,16 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * might try to exec. This is because the brk will * follow the loader, and is not movable. */ #if defined(CONFIG_X86) || defined(CONFIG_ARM) - load_bias = 0; + /* Memory randomization might have been switched off + * in runtime via sysctl. + * If that is the case, retain the original non-zero + * load_bias value in order to establish proper + * non-randomized mappings. + */ + if (current->flags & PF_RANDOMIZE) + load_bias = 0; + else + load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #else load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); #endif -- cgit v1.2.3 From 0620d9193cb976ba635d56a6cfd11cb81616d02b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 2 Nov 2011 13:37:45 -0700 Subject: ramfs: remove module leftovers Since ramfs is hard-selected to "y", the module leftovers make no sense. Signed-off-by: Richard Weinberger Reviewed-by: WANG Cong Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index eacb166fb259..462ceb38fec6 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -23,7 +23,6 @@ * caches is sufficient. */ -#include #include #include #include @@ -288,14 +287,7 @@ static int __init init_ramfs_fs(void) { return register_filesystem(&ramfs_fs_type); } - -static void __exit exit_ramfs_fs(void) -{ - unregister_filesystem(&ramfs_fs_type); -} - module_init(init_ramfs_fs) -module_exit(exit_ramfs_fs) int __init init_rootfs(void) { @@ -311,5 +303,3 @@ int __init init_rootfs(void) return err; } - -MODULE_LICENSE("GPL"); -- cgit v1.2.3 From f919b9235f930e649b374a50009c6c268bd9a073 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Wed, 2 Nov 2011 13:37:47 -0700 Subject: init/do_mounts_rd.c: fix ramdisk identification for padded cramfs When a cramfs ramdisk padded with 512 bytes is given to the kernel, the current identify_ramdisk_image function fails to identify it. Tested with a padded cramfs image on an ARM based board. Signed-off-by: Neil Armstrong Cc: Namhyung Kim Cc: Davidlohr Bueso Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts_rd.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index fe9acb0ae480..887629e24c54 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -119,6 +119,20 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) goto done; } + /* + * Read 512 bytes further to check if cramfs is padded + */ + sys_lseek(fd, start_block * BLOCK_SIZE + 0x200, 0); + sys_read(fd, buf, size); + + if (cramfsb->magic == CRAMFS_MAGIC) { + printk(KERN_NOTICE + "RAMDISK: cramfs filesystem found at block %d\n", + start_block); + nblocks = (cramfsb->size + BLOCK_SIZE - 1) >> BLOCK_SIZE_BITS; + goto done; + } + /* * Read block 1 to test for minix and ext2 superblock */ -- cgit v1.2.3 From 6d03d06db8881f4f9da87d5c77234b98c40a30e9 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 2 Nov 2011 13:37:49 -0700 Subject: drivers/rtc/class.c: convert idr to ida and use ida_simple_get() This is the one use of an ida that doesn't retry on receiving -EAGAIN. I'm assuming do so will cause no harm and may help on a rare occasion. Signed-off-by: Jonathan Cameron Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/class.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 01a7df5317c1..e8326f26fa2f 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -21,16 +21,13 @@ #include "rtc-core.h" -static DEFINE_IDR(rtc_idr); -static DEFINE_MUTEX(idr_lock); +static DEFINE_IDA(rtc_ida); struct class *rtc_class; static void rtc_device_release(struct device *dev) { struct rtc_device *rtc = to_rtc_device(dev); - mutex_lock(&idr_lock); - idr_remove(&rtc_idr, rtc->id); - mutex_unlock(&idr_lock); + ida_simple_remove(&rtc_ida, rtc->id); kfree(rtc); } @@ -146,25 +143,16 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, struct rtc_wkalrm alrm; int id, err; - if (idr_pre_get(&rtc_idr, GFP_KERNEL) == 0) { - err = -ENOMEM; + id = ida_simple_get(&rtc_ida, 0, 0, GFP_KERNEL); + if (id < 0) { + err = id; goto exit; } - - mutex_lock(&idr_lock); - err = idr_get_new(&rtc_idr, NULL, &id); - mutex_unlock(&idr_lock); - - if (err < 0) - goto exit; - - id = id & MAX_ID_MASK; - rtc = kzalloc(sizeof(struct rtc_device), GFP_KERNEL); if (rtc == NULL) { err = -ENOMEM; - goto exit_idr; + goto exit_ida; } rtc->id = id; @@ -222,10 +210,8 @@ struct rtc_device *rtc_device_register(const char *name, struct device *dev, exit_kfree: kfree(rtc); -exit_idr: - mutex_lock(&idr_lock); - idr_remove(&rtc_idr, id); - mutex_unlock(&idr_lock); +exit_ida: + ida_simple_remove(&rtc_ida, id); exit: dev_err(dev, "rtc core: unable to register %s, err = %d\n", @@ -276,7 +262,7 @@ static void __exit rtc_exit(void) { rtc_dev_exit(); class_destroy(rtc_class); - idr_destroy(&rtc_idr); + ida_destroy(&rtc_ida); } subsys_initcall(rtc_init); -- cgit v1.2.3 From 43fcb81550f7a16be192b19c77a379c9b27b1585 Mon Sep 17 00:00:00 2001 From: David Anders Date: Wed, 2 Nov 2011 13:37:53 -0700 Subject: rtc: add initial support for mcp7941x parts Add initial support for the microchip mcp7941x series of real time clocks. The mcp7941x series is generally compatible with the ds1307 and ds1337 rtc devices from dallas semiconductor. minor differences include a backup battery enable bit, and the polarity of the oscillator enable bit. Signed-off-by: David Anders Cc: Alessandro Zummo Reviewed-by: Wolfram Sang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1307.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index b2005b44e4f7..62b0763b7b9a 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -34,6 +34,7 @@ enum ds_type { ds_1388, ds_3231, m41t00, + mcp7941x, rx_8025, // rs5c372 too? different address... }; @@ -43,6 +44,7 @@ enum ds_type { #define DS1307_REG_SECS 0x00 /* 00-59 */ # define DS1307_BIT_CH 0x80 # define DS1340_BIT_nEOSC 0x80 +# define MCP7941X_BIT_ST 0x80 #define DS1307_REG_MIN 0x01 /* 00-59 */ #define DS1307_REG_HOUR 0x02 /* 00-23, or 1-12{am,pm} */ # define DS1307_BIT_12HR 0x40 /* in REG_HOUR */ @@ -50,6 +52,7 @@ enum ds_type { # define DS1340_BIT_CENTURY_EN 0x80 /* in REG_HOUR */ # define DS1340_BIT_CENTURY 0x40 /* in REG_HOUR */ #define DS1307_REG_WDAY 0x03 /* 01-07 */ +# define MCP7941X_BIT_VBATEN 0x08 #define DS1307_REG_MDAY 0x04 /* 01-31 */ #define DS1307_REG_MONTH 0x05 /* 01-12 */ # define DS1337_BIT_CENTURY 0x80 /* in REG_MONTH */ @@ -137,6 +140,8 @@ static const struct chip_desc chips[] = { }, [m41t00] = { }, +[mcp7941x] = { +}, [rx_8025] = { }, }; @@ -149,6 +154,7 @@ static const struct i2c_device_id ds1307_id[] = { { "ds1340", ds_1340 }, { "ds3231", ds_3231 }, { "m41t00", m41t00 }, + { "mcp7941x", mcp7941x }, { "pt7c4338", ds_1307 }, { "rx8025", rx_8025 }, { } @@ -365,6 +371,10 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t) buf[DS1307_REG_HOUR] |= DS1340_BIT_CENTURY_EN | DS1340_BIT_CENTURY; break; + case mcp7941x: + buf[DS1307_REG_SECS] |= MCP7941X_BIT_ST; + buf[DS1307_REG_WDAY] |= MCP7941X_BIT_VBATEN; + break; default: break; } @@ -808,6 +818,23 @@ read_rtc: i2c_smbus_write_byte_data(client, DS1340_REG_FLAG, 0); dev_warn(&client->dev, "SET TIME!\n"); } + break; + case mcp7941x: + /* make sure that the backup battery is enabled */ + if (!(ds1307->regs[DS1307_REG_WDAY] & MCP7941X_BIT_VBATEN)) { + i2c_smbus_write_byte_data(client, DS1307_REG_WDAY, + ds1307->regs[DS1307_REG_WDAY] + | MCP7941X_BIT_VBATEN); + } + + /* clock halted? turn it on, so clock can tick. */ + if (!(tmp & MCP7941X_BIT_ST)) { + i2c_smbus_write_byte_data(client, DS1307_REG_SECS, + MCP7941X_BIT_ST); + dev_warn(&client->dev, "SET TIME!\n"); + goto read_rtc; + } + break; case rx_8025: case ds_1337: -- cgit v1.2.3 From db5cf8d1ac4ac3fa06d89345154ce20068aeb097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 2 Nov 2011 13:37:56 -0700 Subject: drivers/rtc/rtc-mc13xxx.c: move probe and remove callbacks to .init.text and .exit.text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver is added using platform_driver_probe(), so the callbacks can be discarded more aggessively. Signed-off-by: Uwe Kleine-König Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index a1a278bc340d..9d0c3b478d55 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -309,7 +309,7 @@ static irqreturn_t mc13xxx_rtc_reset_handler(int irq, void *dev) return IRQ_HANDLED; } -static int __devinit mc13xxx_rtc_probe(struct platform_device *pdev) +static int __init mc13xxx_rtc_probe(struct platform_device *pdev) { int ret; struct mc13xxx_rtc *priv; @@ -378,7 +378,7 @@ err_reset_irq_request: return ret; } -static int __devexit mc13xxx_rtc_remove(struct platform_device *pdev) +static int __exit mc13xxx_rtc_remove(struct platform_device *pdev) { struct mc13xxx_rtc *priv = platform_get_drvdata(pdev); @@ -410,7 +410,7 @@ const struct platform_device_id mc13xxx_rtc_idtable[] = { static struct platform_driver mc13xxx_rtc_driver = { .id_table = mc13xxx_rtc_idtable, - .remove = __devexit_p(mc13xxx_rtc_remove), + .remove = __exit_p(mc13xxx_rtc_remove), .driver = { .name = DRIVER_NAME, .owner = THIS_MODULE, -- cgit v1.2.3 From b6eb48d02dc73d19bebc396a9e92dd64a65d3199 Mon Sep 17 00:00:00 2001 From: Sami Kerola Date: Wed, 2 Nov 2011 13:37:58 -0700 Subject: minix: describe usage of different magic numbers One can get this information from minix/inode.c, but adding the explanations at the definition sites is more appropriate. Signed-off-by: Sami Kerola Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/magic.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/magic.h b/include/linux/magic.h index 1e5df2af8d84..2d4beab0d5b7 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -30,11 +30,11 @@ #define ANON_INODE_FS_MAGIC 0x09041934 #define PSTOREFS_MAGIC 0x6165676C -#define MINIX_SUPER_MAGIC 0x137F /* original minix fs */ -#define MINIX_SUPER_MAGIC2 0x138F /* minix fs, 30 char names */ -#define MINIX2_SUPER_MAGIC 0x2468 /* minix V2 fs */ -#define MINIX2_SUPER_MAGIC2 0x2478 /* minix V2 fs, 30 char names */ -#define MINIX3_SUPER_MAGIC 0x4d5a /* minix V3 fs */ +#define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ +#define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ +#define MINIX2_SUPER_MAGIC 0x2468 /* minix v2 fs, 14 char names */ +#define MINIX2_SUPER_MAGIC2 0x2478 /* minix v2 fs, 30 char names */ +#define MINIX3_SUPER_MAGIC 0x4d5a /* minix v3 fs, 60 char names */ #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ -- cgit v1.2.3 From 3069083cc8def2ffad8520f0f24c6f95f140aac5 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 2 Nov 2011 13:38:00 -0700 Subject: isofs: add readpages support Use mpage_readpages() instead of multiple calls to isofs_readpage() to reduce the CPU utilization and make performance higher. Signed-off-by: Namjae Jeon Cc: Al Viro Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index a5d03672d04e..46844ff39d61 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "isofs.h" #include "zisofs.h" @@ -1148,7 +1149,13 @@ struct buffer_head *isofs_bread(struct inode *inode, sector_t block) static int isofs_readpage(struct file *file, struct page *page) { - return block_read_full_page(page,isofs_get_block); + return mpage_readpage(page, isofs_get_block); +} + +static int isofs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1158,6 +1165,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, + .readpages = isofs_readpages, .bmap = _isofs_bmap }; -- cgit v1.2.3 From 434a964daa14b9db083ce20404a4a2add54d037a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 2 Nov 2011 13:38:01 -0700 Subject: hfs: fix hfs_find_init() sb->ext_tree NULL ptr oops Clement Lecigne reports a filesystem which causes a kernel oops in hfs_find_init() trying to dereference sb->ext_tree which is NULL. This proves to be because the filesystem has a corrupted MDB extent record, where the extents file does not fit into the first three extents in the file record (the first blocks). In hfs_get_block() when looking up the blocks for the extent file (HFS_EXT_CNID), it fails the first blocks special case, and falls through to the extent code (which ultimately calls hfs_find_init()) which is in the process of being initialised. Hfs avoids this scenario by always having the extents b-tree fitting into the first blocks (the extents B-tree can't have overflow extents). The fix is to check at mount time that the B-tree fits into first blocks, i.e. fail if HFS_I(inode)->alloc_blocks >= HFS_I(inode)->first_blocks Note, the existing commit 47f365eb57573 ("hfs: fix oops on mount with corrupted btree extent records") becomes subsumed into this as a special case, but only for the extents B-tree (HFS_EXT_CNID), it is perfectly acceptable for the catalog B-Tree file to grow beyond three extents, with the remaining extent descriptors in the extents overfow. This fixes CVE-2011-2203 Reported-by: Clement LECIGNE Signed-off-by: Phillip Lougher Cc: Jeff Mahoney Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/btree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 3ebc437736fe..1cbdeea1db44 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -46,11 +46,26 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke case HFS_EXT_CNID: hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize, mdb->drXTFlSize, be32_to_cpu(mdb->drXTClpSiz)); + if (HFS_I(tree->inode)->alloc_blocks > + HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; case HFS_CAT_CNID: hfs_inode_read_fork(tree->inode, mdb->drCTExtRec, mdb->drCTFlSize, mdb->drCTFlSize, be32_to_cpu(mdb->drCTClpSiz)); + + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records " + "(0 size).\n"); + unlock_new_inode(tree->inode); + goto free_inode; + } + tree->inode->i_mapping->a_ops = &hfs_btree_aops; break; default: @@ -59,11 +74,6 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); - if (!HFS_I(tree->inode)->first_blocks) { - printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); - goto free_inode; - } - mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) -- cgit v1.2.3 From 33ef6b6984403a688189317ef46bb3caab3b70e0 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 2 Nov 2011 13:38:05 -0700 Subject: cgroups: more safe tasklist locking in cgroup_attach_proc Fix unstable tasklist locking in cgroup_attach_proc. According to this thread - https://lkml.org/lkml/2011/7/27/243 - RCU is not sufficient to guarantee the tasklist is stable w.r.t. de_thread and exit. Taking tasklist_lock for reading, instead of rcu_read_lock, ensures proper exclusion. Signed-off-by: Ben Blum Acked-by: Paul Menage Cc: Oleg Nesterov Cc: Frederic Weisbecker Cc: "Paul E. McKenney" Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 453100a4159d..64b0e73402df 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) goto out_free_group_list; /* prevent changes to the threadgroup list while we take a snapshot. */ - rcu_read_lock(); + read_lock(&tasklist_lock); if (!thread_group_leader(leader)) { /* * a race with de_thread from another thread's exec() may strip @@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * throw this task away and try again (from cgroup_procs_write); * this is "double-double-toil-and-trouble-check locking". */ - rcu_read_unlock(); + read_unlock(&tasklist_lock); retval = -EAGAIN; goto out_free_group_list; } @@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) } while_each_thread(leader, tsk); /* remember the number of threads in the array for later. */ group_size = i; - rcu_read_unlock(); + read_unlock(&tasklist_lock); /* * step 1: check that we can legitimately attach to the cgroup. -- cgit v1.2.3 From 77ceab8ea590d7dc6c8f055ce43dfebd74428107 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 2 Nov 2011 13:38:07 -0700 Subject: cgroups: don't attach task to subsystem if migration failed If a task has exited to the point it has called cgroup_exit() already, then we can't migrate it to another cgroup anymore. This can happen when we are attaching a task to a new cgroup between the call to ->can_attach_task() on subsystems and the migration that is eventually tried in cgroup_task_migrate(). In this case cgroup_task_migrate() returns -ESRCH and we don't want to attach the task to the subsystems because the attachment to the new cgroup itself failed. Fix this by only calling ->attach_task() on the subsystems if the cgroup migration succeeded. Reported-by: Oleg Nesterov Signed-off-by: Ben Blum Acked-by: Paul Menage Cc: Li Zefan Cc: Tejun Heo Signed-off-by: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 64b0e73402df..8386b21224ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) continue; - /* attach each task to each subsystem */ - for_each_subsys(root, ss) { - if (ss->attach_task) - ss->attach_task(cgrp, tsk); - } /* if the thread is PF_EXITING, it can just get skipped. */ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); - BUG_ON(retval != 0 && retval != -ESRCH); + if (retval == 0) { + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + } else { + BUG_ON(retval != -ESRCH); + } } /* nothing is sensitive to fork() after this point. */ -- cgit v1.2.3 From ff7ee93f47151e23601856e7eb5510babf956571 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Nov 2011 13:38:11 -0700 Subject: cgroup/kmemleak: Annotate alloc_page() for cgroup allocations When the cgroup base was allocated with kmalloc, it was necessary to annotate the variable with kmemleak_not_leak(). But because it has recently been changed to be allocated with alloc_page() (which skips kmemleak checks) causes a warning on boot up. I was triggering this output: allocated 8388608 bytes of page_cgroup please try 'cgroup_disable=memory' option if you don't want memory cgroups kmemleak: Trying to color unknown object at 0xf5840000 as Grey Pid: 0, comm: swapper Not tainted 3.0.0-test #12 Call Trace: [] ? printk+0x1d/0x1f^M [] paint_ptr+0x4f/0x78 [] kmemleak_not_leak+0x58/0x7d [] ? __rcu_read_unlock+0x9/0x7d [] kmemleak_init+0x19d/0x1e9 [] start_kernel+0x346/0x3ec [] ? loglevel+0x18/0x18 [] i386_start_kernel+0xaa/0xb0 After a bit of debugging I tracked the object 0xf840000 (and others) down to the cgroup code. The change from allocating base with kmalloc to alloc_page() has the base not calling kmemleak_alloc() which adds the pointer to the object_tree_root, but kmemleak_not_leak() adds it to the crt_early_log[] table. On kmemleak_init(), the entry is found in the early_log[] but not the object_tree_root, and this error message is displayed. If alloc_page() fails then it defaults back to vmalloc() which still uses the kmemleak_alloc() which makes us still need the kmemleak_not_leak() call. The solution is to call the kmemleak_alloc() directly if the alloc_page() succeeds. Reviewed-by: Michal Hocko Signed-off-by: Steven Rostedt Acked-by: Catalin Marinas Signed-off-by: Jonathan Nieder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6bdc67dbbc28..3749ae15a8c8 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc) static void *__meminit alloc_page_cgroup(size_t size, int nid) { void *addr = NULL; + gfp_t flags = GFP_KERNEL | __GFP_NOWARN; - addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); - if (addr) + addr = alloc_pages_exact_nid(nid, size, flags); + if (addr) { + kmemleak_alloc(addr, size, 1, flags); return addr; + } if (node_state(nid, N_HIGH_MEMORY)) addr = vmalloc_node(size, nid); -- cgit