From 79bd9814e5ec9a288d6599f53aeac0b548fdfe52 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 Nov 2013 18:20:42 -0500
Subject: cgroup, memcg: move cgroup_event implementation to memcg

cgroup_event is way over-designed and tries to build a generic
flexible event mechanism into cgroup - fully customizable event
specification for each user of the interface.  This is utterly
unnecessary and overboard especially in the light of the planned
unified hierarchy as there's gonna be single agent.  Simply generating
events at fixed points, or if that's too restrictive, configureable
cadence or single set of configureable points should be enough.

Thankfully, memcg is the only user and gets to keep it.  Replacing it
with something simpler on sane_behavior is strongly recommended.

This patch moves cgroup_event and "cgroup.event_control"
implementation to mm/memcontrol.c.  Clearing of events on cgroup
destruction is moved from cgroup_destroy_locked() to
mem_cgroup_css_offline(), which shouldn't make any noticeable
difference.

cgroup_css() and __file_cft() are exported to enable the move;
however, this will soon be reverted once the event code is updated to
be memcg specific.

Note that "cgroup.event_control" will now exist only on the hierarchy
with memcg attached to it.  While this change is visible to userland,
it is unlikely to be noticeable as the file has never been meaningful
outside memcg.

Aside from the above change, this is pure code relocation.

v2: Per Li Zefan's comments, init/Kconfig updated accordingly and
    poll.h inclusion moved from cgroup.c to memcontrol.c.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
---
 kernel/cgroup.c | 253 +-------------------------------------------------------
 1 file changed, 3 insertions(+), 250 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8bd9cfdc70d7..4bccaa7dda35 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -56,11 +56,8 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
-#include <linux/file.h>
 
 #include <linux/atomic.h>
 
@@ -156,36 +153,6 @@ struct css_id {
 	unsigned short stack[0]; /* Array of Length (depth+1) */
 };
 
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
-	/*
-	 * css which the event belongs to.
-	 */
-	struct cgroup_subsys_state *css;
-	/*
-	 * Control file which the event associated.
-	 */
-	struct cftype *cft;
-	/*
-	 * eventfd to signal userspace about the event.
-	 */
-	struct eventfd_ctx *eventfd;
-	/*
-	 * Each of these stored in a list by the cgroup.
-	 */
-	struct list_head list;
-	/*
-	 * All fields below needed to unregister event when
-	 * userspace closes eventfd.
-	 */
-	poll_table pt;
-	wait_queue_head_t *wqh;
-	wait_queue_t wait;
-	struct work_struct remove;
-};
-
 /* The list of hierarchy roots */
 
 static LIST_HEAD(cgroup_roots);
@@ -235,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
  * keep accessing it outside the said locks.  This function may return
  * %NULL if @cgrp doesn't have @subsys_id enabled.
  */
-static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
-					      struct cgroup_subsys *ss)
+struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+				       struct cgroup_subsys *ss)
 {
 	if (ss)
 		return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
@@ -2663,7 +2630,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 /*
  * Check if a file is a control file
  */
-static inline struct cftype *__file_cft(struct file *file)
+struct cftype *__file_cft(struct file *file)
 {
 	if (file_inode(file)->i_fop != &cgroup_file_operations)
 		return ERR_PTR(-EINVAL);
@@ -3949,202 +3916,6 @@ static void cgroup_dput(struct cgroup *cgrp)
 	deactivate_super(sb);
 }
 
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void cgroup_event_remove(struct work_struct *work)
-{
-	struct cgroup_event *event = container_of(work, struct cgroup_event,
-			remove);
-	struct cgroup_subsys_state *css = event->css;
-
-	remove_wait_queue(event->wqh, &event->wait);
-
-	event->cft->unregister_event(css, event->cft, event->eventfd);
-
-	/* Notify userspace the event is going away. */
-	eventfd_signal(event->eventfd, 1);
-
-	eventfd_ctx_put(event->eventfd);
-	kfree(event);
-	css_put(css);
-}
-
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
-		int sync, void *key)
-{
-	struct cgroup_event *event = container_of(wait,
-			struct cgroup_event, wait);
-	struct cgroup *cgrp = event->css->cgroup;
-	unsigned long flags = (unsigned long)key;
-
-	if (flags & POLLHUP) {
-		/*
-		 * If the event has been detached at cgroup removal, we
-		 * can simply return knowing the other side will cleanup
-		 * for us.
-		 *
-		 * We can't race against event freeing since the other
-		 * side will require wqh->lock via remove_wait_queue(),
-		 * which we hold.
-		 */
-		spin_lock(&cgrp->event_list_lock);
-		if (!list_empty(&event->list)) {
-			list_del_init(&event->list);
-			/*
-			 * We are in atomic context, but cgroup_event_remove()
-			 * may sleep, so we have to call it in workqueue.
-			 */
-			schedule_work(&event->remove);
-		}
-		spin_unlock(&cgrp->event_list_lock);
-	}
-
-	return 0;
-}
-
-static void cgroup_event_ptable_queue_proc(struct file *file,
-		wait_queue_head_t *wqh, poll_table *pt)
-{
-	struct cgroup_event *event = container_of(pt,
-			struct cgroup_event, pt);
-
-	event->wqh = wqh;
-	add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
-				      struct cftype *cft, const char *buffer)
-{
-	struct cgroup *cgrp = dummy_css->cgroup;
-	struct cgroup_event *event;
-	struct cgroup_subsys_state *cfile_css;
-	unsigned int efd, cfd;
-	struct fd efile;
-	struct fd cfile;
-	char *endp;
-	int ret;
-
-	efd = simple_strtoul(buffer, &endp, 10);
-	if (*endp != ' ')
-		return -EINVAL;
-	buffer = endp + 1;
-
-	cfd = simple_strtoul(buffer, &endp, 10);
-	if ((*endp != ' ') && (*endp != '\0'))
-		return -EINVAL;
-	buffer = endp + 1;
-
-	event = kzalloc(sizeof(*event), GFP_KERNEL);
-	if (!event)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&event->list);
-	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
-	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
-	INIT_WORK(&event->remove, cgroup_event_remove);
-
-	efile = fdget(efd);
-	if (!efile.file) {
-		ret = -EBADF;
-		goto out_kfree;
-	}
-
-	event->eventfd = eventfd_ctx_fileget(efile.file);
-	if (IS_ERR(event->eventfd)) {
-		ret = PTR_ERR(event->eventfd);
-		goto out_put_efile;
-	}
-
-	cfile = fdget(cfd);
-	if (!cfile.file) {
-		ret = -EBADF;
-		goto out_put_eventfd;
-	}
-
-	/* the process need read permission on control file */
-	/* AV: shouldn't we check that it's been opened for read instead? */
-	ret = inode_permission(file_inode(cfile.file), MAY_READ);
-	if (ret < 0)
-		goto out_put_cfile;
-
-	event->cft = __file_cft(cfile.file);
-	if (IS_ERR(event->cft)) {
-		ret = PTR_ERR(event->cft);
-		goto out_put_cfile;
-	}
-
-	if (!event->cft->ss) {
-		ret = -EBADF;
-		goto out_put_cfile;
-	}
-
-	/*
-	 * Determine the css of @cfile, verify it belongs to the same
-	 * cgroup as cgroup.event_control, and associate @event with it.
-	 * Remaining events are automatically removed on cgroup destruction
-	 * but the removal is asynchronous, so take an extra ref.
-	 */
-	rcu_read_lock();
-
-	ret = -EINVAL;
-	event->css = cgroup_css(cgrp, event->cft->ss);
-	cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
-	if (event->css && event->css == cfile_css && css_tryget(event->css))
-		ret = 0;
-
-	rcu_read_unlock();
-	if (ret)
-		goto out_put_cfile;
-
-	if (!event->cft->register_event || !event->cft->unregister_event) {
-		ret = -EINVAL;
-		goto out_put_css;
-	}
-
-	ret = event->cft->register_event(event->css, event->cft,
-			event->eventfd, buffer);
-	if (ret)
-		goto out_put_css;
-
-	efile.file->f_op->poll(efile.file, &event->pt);
-
-	spin_lock(&cgrp->event_list_lock);
-	list_add(&event->list, &cgrp->event_list);
-	spin_unlock(&cgrp->event_list_lock);
-
-	fdput(cfile);
-	fdput(efile);
-
-	return 0;
-
-out_put_css:
-	css_put(event->css);
-out_put_cfile:
-	fdput(cfile);
-out_put_eventfd:
-	eventfd_ctx_put(event->eventfd);
-out_put_efile:
-	fdput(efile);
-out_kfree:
-	kfree(event);
-
-	return ret;
-}
-
 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
 				      struct cftype *cft)
 {
@@ -4169,11 +3940,6 @@ static struct cftype cgroup_base_files[] = {
 		.release = cgroup_pidlist_release,
 		.mode = S_IRUGO | S_IWUSR,
 	},
-	{
-		.name = "cgroup.event_control",
-		.write_string = cgroup_write_event_control,
-		.mode = S_IWUGO,
-	},
 	{
 		.name = "cgroup.clone_children",
 		.flags = CFTYPE_INSANE,
@@ -4666,7 +4432,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct dentry *d = cgrp->dentry;
-	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
 	struct cgroup *child;
 	bool empty;
@@ -4741,18 +4506,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	dget(d);
 	cgroup_d_remove_dir(d);
 
-	/*
-	 * Unregister events and notify userspace.
-	 * Notify userspace about cgroup removing only after rmdir of cgroup
-	 * directory to avoid race between userspace and kernelspace.
-	 */
-	spin_lock(&cgrp->event_list_lock);
-	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-		list_del_init(&event->list);
-		schedule_work(&event->remove);
-	}
-	spin_unlock(&cgrp->event_list_lock);
-
 	return 0;
 };
 
-- 
cgit v1.2.3


From fba94807837850e211f8975e1970e23e7804ff4d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 Nov 2013 18:20:43 -0500
Subject: cgroup, memcg: move cgroup->event_list[_lock] and event callbacks
 into memcg

cgroup_event is being moved from cgroup core to memcg and the
implementation is already moved by the previous patch.  This patch
moves the data fields and callbacks.

* cgroup->event_list[_lock] are moved to mem_cgroup.

* cftype->[un]register_event() are moved to cgroup_event.  This makes
  it impossible for individual cftype definitions to specify their
  event callbacks.  This is worked around by simply hard-coding
  filename to event callback mapping in cgroup_write_event_control().
  This is awkward and inflexible, which is actually desirable given
  that we don't want to grow more usages of this feature.

* eventfd_ctx declaration is removed from cgroup.h, which makes
  vmpressure.h miss eventfd_ctx declaration.  Include eventfd.h from
  vmpressure.h.

v2: Use file name from dentry instead of cftype.  This will allow
    removing all cftype handling in the function.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Balbir Singh <bsingharora@gmail.com>
---
 kernel/cgroup.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4bccaa7dda35..feda7c54fa6b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1352,8 +1352,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
 	cgrp->dummy_css.cgroup = cgrp;
-	INIT_LIST_HEAD(&cgrp->event_list);
-	spin_lock_init(&cgrp->event_list_lock);
 	simple_xattrs_init(&cgrp->xattrs);
 }
 
-- 
cgit v1.2.3


From b36824c75c7855585d6476eef2b234f6e0e68872 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 Nov 2013 18:20:44 -0500
Subject: cgroup: unexport cgroup_css() and remove __file_cft()

Now that cgroup_event is made memcg specific, the temporarily exported
functions are no longer necessary.  Unexport cgroup_css() and remove
__file_cft() which doesn't have any user left.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 kernel/cgroup.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index feda7c54fa6b..c0248e16461d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -202,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
  * keep accessing it outside the said locks.  This function may return
  * %NULL if @cgrp doesn't have @subsys_id enabled.
  */
-struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
-				       struct cgroup_subsys *ss)
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss)
 {
 	if (ss)
 		return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
@@ -2625,16 +2625,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 	.removexattr = cgroup_removexattr,
 };
 
-/*
- * Check if a file is a control file
- */
-struct cftype *__file_cft(struct file *file)
-{
-	if (file_inode(file)->i_fop != &cgroup_file_operations)
-		return ERR_PTR(-EINVAL);
-	return __d_cft(file->f_dentry);
-}
-
 static int cgroup_create_file(struct dentry *dentry, umode_t mode,
 				struct super_block *sb)
 {
-- 
cgit v1.2.3


From 4f024f3797c43cb4b73cd2c50cec728842d0e49e Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kmo@daterainc.com>
Date: Fri, 11 Oct 2013 15:44:27 -0700
Subject: block: Abstract out bvec iterator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Immutable biovecs are going to require an explicit iterator. To
implement immutable bvecs, a later patch is going to add a bi_bvec_done
member to this struct; for now, this patch effectively just renames
things.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Ed L. Cashin" <ecashin@coraid.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Lars Ellenberg <drbd-dev@lists.linbit.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Yehuda Sadeh <yehuda@inktank.com>
Cc: Sage Weil <sage@inktank.com>
Cc: Alex Elder <elder@inktank.com>
Cc: ceph-devel@vger.kernel.org
Cc: Joshua Morris <josh.h.morris@us.ibm.com>
Cc: Philip Kelleher <pjk1939@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Neil Brown <neilb@suse.de>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: dm-devel@redhat.com
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: linux390@de.ibm.com
Cc: Boaz Harrosh <bharrosh@panasas.com>
Cc: Benny Halevy <bhalevy@tonian.com>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <chris.mason@fusionio.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jaegeuk Kim <jaegeuk.kim@samsung.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Dave Kleikamp <shaggy@kernel.org>
Cc: Joern Engel <joern@logfs.org>
Cc: Prasad Joshi <prasadjoshi.linux@gmail.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Ben Myers <bpm@sgi.com>
Cc: xfs@oss.sgi.com
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Herton Ronaldo Krzesinski <herton.krzesinski@canonical.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Guo Chao <yan@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Asai Thambi S P <asamymuthupa@micron.com>
Cc: Selvan Mani <smani@micron.com>
Cc: Sam Bradshaw <sbradshaw@micron.com>
Cc: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Cc: "Roger Pau Monné" <roger.pau@citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Ian Campbell <Ian.Campbell@citrix.com>
Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Jerome Marchand <jmarchand@redhat.com>
Cc: Joe Perches <joe@perches.com>
Cc: Peng Tao <tao.peng@emc.com>
Cc: Andy Adamson <andros@netapp.com>
Cc: fanchaoting <fanchaoting@cn.fujitsu.com>
Cc: Jie Liu <jeff.liu@oracle.com>
Cc: Sunil Mushran <sunil.mushran@gmail.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Namjae Jeon <namjae.jeon@samsung.com>
Cc: Pankaj Kumar <pankaj.km@samsung.com>
Cc: Dan Magenheimer <dan.magenheimer@oracle.com>
Cc: Mel Gorman <mgorman@suse.de>6
---
 kernel/power/block_io.c |  2 +-
 kernel/trace/blktrace.c | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index d09dd10c5a5e..9a58bc258810 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -32,7 +32,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector,
 	struct bio *bio;
 
 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
-	bio->bi_sector = sector;
+	bio->bi_iter.bi_sector = sector;
 	bio->bi_bdev = bdev;
 	bio->bi_end_io = end_swap_bio_read;
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f785aef65799..b418cb0d7242 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -781,8 +781,8 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 	if (!error && !bio_flagged(bio, BIO_UPTODATE))
 		error = EIO;
 
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
-			error, 0, NULL);
+	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
+			bio->bi_rw, what, error, 0, NULL);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
@@ -885,8 +885,9 @@ static void blk_add_trace_split(void *ignore,
 	if (bt) {
 		__be64 rpdu = cpu_to_be64(pdu);
 
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
-				BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+		__blk_add_trace(bt, bio->bi_iter.bi_sector,
+				bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
+				!bio_flagged(bio, BIO_UPTODATE),
 				sizeof(rpdu), &rpdu);
 	}
 }
@@ -918,9 +919,9 @@ static void blk_add_trace_bio_remap(void *ignore,
 	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
 	r.sector_from = cpu_to_be64(from);
 
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
-			BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
-			sizeof(r), &r);
+	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
+			bio->bi_rw, BLK_TA_REMAP,
+			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
 }
 
 /**
-- 
cgit v1.2.3


From ac1e69aa78e2e799a8d5475595ba744bdf29b597 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:58 -0500
Subject: cgroup: don't skip seq_open on write only opens on pidlist files

Currently, cgroup_pidlist_open() skips seq_open() and pidlist loading
if the file is opened write-only, which is a sensible optimization as
pidlist loading can be costly and there often are occasions where
tasks or cgroup.procs is opened write-only.  However, pidlist init and
release are planned to be moved to cgroup_pidlist_start/stop()
respectively which would make this optimization unnecessary.

This patch removes the optimization and always fully initializes
pidlist files regardless of open mode.  This will help moving pidlist
handling to start/stop by unifying rw paths and removes the need for
specifying cftype->release() in addition to .release in
cgroup_pidlist_operations as file->f_op is now always overridden.  As
pidlist files were the only user of cftype->release(), the next patch
will remove the method.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 34fd1be0d9bf..1e09ded0387a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3781,12 +3781,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 static int cgroup_pidlist_release(struct inode *inode, struct file *file)
 {
 	struct cgroup_pidlist *l;
-	if (!(file->f_mode & FMODE_READ))
-		return 0;
-	/*
-	 * the seq_file will only be initialized if the file was opened for
-	 * reading; hence we check if it's not null only in that case.
-	 */
+
 	l = ((struct seq_file *)file->private_data)->private;
 	cgroup_release_pid_array(l);
 	return seq_release(inode, file);
@@ -3811,10 +3806,6 @@ static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 	struct cgroup_pidlist *l;
 	int retval;
 
-	/* Nothing to do for write-only files */
-	if (!(file->f_mode & FMODE_READ))
-		return 0;
-
 	/* have the array populated */
 	retval = pidlist_array_load(cgrp, type, &l);
 	if (retval)
@@ -3894,7 +3885,6 @@ static struct cftype cgroup_base_files[] = {
 		.name = "cgroup.procs",
 		.open = cgroup_procs_open,
 		.write_u64 = cgroup_procs_write,
-		.release = cgroup_pidlist_release,
 		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
@@ -3919,7 +3909,6 @@ static struct cftype cgroup_base_files[] = {
 		.flags = CFTYPE_INSANE,		/* use "procs" instead */
 		.open = cgroup_tasks_open,
 		.write_u64 = cgroup_tasks_write,
-		.release = cgroup_pidlist_release,
 		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
-- 
cgit v1.2.3


From b9f3cecaba592d4e98cd155c91b1414323ed51e8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:58 -0500
Subject: cgroup: remove cftype->release()

Now that pidlist files don't use cftype->release(), it doesn't have
any user left.  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1e09ded0387a..b59e3453eae7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2448,12 +2448,9 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
 static int cgroup_file_release(struct inode *inode, struct file *file)
 {
 	struct cfent *cfe = __d_cfe(file->f_dentry);
-	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup_subsys_state *css = cfe->css;
 	int ret = 0;
 
-	if (cft->release)
-		ret = cft->release(inode, file);
 	if (css->ss)
 		css_put(css);
 	if (file->f_op == &cgroup_seqfile_operations)
-- 
cgit v1.2.3


From b1a21367314f36a819c0676e0999f34db12ee6ed Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:58 -0500
Subject: cgroup: implement delayed destruction for cgroup_pidlist

Currently, pidlists are reference counted from file open and release
methods.  This means that holding onto an open file may waste memory
and reads may return data which is very stale.  Both aren't critical
because pidlists are keyed and shared per namespace and, well, the
user isn't supposed to have large delay between open and reads.

cgroup is planned to be converted to use kernfs and it'd be best if we
can stick to just the seq_file operations - start, next, stop and
show.  This can be achieved by loading pidlist on demand from start
and release with time delay from stop, so that consecutive reads don't
end up reloading the pidlist on each iteration.  This would remove the
need for hooking into open and release while also avoiding issues with
holding onto pidlist for too long.

This patch implements delayed release of pidlist.  As pidlists could
be lingering on cgroup removal waiting for the timer to expire, cgroup
free path needs to queue the destruction work item immediately and
flush.  As those work items are self-destroying, each work item can't
be flushed directly.  A new workqueue - cgroup_pidlist_destroy_wq - is
added to serve as flush domain.

Note that this patch just adds delayed release on top of the current
implementation and doesn't change where pidlist is loaded and
released.  Following patches will make those changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 102 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 77 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b59e3453eae7..acdcddf8ab82 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -61,6 +61,14 @@
 
 #include <linux/atomic.h>
 
+/*
+ * pidlists linger the following amount before being destroyed.  The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY	HZ
+
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -94,6 +102,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  */
 static struct workqueue_struct *cgroup_destroy_wq;
 
+/*
+ * pidlist destructions need to be flushed on cgroup destruction.  Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
 /*
  * Generate an array of cgroup subsystem pointers. At boot time, this is
  * populated with the built in subsystems, and modular subsystems are
@@ -167,6 +181,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 static int cgroup_file_release(struct inode *inode, struct file *file);
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
@@ -830,11 +845,7 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	deactivate_super(cgrp->root->sb);
 
-	/*
-	 * if we're getting rid of the cgroup, refcount should ensure
-	 * that there are no pidlists left.
-	 */
-	BUG_ON(!list_empty(&cgrp->pidlists));
+	cgroup_pidlist_destroy_all(cgrp);
 
 	simple_xattrs_free(&cgrp->xattrs);
 
@@ -2449,13 +2460,12 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
 {
 	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cgroup_subsys_state *css = cfe->css;
-	int ret = 0;
 
 	if (css->ss)
 		css_put(css);
 	if (file->f_op == &cgroup_seqfile_operations)
 		single_release(inode, file);
-	return ret;
+	return 0;
 }
 
 /*
@@ -3454,6 +3464,8 @@ struct cgroup_pidlist {
 	struct cgroup *owner;
 	/* protects the other fields */
 	struct rw_semaphore rwsem;
+	/* for delayed destruction */
+	struct delayed_work destroy_dwork;
 };
 
 /*
@@ -3469,6 +3481,7 @@ static void *pidlist_allocate(int count)
 	else
 		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
 }
+
 static void pidlist_free(void *p)
 {
 	if (is_vmalloc_addr(p))
@@ -3477,6 +3490,49 @@ static void pidlist_free(void *p)
 		kfree(p);
 }
 
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer.  None
+ * should be left afterwards.
+ */
+static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
+{
+	struct cgroup_pidlist *l, *tmp_l;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+	mutex_unlock(&cgrp->pidlist_mutex);
+
+	flush_workqueue(cgroup_pidlist_destroy_wq);
+	BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+						destroy_dwork);
+	struct cgroup_pidlist *tofree = NULL;
+
+	mutex_lock(&l->owner->pidlist_mutex);
+	down_write(&l->rwsem);
+
+	/*
+	 * Destroy iff we didn't race with a new user or get queued again.
+	 * Queued state won't change as it can only be queued while locked.
+	 */
+	if (!l->use_count && !delayed_work_pending(dwork)) {
+		list_del(&l->links);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		tofree = l;
+	}
+
+	up_write(&l->rwsem);
+	mutex_unlock(&l->owner->pidlist_mutex);
+	kfree(tofree);
+}
+
 /*
  * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
  * Returns the number of unique elements.
@@ -3547,6 +3603,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 		return l;
 	}
 	init_rwsem(&l->rwsem);
+	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
 	down_write(&l->rwsem);
 	l->key.type = type;
 	l->key.ns = get_pid_ns(ns);
@@ -3752,26 +3809,12 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
 
 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
-	/*
-	 * the case where we're the last user of this particular pidlist will
-	 * have us remove it from the cgroup's list, which entails taking the
-	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
-	 * pidlist_mutex, we have to take pidlist_mutex first.
-	 */
-	mutex_lock(&l->owner->pidlist_mutex);
 	down_write(&l->rwsem);
 	BUG_ON(!l->use_count);
-	if (!--l->use_count) {
-		/* we're the last user if refcount is 0; remove and free */
-		list_del(&l->links);
-		mutex_unlock(&l->owner->pidlist_mutex);
-		pidlist_free(l->list);
-		put_pid_ns(l->key.ns);
-		up_write(&l->rwsem);
-		kfree(l);
-		return;
-	}
-	mutex_unlock(&l->owner->pidlist_mutex);
+	/* if the last user, arm the destroy work */
+	if (!--l->use_count)
+		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+				 CGROUP_PIDLIST_DESTROY_DELAY);
 	up_write(&l->rwsem);
 }
 
@@ -4813,6 +4856,15 @@ static int __init cgroup_wq_init(void)
 	 */
 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
 	BUG_ON(!cgroup_destroy_wq);
+
+	/*
+	 * Used to destroy pidlists and separate to serve as flush domain.
+	 * Cap @max_active to 1 too.
+	 */
+	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+						    0, 1);
+	BUG_ON(!cgroup_pidlist_destroy_wq);
+
 	return 0;
 }
 core_initcall(cgroup_wq_init);
-- 
cgit v1.2.3


From 62236858f3cc558135d1ab6b2af44d22c489bb7f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:58 -0500
Subject: cgroup: introduce struct cgroup_pidlist_open_file

For pidlist files, seq_file->private pointed to the loaded
cgroup_pidlist; however, pidlist loading is planned to be moved to
cgroup_pidlist_start() for kernfs conversion and seq_file->private
needs to carry more information from open to allow that.

This patch introduces struct cgroup_pidlist_open_file which contains
type, cgrp and pidlist and updates pidlist seq_file->private to point
to it using seq_open_private() and seq_release_private().  Note that
this eventually will be replaced by kernfs_open_file.

While this patch makes more information available to seq_file
operations, they don't use it yet and this patch doesn't introduce any
behavior changes except for allocation of the extra private struct.

v2: use __seq_open_private() instead of seq_open_private() for brevity
    as suggested by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index acdcddf8ab82..ef019c075d5d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3468,6 +3468,13 @@ struct cgroup_pidlist {
 	struct delayed_work destroy_dwork;
 };
 
+/* seq_file->private points to the following */
+struct cgroup_pidlist_open_file {
+	enum cgroup_filetype		type;
+	struct cgroup			*cgrp;
+	struct cgroup_pidlist		*pidlist;
+};
+
 /*
  * The following two functions "fix" the issue where there are more pids
  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3739,7 +3746,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 	 * after a seek to the start). Use a binary-search to find the
 	 * next pid to display, if any
 	 */
-	struct cgroup_pidlist *l = s->private;
+	struct cgroup_pidlist_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->pidlist;
 	int index = 0, pid = *pos;
 	int *iter;
 
@@ -3769,13 +3777,15 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 
 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
-	struct cgroup_pidlist *l = s->private;
-	up_read(&l->rwsem);
+	struct cgroup_pidlist_open_file *of = s->private;
+
+	up_read(&of->pidlist->rwsem);
 }
 
 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 {
-	struct cgroup_pidlist *l = s->private;
+	struct cgroup_pidlist_open_file *of = s->private;
+	struct cgroup_pidlist *l = of->pidlist;
 	pid_t *p = v;
 	pid_t *end = l->list + l->length;
 	/*
@@ -3820,11 +3830,11 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 
 static int cgroup_pidlist_release(struct inode *inode, struct file *file)
 {
-	struct cgroup_pidlist *l;
+	struct cgroup_pidlist_open_file *of;
 
-	l = ((struct seq_file *)file->private_data)->private;
-	cgroup_release_pid_array(l);
-	return seq_release(inode, file);
+	of = ((struct seq_file *)file->private_data)->private;
+	cgroup_release_pid_array(of->pidlist);
+	return seq_release_private(inode, file);
 }
 
 static const struct file_operations cgroup_pidlist_operations = {
@@ -3843,6 +3853,7 @@ static const struct file_operations cgroup_pidlist_operations = {
 static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+	struct cgroup_pidlist_open_file *of;
 	struct cgroup_pidlist *l;
 	int retval;
 
@@ -3853,12 +3864,16 @@ static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 	/* configure file information */
 	file->f_op = &cgroup_pidlist_operations;
 
-	retval = seq_open(file, &cgroup_pidlist_seq_operations);
-	if (retval) {
+	of = __seq_open_private(file, &cgroup_pidlist_seq_operations,
+				sizeof(*of));
+	if (!of) {
 		cgroup_release_pid_array(l);
-		return retval;
+		return -ENOMEM;
 	}
-	((struct seq_file *)file->private_data)->private = l;
+
+	of->type = type;
+	of->cgrp = cgrp;
+	of->pidlist = l;
 	return 0;
 }
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
-- 
cgit v1.2.3


From e6b817103d168a76e4044ebcdbc08225d77a81cb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:59 -0500
Subject: cgroup: refactor cgroup_pidlist_find()

Rename cgroup_pidlist_find() to cgroup_pidlist_find_create() and
separate out finding proper to cgroup_pidlist_find().  Also, move
locking to the caller.

This patch is preparation for pidlist restructure and doesn't
introduce any behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 65 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ef019c075d5d..d58c30d3b828 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3575,48 +3575,50 @@ static int cmppid(const void *a, const void *b)
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = task_active_pid_ns(current);
+
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	list_for_each_entry(l, &cgrp->pidlists, links)
+		if (l->key.type == type && l->key.ns == ns)
+			return l;
+	return NULL;
+}
+
 /*
  * find the appropriate pidlist for our purpose (given procs vs tasks)
  * returns with the lock on that pidlist already held, and takes care
  * of the use count, or returns NULL with no locks held if we're out of
  * memory.
  */
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
-						  enum cgroup_filetype type)
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+						enum cgroup_filetype type)
 {
 	struct cgroup_pidlist *l;
-	/* don't need task_nsproxy() if we're looking at ourself */
-	struct pid_namespace *ns = task_active_pid_ns(current);
 
-	/*
-	 * We can't drop the pidlist_mutex before taking the l->rwsem in case
-	 * the last ref-holder is trying to remove l from the list at the same
-	 * time. Holding the pidlist_mutex precludes somebody taking whichever
-	 * list we find out from under us - compare release_pid_array().
-	 */
-	mutex_lock(&cgrp->pidlist_mutex);
-	list_for_each_entry(l, &cgrp->pidlists, links) {
-		if (l->key.type == type && l->key.ns == ns) {
-			/* make sure l doesn't vanish out from under us */
-			down_write(&l->rwsem);
-			mutex_unlock(&cgrp->pidlist_mutex);
-			return l;
-		}
-	}
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
+	l = cgroup_pidlist_find(cgrp, type);
+	if (l)
+		return l;
+
 	/* entry not found; create a new one */
 	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
-	if (!l) {
-		mutex_unlock(&cgrp->pidlist_mutex);
+	if (!l)
 		return l;
-	}
+
 	init_rwsem(&l->rwsem);
 	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
-	down_write(&l->rwsem);
 	l->key.type = type;
-	l->key.ns = get_pid_ns(ns);
+	/* don't need task_nsproxy() if we're looking at ourself */
+	l->key.ns = get_pid_ns(task_active_pid_ns(current));
 	l->owner = cgrp;
 	list_add(&l->links, &cgrp->pidlists);
-	mutex_unlock(&cgrp->pidlist_mutex);
 	return l;
 }
 
@@ -3662,17 +3664,26 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	sort(array, length, sizeof(pid_t), cmppid, NULL);
 	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(array, length);
-	l = cgroup_pidlist_find(cgrp, type);
+
+	mutex_lock(&cgrp->pidlist_mutex);
+
+	l = cgroup_pidlist_find_create(cgrp, type);
 	if (!l) {
+		mutex_unlock(&cgrp->pidlist_mutex);
 		pidlist_free(array);
 		return -ENOMEM;
 	}
-	/* store array, freeing old if necessary - lock already held */
+
+	/* store array, freeing old if necessary */
+	down_write(&l->rwsem);
 	pidlist_free(l->list);
 	l->list = array;
 	l->length = length;
 	l->use_count++;
 	up_write(&l->rwsem);
+
+	mutex_unlock(&cgrp->pidlist_mutex);
+
 	*lp = l;
 	return 0;
 }
-- 
cgit v1.2.3


From 069df3b7aeb3f4e926c4da9630c92010909af512 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:59 -0500
Subject: cgroup: remove cgroup_pidlist->rwsem

cgroup_pidlist locking is needlessly complicated.  It has outer
cgroup->pidlist_mutex to protect the list of pidlists associated with
a cgroup and then each pidlist has rwsem to synchronize updates and
reads.  Given that the only read access is from seq_file operations
which are always invoked back-to-back, the rwsem is a giant overkill.
All it does is adding unnecessary complexity.

This patch removes cgroup_pidlist->rwsem and protects all accesses to
pidlists belonging to a cgroup with cgroup->pidlist_mutex.
pidlist->rwsem locking is removed if it's nested inside
cgroup->pidlist_mutex; otherwise, it's replaced with
cgroup->pidlist_mutex locking.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d58c30d3b828..dc39e1774542 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3462,8 +3462,6 @@ struct cgroup_pidlist {
 	struct list_head links;
 	/* pointer to the cgroup we belong to, for list removal purposes */
 	struct cgroup *owner;
-	/* protects the other fields */
-	struct rw_semaphore rwsem;
 	/* for delayed destruction */
 	struct delayed_work destroy_dwork;
 };
@@ -3522,7 +3520,6 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
 	struct cgroup_pidlist *tofree = NULL;
 
 	mutex_lock(&l->owner->pidlist_mutex);
-	down_write(&l->rwsem);
 
 	/*
 	 * Destroy iff we didn't race with a new user or get queued again.
@@ -3535,7 +3532,6 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
 		tofree = l;
 	}
 
-	up_write(&l->rwsem);
 	mutex_unlock(&l->owner->pidlist_mutex);
 	kfree(tofree);
 }
@@ -3612,7 +3608,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
 	if (!l)
 		return l;
 
-	init_rwsem(&l->rwsem);
 	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
 	l->key.type = type;
 	/* don't need task_nsproxy() if we're looking at ourself */
@@ -3675,12 +3670,10 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	}
 
 	/* store array, freeing old if necessary */
-	down_write(&l->rwsem);
 	pidlist_free(l->list);
 	l->list = array;
 	l->length = length;
 	l->use_count++;
-	up_write(&l->rwsem);
 
 	mutex_unlock(&cgrp->pidlist_mutex);
 
@@ -3762,7 +3755,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 	int index = 0, pid = *pos;
 	int *iter;
 
-	down_read(&l->rwsem);
+	mutex_lock(&of->cgrp->pidlist_mutex);
 	if (pid) {
 		int end = l->length;
 
@@ -3790,7 +3783,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
 	struct cgroup_pidlist_open_file *of = s->private;
 
-	up_read(&of->pidlist->rwsem);
+	mutex_unlock(&of->cgrp->pidlist_mutex);
 }
 
 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3830,13 +3823,13 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
 
 static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 {
-	down_write(&l->rwsem);
+	mutex_lock(&l->owner->pidlist_mutex);
 	BUG_ON(!l->use_count);
 	/* if the last user, arm the destroy work */
 	if (!--l->use_count)
 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
 				 CGROUP_PIDLIST_DESTROY_DELAY);
-	up_write(&l->rwsem);
+	mutex_unlock(&l->owner->pidlist_mutex);
 }
 
 static int cgroup_pidlist_release(struct inode *inode, struct file *file)
-- 
cgit v1.2.3


From 4bac00d16a8760eae7205e41d2c246477d42a210 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:59 -0500
Subject: cgroup: load and release pidlists from seq_file start and stop
 respectively

Currently, pidlists are reference counted from file open and release
methods.  This means that holding onto an open file may waste memory
and reads may return data which is very stale.  Both aren't critical
because pidlists are keyed and shared per namespace and, well, the
user isn't supposed to have large delay between open and reads.

cgroup is planned to be converted to use kernfs and it'd be best if we
can stick to just the seq_file operations - start, next, stop and
show.  This can be achieved by loading pidlist on demand from start
and release with time delay from stop, so that consecutive reads don't
end up reloading the pidlist on each iteration.  This would remove the
need for hooking into open and release while also avoiding issues with
holding onto pidlist for too long.

The previous patches implemented delayed release and restructured
pidlist handling so that pidlists can be loaded and released from
seq_file start / stop.  This patch actually moves pidlist load to
start and release to stop.

This means that pidlist is pinned only between start and stop and may
go away between two consecutive read calls if the two calls are apart
by more than CGROUP_PIDLIST_DESTROY_DELAY.  cgroup_pidlist_start()
thus can't re-use the stored cgroup_pid_list_open_file->pidlist
directly.  During start, it's only used as a hint indicating whether
this is the first start after open or not and pidlist is always looked
up or created.

pidlist_mutex locking and reference counting are moved out of
pidlist_array_load() so that pidlist_array_load() can perform lookup
and creation atomically.  While this enlarges the area covered by
pidlist_mutex, given how the lock is used, it's highly unlikely to be
noticeable.

v2: Refreshed on top of the updated "cgroup: introduce struct
    cgroup_pidlist_open_file".

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 63 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dc39e1774542..671cbde883e9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3473,6 +3473,8 @@ struct cgroup_pidlist_open_file {
 	struct cgroup_pidlist		*pidlist;
 };
 
+static void cgroup_release_pid_array(struct cgroup_pidlist *l);
+
 /*
  * The following two functions "fix" the issue where there are more pids
  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3630,6 +3632,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	struct task_struct *tsk;
 	struct cgroup_pidlist *l;
 
+	lockdep_assert_held(&cgrp->pidlist_mutex);
+
 	/*
 	 * If cgroup gets more users after we read count, we won't have
 	 * enough space - tough.  This race is indistinguishable to the
@@ -3660,8 +3664,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(array, length);
 
-	mutex_lock(&cgrp->pidlist_mutex);
-
 	l = cgroup_pidlist_find_create(cgrp, type);
 	if (!l) {
 		mutex_unlock(&cgrp->pidlist_mutex);
@@ -3673,10 +3675,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	pidlist_free(l->list);
 	l->list = array;
 	l->length = length;
-	l->use_count++;
-
-	mutex_unlock(&cgrp->pidlist_mutex);
-
 	*lp = l;
 	return 0;
 }
@@ -3751,11 +3749,34 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 	 * next pid to display, if any
 	 */
 	struct cgroup_pidlist_open_file *of = s->private;
-	struct cgroup_pidlist *l = of->pidlist;
+	struct cgroup *cgrp = of->cgrp;
+	struct cgroup_pidlist *l;
 	int index = 0, pid = *pos;
-	int *iter;
+	int *iter, ret;
+
+	mutex_lock(&cgrp->pidlist_mutex);
+
+	/*
+	 * !NULL @of->pidlist indicates that this isn't the first start()
+	 * after open.  If the matching pidlist is around, we can use that.
+	 * Look for it.  Note that @of->pidlist can't be used directly.  It
+	 * could already have been destroyed.
+	 */
+	if (of->pidlist)
+		of->pidlist = cgroup_pidlist_find(cgrp, of->type);
+
+	/*
+	 * Either this is the first start() after open or the matching
+	 * pidlist has been destroyed inbetween.  Create a new one.
+	 */
+	if (!of->pidlist) {
+		ret = pidlist_array_load(of->cgrp, of->type, &of->pidlist);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+	l = of->pidlist;
+	l->use_count++;
 
-	mutex_lock(&of->cgrp->pidlist_mutex);
 	if (pid) {
 		int end = l->length;
 
@@ -3784,6 +3805,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 	struct cgroup_pidlist_open_file *of = s->private;
 
 	mutex_unlock(&of->cgrp->pidlist_mutex);
+	if (of->pidlist)
+		cgroup_release_pid_array(of->pidlist);
 }
 
 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3832,20 +3855,11 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
 	mutex_unlock(&l->owner->pidlist_mutex);
 }
 
-static int cgroup_pidlist_release(struct inode *inode, struct file *file)
-{
-	struct cgroup_pidlist_open_file *of;
-
-	of = ((struct seq_file *)file->private_data)->private;
-	cgroup_release_pid_array(of->pidlist);
-	return seq_release_private(inode, file);
-}
-
 static const struct file_operations cgroup_pidlist_operations = {
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.write = cgroup_file_write,
-	.release = cgroup_pidlist_release,
+	.release = seq_release_private,
 };
 
 /*
@@ -3858,26 +3872,17 @@ static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 	struct cgroup_pidlist_open_file *of;
-	struct cgroup_pidlist *l;
-	int retval;
 
-	/* have the array populated */
-	retval = pidlist_array_load(cgrp, type, &l);
-	if (retval)
-		return retval;
 	/* configure file information */
 	file->f_op = &cgroup_pidlist_operations;
 
 	of = __seq_open_private(file, &cgroup_pidlist_seq_operations,
 				sizeof(*of));
-	if (!of) {
-		cgroup_release_pid_array(l);
+	if (!of)
 		return -ENOMEM;
-	}
 
 	of->type = type;
 	of->cgrp = cgrp;
-	of->pidlist = l;
 	return 0;
 }
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
-- 
cgit v1.2.3


From 045023658ca1e30dc0bb1f148b42c95b740d3e02 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:59 -0500
Subject: cgroup: remove cgroup_pidlist->use_count

After the recent changes, pidlist ref is held only between
cgroup_pidlist_start() and cgroup_pidlist_stop() during which
cgroup->pidlist_mutex is also held.  IOW, the reference count is
redundant now.  While in use, it's always one and pidlist_mutex is
held - holding the mutex has exactly the same protection.

This patch collapses destroy_dwork queueing into cgroup_pidlist_stop()
so that pidlist_mutex is not released inbetween and drops
pidlist->use_count.

This patch shouldn't introduce any behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 671cbde883e9..a2458031d851 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3456,8 +3456,6 @@ struct cgroup_pidlist {
 	pid_t *list;
 	/* how many elements the above list has */
 	int length;
-	/* how many files are using the current array */
-	int use_count;
 	/* each of these stored in a list by its cgroup */
 	struct list_head links;
 	/* pointer to the cgroup we belong to, for list removal purposes */
@@ -3473,8 +3471,6 @@ struct cgroup_pidlist_open_file {
 	struct cgroup_pidlist		*pidlist;
 };
 
-static void cgroup_release_pid_array(struct cgroup_pidlist *l);
-
 /*
  * The following two functions "fix" the issue where there are more pids
  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3524,10 +3520,10 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
 	mutex_lock(&l->owner->pidlist_mutex);
 
 	/*
-	 * Destroy iff we didn't race with a new user or get queued again.
-	 * Queued state won't change as it can only be queued while locked.
+	 * Destroy iff we didn't get queued again.  The state won't change
+	 * as destroy_dwork can only be queued while locked.
 	 */
-	if (!l->use_count && !delayed_work_pending(dwork)) {
+	if (!delayed_work_pending(dwork)) {
 		list_del(&l->links);
 		pidlist_free(l->list);
 		put_pid_ns(l->key.ns);
@@ -3775,7 +3771,6 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 			return ERR_PTR(ret);
 	}
 	l = of->pidlist;
-	l->use_count++;
 
 	if (pid) {
 		int end = l->length;
@@ -3804,9 +3799,11 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
 	struct cgroup_pidlist_open_file *of = s->private;
 
-	mutex_unlock(&of->cgrp->pidlist_mutex);
 	if (of->pidlist)
-		cgroup_release_pid_array(of->pidlist);
+		mod_delayed_work(cgroup_pidlist_destroy_wq,
+				 &of->pidlist->destroy_dwork,
+				 CGROUP_PIDLIST_DESTROY_DELAY);
+	mutex_unlock(&of->cgrp->pidlist_mutex);
 }
 
 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3844,17 +3841,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = {
 	.show = cgroup_pidlist_show,
 };
 
-static void cgroup_release_pid_array(struct cgroup_pidlist *l)
-{
-	mutex_lock(&l->owner->pidlist_mutex);
-	BUG_ON(!l->use_count);
-	/* if the last user, arm the destroy work */
-	if (!--l->use_count)
-		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
-				 CGROUP_PIDLIST_DESTROY_DELAY);
-	mutex_unlock(&l->owner->pidlist_mutex);
-}
-
 static const struct file_operations cgroup_pidlist_operations = {
 	.read = seq_read,
 	.llseek = seq_lseek,
-- 
cgit v1.2.3


From afb2bc14e1c989cf0635bd04edb5ff55b8c1c7bd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 29 Nov 2013 10:42:59 -0500
Subject: cgroup: don't guarantee cgroup.procs is sorted if sane_behavior

For some reason, tasks and cgroup.procs guarantee that the result is
sorted.  This is the only reason this whole pidlist logic is necessary
instead of just iterating through sorted member tasks.  We can't do
anything about the existing interface but at least ensure that such
expectation doesn't exist for the new interface so that pidlist logic
may be removed in the distant future.

This patch scrambles the sort order if sane_behavior so that the
output is usually not sorted in the new interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 kernel/cgroup.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a2458031d851..f9f5fe3526ac 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3564,11 +3564,49 @@ after:
 	return dest;
 }
 
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco.  As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer.  As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property.  In the long term, we
+ * want to do away with it.  Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is
+ * non-identity one-to-one mapping which disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+	unsigned a = pid & 0x55555555;
+	unsigned b = pid & 0xAAAAAAAA;
+
+	return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+	if (cgroup_sane_behavior(cgrp))
+		return pid_fry(pid);
+	else
+		return pid;
+}
+
 static int cmppid(const void *a, const void *b)
 {
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+static int fried_cmppid(const void *a, const void *b)
+{
+	return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 						  enum cgroup_filetype type)
 {
@@ -3656,7 +3694,10 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 	css_task_iter_end(&it);
 	length = n;
 	/* now sort & (if procs) strip out duplicates */
-	sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (cgroup_sane_behavior(cgrp))
+		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+	else
+		sort(array, length, sizeof(pid_t), cmppid, NULL);
 	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(array, length);
 
@@ -3777,10 +3818,10 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 
 		while (index < end) {
 			int mid = (index + end) / 2;
-			if (l->list[mid] == pid) {
+			if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
 				index = mid;
 				break;
-			} else if (l->list[mid] <= pid)
+			} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
 				index = mid + 1;
 			else
 				end = mid;
@@ -3791,7 +3832,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 		return NULL;
 	/* Update the abstract position to be the actual pid that we found */
 	iter = l->list + index;
-	*pos = *iter;
+	*pos = cgroup_pid_fry(cgrp, *iter);
 	return iter;
 }
 
@@ -3820,7 +3861,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 	if (p >= end) {
 		return NULL;
 	} else {
-		*pos = *p;
+		*pos = cgroup_pid_fry(of->cgrp, *p);
 		return p;
 	}
 }
-- 
cgit v1.2.3


From e0c7855e364dda1ddd65b0b092cfc07ce9d66373 Mon Sep 17 00:00:00 2001
From: Leonardo Potenza <leonardo.potenza@intel.com>
Date: Tue, 19 Nov 2013 12:27:41 +0000
Subject: PM / hibernate: export hibernation_set_ops

To support the ability to implement PM hibernation code as modules
the hibernation_set_ops function requires to be exported.

Similar solution already available for suspend_set_ops
(please refer to commit a5e4fd8783a2bec861ecf1138cdc042269ff59aa).

Signed-off-by: Leonardo Potenza <leonardo.potenza@intel.com>
Signed-off-by: Edwin Verplanke <edwin.verplanke@intel.com>
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0121dab83f43..bc13d087ea14 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -82,6 +82,7 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
 
 	unlock_system_sleep();
 }
+EXPORT_SYMBOL_GPL(hibernation_set_ops);
 
 static bool entering_platform_hibernation;
 
-- 
cgit v1.2.3


From 88a88b320a9068294aaa2841464e4809af2ff454 Mon Sep 17 00:00:00 2001
From: Felipe Contreras <felipe.contreras@gmail.com>
Date: Wed, 4 Dec 2013 14:09:38 +1030
Subject: params: improve standard definitions

We are repeating the functionality of kstrtol in param_set_long, and the
same for kstrtoint. We can get rid of the extra code by using the right
functions.

Signed-off-by: Felipe Contreras <felipe.contreras@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/params.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index c00d5b502aa4..b00142e7f3ba 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -227,17 +227,10 @@ int parse_args(const char *doing,
 }
 
 /* Lazy bastard, eh? */
-#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn)      	\
+#define STANDARD_PARAM_DEF(name, type, format, strtolfn)      		\
 	int param_set_##name(const char *val, const struct kernel_param *kp) \
 	{								\
-		tmptype l;						\
-		int ret;						\
-									\
-		ret = strtolfn(val, 0, &l);				\
-		if (ret < 0 || ((type)l != l))				\
-			return ret < 0 ? ret : -EINVAL;			\
-		*((type *)kp->arg) = l;					\
-		return 0;						\
+		return strtolfn(val, 0, (type *)kp->arg);		\
 	}								\
 	int param_get_##name(char *buffer, const struct kernel_param *kp) \
 	{								\
@@ -253,13 +246,13 @@ int parse_args(const char *doing,
 	EXPORT_SYMBOL(param_ops_##name)
 
 
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul);
-STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul);
-STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul);
-STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
 
 int param_set_charp(const char *val, const struct kernel_param *kp)
 {
-- 
cgit v1.2.3


From c0e656b7a6e1ac03b2921d49211a735893efd544 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Thu, 28 Nov 2013 19:20:05 +0100
Subject: padata: Fix wrong usage of rcu_dereference()

A kernel with enabled lockdep complains about the wrong usage of
rcu_dereference() under a rcu_read_lock_bh() protected region.

  ===============================
  [ INFO: suspicious RCU usage. ]
  3.13.0-rc1+ #126 Not tainted
  -------------------------------
  linux/kernel/padata.c:115 suspicious rcu_dereference_check() usage!

  other info that might help us debug this:

  rcu_scheduler_active = 1, debug_locks = 1
  1 lock held by cryptomgr_test/153:
   #0:  (rcu_read_lock_bh){.+....}, at: [<ffffffff8115c235>] padata_do_parallel+0x5/0x270

Fix that by using rcu_dereference_bh() instead.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 2abd25d79cc8..161402f0b517 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -112,7 +112,7 @@ int padata_do_parallel(struct padata_instance *pinst,
 
 	rcu_read_lock_bh();
 
-	pd = rcu_dereference(pinst->pd);
+	pd = rcu_dereference_bh(pinst->pd);
 
 	err = -EINVAL;
 	if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
-- 
cgit v1.2.3


From 44ffc75ba9a63f972dbebd4fab6888db5fcd3b0e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 5 Dec 2013 12:28:01 -0500
Subject: cgroup, sched: convert away from cftype->read_map()

In preparation of conversion to kernfs, cgroup file handling is being
consolidated so that it can be easily mapped to the seq_file based
interface of kernfs.

cftype->read_map() doesn't add any value and being replaced with
->read_seq_string().  Update cpu_stats_show() and cpuacct_stats_show()
accordingly.

This patch doesn't make any visible behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/core.c    | 10 +++++-----
 kernel/sched/cpuacct.c |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1808606ee5f..f28ec6722f0b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7257,14 +7257,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 }
 
 static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
-		struct cgroup_map_cb *cb)
+			  struct seq_file *sf)
 {
 	struct task_group *tg = css_tg(css);
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 
-	cb->fill(cb, "nr_periods", cfs_b->nr_periods);
-	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
-	cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
 
 	return 0;
 }
@@ -7318,7 +7318,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.read_map = cpu_stats_show,
+		.read_seq_string = cpu_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f64722ff0299..dd88738cd4a9 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -184,7 +184,7 @@ static const char * const cpuacct_stat_desc[] = {
 };
 
 static int cpuacct_stats_show(struct cgroup_subsys_state *css,
-			      struct cftype *cft, struct cgroup_map_cb *cb)
+			      struct cftype *cft, struct seq_file *sf)
 {
 	struct cpuacct *ca = css_ca(css);
 	int cpu;
@@ -196,7 +196,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
 		val += kcpustat->cpustat[CPUTIME_NICE];
 	}
 	val = cput