aboutsummaryrefslogtreecommitdiff
path: root/fs/eventpoll.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--fs/eventpoll.c173
1 file changed, 88 insertions(+), 85 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 6c36d9dc6926..a3090b446af1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -148,13 +148,6 @@ struct epitem {
/* The file descriptor information this item refers to */
struct epoll_filefd ffd;
- /*
- * Protected by file->f_lock, true for to-be-released epitem already
- * removed from the "struct file" items list; together with
- * eventpoll->refcount orchestrates "struct eventpoll" disposal
- */
- bool dying;
-
/* List containing poll wait queues */
struct eppoll_entry *pwqlist;
@@ -220,12 +213,12 @@ struct eventpoll {
struct hlist_head refs;
u8 loop_check_depth;
- /*
- * usage count, used together with epitem->dying to
- * orchestrate the disposal of this struct
- */
+ /* usage count, orchestrates "struct eventpoll" disposal */
refcount_t refcount;
+ /* used to defer freeing past ep_get_upwards_depth_proc() RCU walk */
+ struct rcu_head rcu;
+
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
@@ -819,40 +812,52 @@ static void ep_free(struct eventpoll *ep)
mutex_destroy(&ep->mtx);
free_uid(ep->user);
wakeup_source_unregister(ep->ws);
- kfree(ep);
+ /* ep_get_upwards_depth_proc() may still hold epi->ep under RCU */
+ kfree_rcu(ep, rcu);
}
/*
- * Removes a "struct epitem" from the eventpoll RB tree and deallocates
- * all the associated resources. Must be called with "mtx" held.
- * If the dying flag is set, do the removal only if force is true.
- * This prevents ep_clear_and_put() from dropping all the ep references
- * while running concurrently with eventpoll_release_file().
- * Returns true if the eventpoll can be disposed.
+ * The ffd.file pointer may be in the process of being torn down due to
+ * being closed, but we may not have finished eventpoll_release() yet.
+ *
+ * Normally, even with the file_ref_get() refcount bump, the file may have
+ * been free'd and then gotten re-allocated to something else (since
+ * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
+ *
+ * But for epoll, users hold the ep->mtx mutex, and as such any file in
+ * the process of being free'd will block in eventpoll_release_file()
+ * and thus the underlying file allocation will not be free'd, and the
+ * file re-use cannot happen.
+ *
+ * For the same reason we can avoid a rcu_read_lock() around the
+ * operation - 'ffd.file' cannot go away even if the refcount has
+ * reached zero (but we must still not call out to ->poll() functions
+ * etc).
*/
-static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
+static struct file *epi_fget(const struct epitem *epi)
{
- struct file *file = epi->ffd.file;
- struct epitems_head *to_free;
- struct hlist_head *head;
+ struct file *file;
- lockdep_assert_irqs_enabled();
+ file = epi->ffd.file;
+ if (!file_ref_get(&file->f_ref))
+ file = NULL;
+ return file;
+}
- /*
- * Removes poll wait queue hooks.
- */
- ep_unregister_pollwait(ep, epi);
+/*
+ * Takes &file->f_lock; returns with it released.
+ */
+static void ep_remove_file(struct eventpoll *ep, struct epitem *epi,
+ struct file *file)
+{
+ struct epitems_head *to_free = NULL;
+ struct hlist_head *head;
- /* Remove the current item from the list of epoll hooks */
- spin_lock(&file->f_lock);
- if (epi->dying && !force) {
- spin_unlock(&file->f_lock);
- return false;
- }
+ lockdep_assert_held(&ep->mtx);
- to_free = NULL;
+ spin_lock(&file->f_lock);
head = file->f_ep;
- if (head->first == &epi->fllink && !epi->fllink.next) {
+ if (hlist_is_singular_node(&epi->fllink, head)) {
/* See eventpoll_release() for details. */
WRITE_ONCE(file->f_ep, NULL);
if (!is_file_epoll(file)) {
@@ -865,6 +870,11 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
hlist_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
free_ephead(to_free);
+}
+
+static void ep_remove_epi(struct eventpoll *ep, struct epitem *epi)
+{
+ lockdep_assert_held(&ep->mtx);
rb_erase_cached(&epi->rbn, &ep->rbr);
@@ -884,16 +894,32 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
- return true;
}
/*
* ep_remove variant for callers owning an additional reference to the ep
*/
-static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
+static void ep_remove(struct eventpoll *ep, struct epitem *epi)
{
- if (__ep_remove(ep, epi, false))
- WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
+ struct file *file __free(fput) = NULL;
+
+ lockdep_assert_irqs_enabled();
+ lockdep_assert_held(&ep->mtx);
+
+ ep_unregister_pollwait(ep, epi);
+
+ /*
+ * If we manage to grab a reference it means we're not in
+ * eventpoll_release_file() and aren't going to be: once @file's
+ * refcount has reached zero, file_ref_get() cannot bring it back.
+ */
+ file = epi_fget(epi);
+ if (!file)
+ return;
+
+ ep_remove_file(ep, epi, file);
+ ep_remove_epi(ep, epi);
+ WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
}
static void ep_clear_and_put(struct eventpoll *ep)
@@ -919,7 +945,7 @@ static void ep_clear_and_put(struct eventpoll *ep)
/*
* Walks through the whole tree and try to free each "struct epitem".
- * Note that ep_remove_safe() will not remove the epitem in case of a
+ * Note that ep_remove() will not remove the epitem in case of a
* racing eventpoll_release_file(); the latter will do the removal.
* At this point we are sure no poll callbacks will be lingering around.
* Since we still own a reference to the eventpoll struct, the loop can't
@@ -928,7 +954,7 @@ static void ep_clear_and_put(struct eventpoll *ep)
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
next = rb_next(rbp);
epi = rb_entry(rbp, struct epitem, rbn);
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
cond_resched();
}
@@ -1009,34 +1035,6 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
}
/*
- * The ffd.file pointer may be in the process of being torn down due to
- * being closed, but we may not have finished eventpoll_release() yet.
- *
- * Normally, even with the atomic_long_inc_not_zero, the file may have
- * been free'd and then gotten re-allocated to something else (since
- * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
- *
- * But for epoll, users hold the ep->mtx mutex, and as such any file in
- * the process of being free'd will block in eventpoll_release_file()
- * and thus the underlying file allocation will not be free'd, and the
- * file re-use cannot happen.
- *
- * For the same reason we can avoid a rcu_read_lock() around the
- * operation - 'ffd.file' cannot go away even if the refcount has
- * reached zero (but we must still not call out to ->poll() functions
- * etc).
- */
-static struct file *epi_fget(const struct epitem *epi)
-{
- struct file *file;
-
- file = epi->ffd.file;
- if (!file_ref_get(&file->f_ref))
- file = NULL;
- return file;
-}
-
-/*
* Differs from ep_eventpoll_poll() in that internal callers already have
* the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
* is correctly annotated.
@@ -1080,7 +1078,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f)
struct inode *inode = file_inode(epi->ffd.file);
seq_printf(m, "tfd: %8d events: %8x data: %16llx "
- " pos:%lli ino:%lx sdev:%x\n",
+ " pos:%lli ino:%llx sdev:%x\n",
epi->ffd.fd, epi->event.events,
(long long)epi->event.data,
(long long)epi->ffd.file->f_pos,
@@ -1113,18 +1111,17 @@ void eventpoll_release_file(struct file *file)
{
struct eventpoll *ep;
struct epitem *epi;
- bool dispose;
/*
- * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
- * touching the epitems list before eventpoll_release_file() can access
- * the ep->mtx.
+ * A concurrent ep_remove() cannot outrace us: it pins @file via
+ * epi_fget(), which fails once __fput() has dropped the refcount
+ * to zero -- the path we're on. So any racing ep_remove() bails
+ * and leaves the epi for us to clean up here.
*/
again:
spin_lock(&file->f_lock);
if (file->f_ep && file->f_ep->first) {
epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
- epi->dying = true;
spin_unlock(&file->f_lock);
/*
@@ -1133,10 +1130,15 @@ again:
*/
ep = epi->ep;
mutex_lock(&ep->mtx);
- dispose = __ep_remove(ep, epi, true);
+
+ ep_unregister_pollwait(ep, epi);
+
+ ep_remove_file(ep, epi, file);
+ ep_remove_epi(ep, epi);
+
mutex_unlock(&ep->mtx);
- if (dispose && ep_refcount_dec_and_test(ep))
+ if (ep_refcount_dec_and_test(ep))
ep_free(ep);
goto again;
}
@@ -1147,7 +1149,7 @@ static int ep_alloc(struct eventpoll **pep)
{
struct eventpoll *ep;
- ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+ ep = kzalloc_obj(*ep);
if (unlikely(!ep))
return -ENOMEM;
@@ -1615,21 +1617,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
mutex_unlock(&tep->mtx);
/*
- * ep_remove_safe() calls in the later error paths can't lead to
+ * ep_remove() calls in the later error paths can't lead to
* ep_free() as the ep file itself still holds an ep reference.
*/
ep_get(ep);
/* now check if we've created too many backpaths */
if (unlikely(full_check && reverse_path_check())) {
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
return -EINVAL;
}
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error) {
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
return error;
}
}
@@ -1653,7 +1655,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
* high memory pressure.
*/
if (unlikely(!epq.epi)) {
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
return -ENOMEM;
}
@@ -2061,7 +2063,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
* @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
- * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
+ * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found
+ * a loop or went too deep.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
@@ -2080,7 +2083,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
- result = INT_MAX;
+ result = EP_MAX_NESTS+1;
else
result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
if (result > EP_MAX_NESTS)
@@ -2347,7 +2350,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
* The eventpoll itself is still alive: the refcount
* can't go to zero here.
*/
- ep_remove_safe(ep, epi);
+ ep_remove(ep, epi);
error = 0;
} else {
error = -ENOENT;