diff options
Diffstat (limited to 'fs/eventpoll.c')
| -rw-r--r-- | fs/eventpoll.c | 173 |
1 files changed, 88 insertions, 85 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 6c36d9dc6926..a3090b446af1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -148,13 +148,6 @@ struct epitem { /* The file descriptor information this item refers to */ struct epoll_filefd ffd; - /* - * Protected by file->f_lock, true for to-be-released epitem already - * removed from the "struct file" items list; together with - * eventpoll->refcount orchestrates "struct eventpoll" disposal - */ - bool dying; - /* List containing poll wait queues */ struct eppoll_entry *pwqlist; @@ -220,12 +213,12 @@ struct eventpoll { struct hlist_head refs; u8 loop_check_depth; - /* - * usage count, used together with epitem->dying to - * orchestrate the disposal of this struct - */ + /* usage count, orchestrates "struct eventpoll" disposal */ refcount_t refcount; + /* used to defer freeing past ep_get_upwards_depth_proc() RCU walk */ + struct rcu_head rcu; + #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; @@ -819,40 +812,52 @@ static void ep_free(struct eventpoll *ep) mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); - kfree(ep); + /* ep_get_upwards_depth_proc() may still hold epi->ep under RCU */ + kfree_rcu(ep, rcu); } /* - * Removes a "struct epitem" from the eventpoll RB tree and deallocates - * all the associated resources. Must be called with "mtx" held. - * If the dying flag is set, do the removal only if force is true. - * This prevents ep_clear_and_put() from dropping all the ep references - * while running concurrently with eventpoll_release_file(). - * Returns true if the eventpoll can be disposed. + * The ffd.file pointer may be in the process of being torn down due to + * being closed, but we may not have finished eventpoll_release() yet. + * + * Normally, even with the atomic_long_inc_not_zero, the file may have + * been free'd and then gotten re-allocated to something else (since + * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). + * + * But for epoll, users hold the ep->mtx mutex, and as such any file in + * the process of being free'd will block in eventpoll_release_file() + * and thus the underlying file allocation will not be free'd, and the + * file re-use cannot happen. + * + * For the same reason we can avoid a rcu_read_lock() around the + * operation - 'ffd.file' cannot go away even if the refcount has + * reached zero (but we must still not call out to ->poll() functions + * etc). */ -static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) +static struct file *epi_fget(const struct epitem *epi) { - struct file *file = epi->ffd.file; - struct epitems_head *to_free; - struct hlist_head *head; + struct file *file; - lockdep_assert_irqs_enabled(); + file = epi->ffd.file; + if (!file_ref_get(&file->f_ref)) + file = NULL; + return file; +} - /* - * Removes poll wait queue hooks. - */ - ep_unregister_pollwait(ep, epi); +/* + * Takes &file->f_lock; returns with it released. + */ +static void ep_remove_file(struct eventpoll *ep, struct epitem *epi, + struct file *file) +{ + struct epitems_head *to_free = NULL; + struct hlist_head *head; - /* Remove the current item from the list of epoll hooks */ - spin_lock(&file->f_lock); - if (epi->dying && !force) { - spin_unlock(&file->f_lock); - return false; - } + lockdep_assert_held(&ep->mtx); - to_free = NULL; + spin_lock(&file->f_lock); head = file->f_ep; - if (head->first == &epi->fllink && !epi->fllink.next) { + if (hlist_is_singular_node(&epi->fllink, head)) { /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { @@ -865,6 +870,11 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); +} + +static void ep_remove_epi(struct eventpoll *ep, struct epitem *epi) +{ + lockdep_assert_held(&ep->mtx); rb_erase_cached(&epi->rbn, &ep->rbr); @@ -884,16 +894,32 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); - return true; } /* * ep_remove variant for callers owing an additional reference to the ep */ -static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) +static void ep_remove(struct eventpoll *ep, struct epitem *epi) { - if (__ep_remove(ep, epi, false)) - WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); + struct file *file __free(fput) = NULL; + + lockdep_assert_irqs_enabled(); + lockdep_assert_held(&ep->mtx); + + ep_unregister_pollwait(ep, epi); + + /* + * If we manage to grab a reference it means we're not in + * eventpoll_release_file() and aren't going to be: once @file's + * refcount has reached zero, file_ref_get() cannot bring it back. + */ + file = epi_fget(epi); + if (!file) + return; + + ep_remove_file(ep, epi, file); + ep_remove_epi(ep, epi); + WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } static void ep_clear_and_put(struct eventpoll *ep) @@ -919,7 +945,7 @@ static void ep_clear_and_put(struct eventpoll *ep) /* * Walks through the whole tree and try to free each "struct epitem". - * Note that ep_remove_safe() will not remove the epitem in case of a + * Note that ep_remove() will not remove the epitem in case of a * racing eventpoll_release_file(); the latter will do the removal. * At this point we are sure no poll callbacks will be lingering around. * Since we still own a reference to the eventpoll struct, the loop can't @@ -928,7 +954,7 @@ static void ep_clear_and_put(struct eventpoll *ep) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); - ep_remove_safe(ep, epi); + ep_remove(ep, epi); cond_resched(); } @@ -1009,34 +1035,6 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep } /* - * The ffd.file pointer may be in the process of being torn down due to - * being closed, but we may not have finished eventpoll_release() yet. - * - * Normally, even with the atomic_long_inc_not_zero, the file may have - * been free'd and then gotten re-allocated to something else (since - * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). - * - * But for epoll, users hold the ep->mtx mutex, and as such any file in - * the process of being free'd will block in eventpoll_release_file() - * and thus the underlying file allocation will not be free'd, and the - * file re-use cannot happen. - * - * For the same reason we can avoid a rcu_read_lock() around the - * operation - 'ffd.file' cannot go away even if the refcount has - * reached zero (but we must still not call out to ->poll() functions - * etc). - */ -static struct file *epi_fget(const struct epitem *epi) -{ - struct file *file; - - file = epi->ffd.file; - if (!file_ref_get(&file->f_ref)) - file = NULL; - return file; -} - -/* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. @@ -1080,7 +1078,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f) struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " - " pos:%lli ino:%lx sdev:%x\n", + " pos:%lli ino:%llx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, @@ -1113,18 +1111,17 @@ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; - bool dispose; /* - * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from - * touching the epitems list before eventpoll_release_file() can access - * the ep->mtx. + * A concurrent ep_remove() cannot outrace us: it pins @file via + * epi_fget(), which fails once __fput() has dropped the refcount + * to zero -- the path we're on. So any racing ep_remove() bails + * and leaves the epi for us to clean up here. */ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); - epi->dying = true; spin_unlock(&file->f_lock); /* @@ -1133,10 +1130,15 @@ again: */ ep = epi->ep; mutex_lock(&ep->mtx); - dispose = __ep_remove(ep, epi, true); + + ep_unregister_pollwait(ep, epi); + + ep_remove_file(ep, epi, file); + ep_remove_epi(ep, epi); + mutex_unlock(&ep->mtx); - if (dispose && ep_refcount_dec_and_test(ep)) + if (ep_refcount_dec_and_test(ep)) ep_free(ep); goto again; } @@ -1147,7 +1149,7 @@ static int ep_alloc(struct eventpoll **pep) { struct eventpoll *ep; - ep = kzalloc(sizeof(*ep), GFP_KERNEL); + ep = kzalloc_obj(*ep); if (unlikely(!ep)) return -ENOMEM; @@ -1615,21 +1617,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, mutex_unlock(&tep->mtx); /* - * ep_remove_safe() calls in the later error paths can't lead to + * ep_remove() calls in the later error paths can't lead to * ep_free() as the ep file itself still holds an ep reference. */ ep_get(ep); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { - ep_remove_safe(ep, epi); + ep_remove(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { - ep_remove_safe(ep, epi); + ep_remove(ep, epi); return error; } } @@ -1653,7 +1655,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * high memory pressure. */ if (unlikely(!epq.epi)) { - ep_remove_safe(ep, epi); + ep_remove(ep, epi); return -ENOMEM; } @@ -2061,7 +2063,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * - * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep. + * Return: depth of the subtree, or a value bigger than EP_MAX_NESTS if we found + * a loop or went too deep. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { @@ -2080,7 +2083,7 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth) struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) - result = INT_MAX; + result = EP_MAX_NESTS+1; else result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1); if (result > EP_MAX_NESTS) @@ -2347,7 +2350,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * The eventpoll itself is still alive: the refcount * can't go to zero here. */ - ep_remove_safe(ep, epi); + ep_remove(ep, epi); error = 0; } else { error = -ENOENT; |
