// SPDX-License-Identifier: GPL-2.0
/*
* Shared application/kernel submission and completion ring pairs, for
* supporting fast/efficient IO.
*
* A note on the read/write ordering memory barriers that are matched between
* the application and kernel side.
*
* After the application reads the CQ ring tail, it must use an
* appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
* before writing the tail (using smp_load_acquire to read the tail will
* do). It also needs a smp_mb() before updating CQ head (ordering the
* entry load(s) with the head store), pairing with an implicit barrier
* through a control-dependency in io_get_cqring (smp_store_release to
* store head will do). Failure to do so could lead to reading invalid
* CQ entries.
*
* Likewise, the application must use an appropriate smp_wmb() before
* writing the SQ tail (ordering SQ entry stores with the tail store),
* which pairs with smp_load_acquire in io_get_sqring (smp_store_release
* to store the tail will do). And it needs a barrier ordering the SQ
* head load before writing new SQ entries (smp_load_acquire to read
* head will do).
*
* When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
* needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
* updating the SQ tail; a full memory barrier smp_mb() is needed
* between.
*
* Also see the examples in the liburing library:
*
* git://git.kernel.dk/liburing
*
* io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
* from data shared between the kernel and application. This is done both
* for ordering purposes, but also to ensure that once a value is loaded from
* data that the application could potentially modify, it remains stable.
*
* Copyright (C) 2018-2019 Jens Axboe
* Copyright (c) 2018-2019 Christoph Hellwig
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/fs_struct.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/blk-cgroup.h>
#include <linux/audit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "internal.h"
#include "io-wq.h"
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
/*
* Shift of 9 is 512 entries, or exactly one page on 64-bit archs
*/
#define IORING_FILE_TABLE_SHIFT 9
#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT)
#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1)
#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
};
/*
* This data is shared with the application through the mmap at offsets
* IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
*
* The offsets to the member fields are published through struct
* io_sqring_offsets when calling io_uring_setup.
*/
struct io_rings {
/*
* Head and tail offsets into the ring; the offsets need to be
* masked to get valid indices.
*
* The kernel controls head of the sq ring and the tail of the cq ring,
* and the application controls tail of the sq ring and the head of the
* cq ring.
*/
struct io_uring sq, cq;
/*
* Bitmasks to apply to head and tail offsets (constant, equals
* ring_entries - 1)
*/
u32 sq_ring_mask, cq_ring_mask;
/* Ring sizes (constant, power of 2) */
u32 sq_ring_entries, cq_ring_entries;
/*
* Number of invalid entries dropped by the kernel due to
* invalid index stored in array
*
* Written by the kernel, shouldn't be modified by the
* application (i.e. get number of "new events" by comparing to
* cached value).
*
* After a new SQ head value was read by the application this
* counter includes all submissions that were dropped reaching
* the new SQ head (and possibly more).
*/
u32 sq_dropped;
/*
* Runtime SQ flags
*
* Written by the kernel, shouldn't be modified by the
* application.
*
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
u32 sq_flags;
/*
* Runtime CQ flags
*
* Written by the application, shouldn't be modified by the
* kernel.
*/
u32 cq_flags;
/*
* Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure
* there are not more requests pending than there is space in
* the completion queue.
*
* Written by the kernel, shouldn't be modified by the
* application (i.e. get number of "new events" by comparing to
* cached value).
*
* As completion events come in out of order this counter is not
* ordered with any other data.
*/
u32 cq_overflow;
/*
* Ring buffer of completion events.
*
* The kernel writes completion events fresh every time they are
* produced, so the application is allowed to modify pending
* entries.
*/
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
struct io_mapped_ubuf {
u64 ubuf;
size_t len;
struct bio_vec *bvec;
unsigned int nr_bvecs;
unsigned long acct_pages;
};
struct fixed_file_table {
struct file **files;
};
struct fixed_file_ref_node {
struct percpu_ref refs;
struct list_head node;
struct list_head file_list;
struct fixed_file_data *file_data;
struct llist_node llist;
};
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
struct fixed_file_ref_node *node;
struct percpu_ref refs;
struct completion done;
struct list_head ref_list;
spinlock_t lock;
};
struct io_buffer {
struct list_head list;
__u64 addr;
__s32 len;
__u16 bid;
};
struct io_restriction {
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
u8 sqe_flags_allowed;
u8 sqe_flags_required;
bool registered;
};
struct io_sq_data {
refcount_t refs;
struct mutex lock;
/* ctx's that are using this sqd */
struct list_head ctx_list;
struct list_head ctx_new_list;
struct mutex ctx_lock;
struct task_struct *thread;
struct wait_queue_head wait;
};
struct io_ring_ctx {
struct {
struct percpu_ref refs;
} ____cacheline_aligned_in_smp;
struct {
unsigned int flags;
unsigned int compat: 1;
unsigned int limit_mem: 1;
unsigned int cq_overflow_flushed: 1;
unsigned int drain_next: 1;
unsigned int eventfd_async: 1;
unsigned int restricted: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
* mmapped by the application using the IORING_OFF_SQES offset.
*
* This indirection could e.g. be used to assign fixed
* io_uring_sqe entries to operations and only submit them to
* the queue when needed.
*
* The kernel modifies neither the indices array nor the entries
* array.
*/
u32 *sq_array;
unsigned cached_sq_head;
unsigned sq_entries;
unsigned sq_mask;
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
unsigned cached_cq_overflow;
unsigned long sq_check_overflow;
struct list_head defer_list;
struct list_head timeout_list;
struct list_head cq_overflow_list;
wait_queue_head_t inflight_wait;
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
struct io_rings *rings;
/* IO offload */
struct io_wq *io_wq;
/*
* For SQPOLL usage - we hold a reference to the parent task, so we
* have access to the ->files
*/
struct task_struct *sqo_task;
/* Only used for accounting purposes */
struct mm_struct *mm_account;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *sqo_blkcg_css;
#endif
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct wait_queue_entry sqo_wait_entry;
struct list_head sqd_list;
/*
* If used, fixed file set. Writers must ensure that ->refs is dead,
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
struct fixed_file_data *file_data;
unsigned nr_user_files;
/* if used, fixed mapped user buffers */
unsigned nr_user_bufs;
struct io_mapped_ubuf *user_bufs;
struct user_struct *user;
const struct cred *creds;
#ifdef CONFIG_AUDIT
kuid_t loginuid;
unsigned int sessionid;
#endif
struct completion ref_comp;
struct completion sq_thread_comp;
/* if all else fails... */
struct io_kiocb *fallback_req;
#if defined(CONFIG_UNIX)
struct socket *ring_sock;
#endif
struct idr io_buffer_idr;
struct idr personality_idr;
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
unsigned cq_mask;
atomic_t cq_timeouts;
unsigned long cq_check_overflow;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
struct eventfd_ctx *cq_ev_fd;
} ____cacheline_aligned_in_smp;
struct {
struct mutex uring_lock;
wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;
struct {
spinlock_t completion_lock;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct list_head iopoll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_file;
spinlock_t inflight_lock;
struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
struct delayed_work file_put_work;
struct llist_head file_put_llist;
struct work_struct exit_work;
struct io_restriction restrictions;
};
/*
* First field must be the file pointer in all the
* iocb unions! See also 'struct kiocb' in <linux/fs.h>
*/
struct io_poll_iocb {
struct file *file;
union {
struct wait_queue_head *head;
u64 addr;
};
__poll_t events;
bool done;
bool canceled;
struct wait_queue_entry wait;
};
struct io_close {
struct file *file;
struct file *put_file;
int fd;
};
struct io_timeout_data {
struct io_kiocb *req;
struct hrtimer timer;
struct timespec64 ts;
enum hrtimer_mode mode;
};
struct io_accept {
struct file *file;
struct sockaddr __user *addr;
int __user *addr_len;
int flags;
unsigned long nofile;
};
struct io_sync {
struct file *file;
loff_t len;
loff_t off;
int flags;
int mode;
};
struct io_cancel {
struct file *file;
u64 addr;
};
struct io_timeout {
struct file *file;
u32 off;
u32 target_seq;
struct list_head list;
};
struct io_timeout_rem {
struct file *file;
u64 addr;
};
struct io_rw {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct kiocb kiocb;
u64 addr;
u64 len;
};
struct io_connect {
struct file *file;
struct sockaddr __user *addr;
int addr_len;
};
struct io_sr_msg {
struct file *file;
union {
struct user_msghdr __user *umsg;
void __user *buf;
};
int msg_flags;
int bgid;
size_t len;
struct io_buffer *kbuf;
};
struct io_open {
struct file *file;
int dfd;
struct filename *filename;
struct open_how how;
unsigned long nofile;
};
struct io_files_update {
struct file *file;
u64 arg;
u32 nr_args;
u32 offset;
};
struct io_fadvise {
struct file *file;
u64 offset;
u32 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
u32 len;
u32 advice;
};
struct io_epoll {
struct file *file;
int epfd;
int op;
int fd;
struct epoll_event event;
};
struct io_splice {
struct file *file_out;
struct file *file_in;
loff_t off_out;
loff_t off_in;
u64 len;
unsigned int flags;
};
struct io_provide_buf {
struct file *file;
__u64 addr;
__s32 len;
__u32 bgid;
__u16 nbufs;
__u16 bid;
};
struct io_statx {
struct file *file;
int dfd;
unsigned int mask;
unsigned int flags;
const char __user *filename;
struct statx __user *buffer;
};
struct io_completion {
struct file *file;
struct list_head list;
int cflags;
};
struct io_async_connect {
struct sockaddr_storage address;
};
struct io_async_msghdr {
struct iovec fast_iov[UIO_FASTIOV];
struct iovec *iov;
struct sockaddr __user *uaddr;
struct msghdr msg;
struct sockaddr_storage addr;
};
struct io_async_rw {
struct iovec fast_iov[UIO_FASTIOV];
const struct iovec *free_iovec;
struct iov_iter iter;
size_t bytes_done;
struct wait_page_queue wpq;
};
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
REQ_F_LINK_HEAD_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_NO_FILE_TABLE_BIT,
REQ_F_WORK_INITIALIZED_BIT,
REQ_F_LTIMEOUT_ACTIVE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
};
enum {
/* ctx owns file */
REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
/* drain existing IO first */
REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
/* linked sqes */
REQ_F_LINK = BIT(REQ_F_LINK_BIT),
/* doesn't sever on completion < 0 */
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
/* head of a link */
REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* has or had linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
/* doesn't need file table for this request */
REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT),
/* io_wq_work is initialized */
REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT),
/* linked timeout is active, i.e. prepared by link's head */
REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
};
struct async_poll {
struct io_poll_iocb poll;
struct io_poll_iocb *double_poll;
};
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
* or directly as just 'ki_filp' in this struct.
*/
struct io_kiocb {
union {
struct file *file;
struct io_rw rw;
struct io_poll_iocb poll;
struct io_accept accept;
struct io_sync sync;
struct io_cancel cancel;
struct io_timeout timeout;
struct io_timeout_rem timeout_rem;
struct io_connect connect;
struct io_sr_msg sr_msg;
struct io_open open;
struct io_close close;
struct io_files_update files_update;
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
struct io_splice splice;
struct io_provide_buf pbuf;
struct io_statx statx;
/* use only after cleaning per-op data, see io_clean_op() */
struct io_completion compl;
};
/* opcode allocated if it needs to store data for async defer */
void *async_data;
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
u16 buf_index;
u32 result;
struct io_ring_ctx *ctx;
unsigned int flags;
refcount_t refs;
struct task_struct *task;
u64 user_data;
struct list_head link_list;
/*
* 1. used with ctx->iopoll_list with reads/writes
* 2. to track reqs with ->files (see io_op_def::file_table)
*/
struct list_head inflight_entry;
struct percpu_ref *fixed_file_refs;
struct callback_head task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
struct async_poll *apoll;
struct io_wq_work work;
};
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
u32 seq;
};
#define IO_IOPOLL_BATCH 8
struct io_comp_state {
unsigned int nr;
struct list_head list;
struct io_ring_ctx *ctx;
};
struct io_submit_state {
struct blk_plug plug;
/*
* io_kiocb alloc cache
*/
void *reqs[IO_IOPOLL_BATCH];
unsigned int free_reqs;
/*
* Batch completion logic
*/
struct io_comp_state comp;
/*
* File reference cache
*/
struct file *file;
unsigned int fd;
unsigned int has_refs;
unsigned int ios_left;
};
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
/* don't fail if file grab fails */
unsigned needs_file_no_error : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
/* set if opcode supports polled "wait" */
unsigned pollin : 1;
unsigned pollout : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
/* must always have async data allocated */
unsigned needs_async_data : 1;
/* size of async data needed, if any */
unsigned short async_size;
unsigned work_flags;
};
static const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {},
[IORING_OP_READV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITEV] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
.work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
IO_WQ_WORK_MM,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
.work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FS,
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_msghdr),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FS,
},
[IORING_OP_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
.work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_TIMEOUT_REMOVE] = {},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
.needs_async_data = 1,
.async_size = sizeof(struct io_timeout_data),
.work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_CONNECT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_connect),
.work_flags = IO_WQ_WORK_MM,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
},
[IORING_OP_OPENAT] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FS,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
.needs_file_no_error = 1,
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
},
[IORING_OP_FILES_UPDATE] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
},
[IORING_OP_STATX] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
},
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_WRITE] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
IO_WQ_WORK_FSIZE,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
.work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_MADVISE] = {
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
},
[IORING_OP_OPENAT2] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
IO_WQ_WORK_BLKCG,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
.work_flags = IO_WQ_WORK_FILES,
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.work_flags = IO_WQ_WORK_BLKCG,
},
[IORING_OP_PROVIDE_BUFFERS] = {},
[IORING_OP_REMOVE_BUFFERS] = {},
[IORING_OP_TEE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
};
enum io_mem_account {
ACCT_LOCKED,
ACCT_PINNED,
};
static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
struct io_comp_state *cs);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req, int nr);
static void io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void __io_queue_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static void __io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_submit_state *state,
struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
static void io_file_put_work(struct work_struct *work);
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
struct iovec **iovec, struct iov_iter *iter,
bool needs_lock);
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
const struct iovec *fast_iov,
struct iov_iter *iter, bool force);
static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
if (file->f_op == &io_uring_fops) {
struct io_ring_ctx *ctx = file->private_data;
return ctx->ring_sock->sk;
}
#endif
return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);
static inline void io_clean_op(struct io_kiocb *req)
{
if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
REQ_F_INFLIGHT))
__io_clean_op(req);
}
static void io_sq_thread_drop_mm(void)
{
struct mm_struct *mm = current->mm;
if (mm) {
kthread_unuse_mm(mm);
mmput(mm);
}
}
static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
{
if (!current->mm) {
if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
!ctx->sqo_task->mm ||
!mmget_not_zero(ctx->sqo_task->mm)))
return -EFAULT;
kthread_use_mm(ctx->sqo_task->mm);
}
return 0;
}
static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
return 0;
return __io_sq_thread_acquire_mm(ctx);
}
static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
struct cgroup_subsys_state **cur_css)
{
#ifdef CONFIG_BLK_CGROUP
/* puts the old one when swapping */
if (*cur_css != ctx->sqo_blkcg_css) {
kthread_associate_blkcg(ctx->sqo_blkcg_css);
*cur_css = ctx->sqo_blkcg_css;
}
#endif
}
static void io_sq_thread_unassociate_blkcg(void)
{
#ifdef CONFIG_BLK_CGROUP
kthread_associate_blkcg(NULL);
#endif
}
static inline void req_set_fail_links(struct io_kiocb *req)
{
if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
req->flags |= REQ_F_FAIL_LINK;
}
/*
* None of these are dereferenced, they are simply used to check if any of
* them have changed. If we're under current and check they are still the
* same, we're fine to grab references to them for actual out-of-line use.
*/
static void io_init_identity(struct io_identity *id)
{
id->files = current->files;
id->mm = current->mm;
#ifdef CONFIG_BLK_CGROUP
rcu_read_lock();
id->blkcg_css = blkcg_css();
rcu_read_unlock();
#endif
id->creds = current_cred();
id->nsproxy = current->nsproxy;
id->fs = current->fs;
id->fsize = rlimit(RLIMIT_FSIZE);
#ifdef CONFIG_AUDIT
id->loginuid = current->loginuid;
id->sessionid = current->sessionid;
#endif
refcount_set(&id->count, 1);
}
static inline void __io_req_init_async(struct io_kiocb *req)
{
memset(&req->work, 0, sizeof(req->work));
req->flags |= REQ_F_WORK_INITIALIZED;
}
/*
* Note: must call io_req_init_async() for the first time you
* touch any members of io_wq_work.
*/
static inline void io_req_init_async(struct io_kiocb *req)
{
struct io_uring_task *tctx = current->io_uring;
if (req->flags & REQ_F_WORK_INITIALIZED)
return;
__io_req_init_async(req);
/* Grab a ref if this isn't our static identity */
req->work.identity = tctx->identity;
if (tctx->identity != &tctx->__identity)
refcount_inc(&req->work.identity->count);
}
static inline bool io_async_submit(struct io_ring_ctx *ctx)
{
return ctx->flags & IORING_SETUP_SQPOLL;
}
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
complete(&ctx->ref_comp);
}
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
return !req->timeout.off;
}
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
if (!ctx->fallback_req)
goto err;
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
*/
hash_bits = ilog2(p->cq_entries);
hash_bits -= 5;
if (hash_bits <= 0)
hash_bits = 1;
ctx->cancel_hash_bits = hash_bits;
ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
GFP_KERNEL);
if (!ctx->cancel_hash)
goto err;
__hash_init(ctx->cancel_hash, 1U << hash_bits);
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
init_waitqueue_head(&ctx->cq_wait);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->ref_comp);
init_completion(&ctx->sq_thread_comp);
idr_init(&ctx->io_buffer_idr);
idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
init_waitqueue_head(&ctx->inflight_wait);
spin_lock_init(&ctx->inflight_lock);
INIT_LIST_HEAD(&ctx->inflight_list);
INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
init_llist_head(&ctx->file_put_llist);
return ctx;
err:
if (ctx->fallback_req)
kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx->cancel_hash);
kfree(ctx);
return NULL;
}
static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
struct io_ring_ctx *ctx = req->ctx;
return seq != ctx->cached_cq_tail
+ READ_ONCE(ctx->cached_cq_overflow);
}
return false;
}
static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
/* order cqe stores with ring update */
smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
if (wq_has_sleeper(&ctx->cq_wait)) {
wake_up_interruptible(&ctx->cq_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
{
if (req->work.identity == &tctx->__identity)
return;
if (refcount_dec_and_test(&req->work.identity->count))
kfree(req->work.identity);
}
static void io_req_clean_work(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_WORK_INITIALIZED))
return;
req->flags &= ~REQ_F_WORK_INITIALIZED;
if (req->work.flags & IO_WQ_WORK_MM) {
mmdrop(req->work.identity->mm);
req->work.flags &= ~IO_WQ_WORK_MM;
}
#ifdef CONFIG_BLK_CGROUP
if (req->work.flags & IO_WQ_WORK_BLKCG) {
css_put(req->work.identity->blkcg_css);
req->work.flags &= ~IO_WQ_WORK_BLKCG;
}
#endif
if (req->work.flags & IO_WQ_WORK_CREDS) {
put_cred(req->work.identity->creds);
req->work.flags &= ~IO_WQ_WORK_CREDS;
}
if (req->work.flags & IO_WQ_WORK_FS) {
struct fs_struct *fs = req->work.identity->fs;
spin_lock(&req->work.identity->fs->lock);
if (--fs->users)
fs = NULL;
spin_unlock(&req->work.identity->fs->lock);
if (fs)
free_fs_struct(fs);
req->work.flags &= ~IO_WQ_WORK_FS;
}
io_put_identity(req->task->io_uring, req);
}
/*
* Create a private copy of io_identity, since some fields don't match
* the current context.
*/
static bool io_identity_cow(struct io_kiocb *req)
{
struct io_uring_task *tctx = current->io_uring;
const struct cred *creds = NULL;
struct io_identity *id;
if (req->work.flags & IO_WQ_WORK_CREDS)
creds = req->work.identity->creds;
id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
if (unlikely(!id)) {
req->work.flags |= IO_WQ_WORK_CANCEL;
return false;
}
/*
* We can safely just re-init the creds we copied Either the field
* matches the current one, or we haven't grabbed it yet. The only
* exception is ->creds, through registered personalities, so handle
* that one separately.
*/
io_init_identity(id);
if (creds)
req->work.identity->creds = creds;
/* add one for this request */
refcount_inc(&id->count);
/* drop old identity, assign new one. one ref for req, one for tctx */
if (req->work.identity != tctx->identity &&
refcount_sub_and_test(2, &req->work.identity->count))
kfree(req->work.identity);
req->work.identity = id;
tctx->identity = id;
return true;
}
static bool io_grab_identity(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_identity *id = req->work.identity;
struct io_ring_ctx *ctx = req->ctx;
if (def->work_flags & IO_WQ_WORK_FSIZE) {
if (id->fsize != rlimit(RLIMIT_FSIZE))
return false;
req->work.flags |= IO_WQ_WORK_FSIZE;
}
if (!(req->work.flags & IO_WQ_WORK_FILES) &&
(def->work_flags & IO_WQ_WORK_FILES) &&
!(req->flags & REQ_F_NO_FILE_TABLE)) {
if (id->files != current->files ||
id->nsproxy != current->nsproxy)
return false;
atomic_inc(&id->files->count);
get_nsproxy(id->nsproxy);
req->flags |= REQ_F_INFLIGHT;
spin_lock_irq(&ctx->inflight_lock);
list_add(&req->inflight_entry, &ctx->inflight_list);
spin_unlock_irq(&ctx->inflight_lock);
req->work.flags |= IO_WQ_WORK_FILES;
}
#ifdef CONFIG_BLK_CGROUP
if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
(def->work_flags & IO_WQ_WORK_BLKCG)) {
rcu_read_lock();
if (id->blkcg_css != blkcg_css()) {
rcu_read_unlock();
return false;
}
/*
* This should be rare, either the cgroup is dying or the task
* is moving cgroups. Just punt to root for the handful of ios.
*/
if (css_tryget_online(id->blkcg_css))
req->work.flags |= IO_WQ_WORK_BLKCG;
rcu_read_unlock();
}
#endif
if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
if (id->creds != current_cred())
return false;
get_cred(id->creds);
req->work.flags |= IO_WQ_WORK_CREDS;
}
#ifdef CONFIG_AUDIT
if (!uid_eq(current->loginuid, id->loginuid) ||
current->sessionid != id->sessionid)
return false;
#endif
if (!(req->work.flags & IO_WQ_WORK_FS) &&
(def->work_flags & IO_WQ_WORK_FS)) {
if (current->fs != id->fs)
return false;
spin_lock(&id->fs->lock);
if (!id->fs->in_exec) {
id->fs->users++;
req->work.flags |= IO_WQ_WORK_FS;
} else {
req->work.flags |= IO_WQ_WORK_CANCEL;
}
spin_unlock(¤t->fs->lock);
}
return true;
}
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
struct io_identity *id;
io_req_init_async(req);
id = req->work.identity;
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
} else {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
/* ->mm can never change on us */
if (!(req->work.flags & IO_WQ_WORK_MM) &&
(def->work_flags & IO_WQ_WORK_MM)) {
mmgrab(id->mm);
req->work.flags |= IO_WQ_WORK_MM;
}
/* if we fail grabbing identity, we must COW, regrab, and retry */
if (io_grab_identity(req))
return;
if (!io_identity_cow(req))
return;
/* can't fail at this point */
if (!io_grab_identity(req))
WARN_ON(1);
}
static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
io_prep_async_work(req);
if (req->flags & REQ_F_LINK_HEAD)
list_for_each_entry(cur, &req->link_list, link_list)
io_prep_async_work(cur);
}
static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link = io_prep_linked_timeout(req);
trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
&req->work, req->flags);
io_wq_enqueue(ctx->io_wq, &req->work);
return link;
}
static void io_queue_async_work(struct io_kiocb *req)
{
struct io_kiocb *link;
/* init ->work of the whole link before punting */
io_prep_async_link(req);
link = __io_queue_async_work(req);
if (link)
io_queue_linked_timeout(link);
}
static void io_kill_timeout(struct io_kiocb *req)
{
struct io_timeout_data *io = req->async_data;
int ret;
ret = hrtimer_try_to_cancel(&io->timer);
if (ret != -1) {
atomic_set(&req->ctx->cq_timeouts,
atomic_read(&req->ctx->cq_timeouts) + 1);
list_del_init(&req->timeout.list);
io_cqring_fill_event(req, 0);
io_put_req_deferred(req, 1);
}
}
static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
{
struct io_ring_ctx *ctx = req->ctx;
if (!tsk || req->task == tsk)
return true;
if (ctx->flags & IORING_SETUP_SQPOLL) {
if (ctx->sq_data && req->task == ctx->sq_data->thread)
return true;
}
return false;
}
/*
* Returns true if we found and killed one or more timeouts
*/
static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
{
struct io_kiocb *req, *tmp;
int canceled = 0;
spin_lock_irq(&ctx->completion_lock);
list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
if (io_task_match(req, tsk)) {
io_kill_timeout(req);
canceled++;
}
}
spin_unlock_irq(&ctx->completion_lock);
return canceled != 0;
}
static void __io_queue_deferred(struct io_ring_ctx *ctx)
{
do {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
struct io_kiocb *link;
if (req_need_defer(de->req, de->seq))
break;
list_del_init(&de->list);
/* punt-init is done before queueing for defer */
link = __io_queue_async_work(de->req);
if (link) {
__io_queue_linked_timeout(link);
/* drop submission reference */
io_put_req_deferred(link, 1);
}
kfree(de);
} while (!list_empty(&ctx->defer_list));
}
static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->timeout_list)) {
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
struct io_kiocb, timeout.list);
if (io_is_timeout_noseq(req))
break;
if (req->timeout.target_seq != ctx->cached_cq_tail
- atomic_read(&ctx->cq_timeouts))
break;
list_del_init(&req->timeout.list);
io_kill_timeout(req);
}
}
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
io_flush_timeouts(ctx);
__io_commit_cqring(ctx);
if (unlikely(!list_empty(&ctx->defer_list)))
__io_queue_deferred(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
unsigned tail;
tail = ctx->cached_cq_tail;
/*
* writes to the cq entry need to come after reading head; the
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
return NULL;
ctx->cached_cq_tail++;
return &rings->cqes[tail & ctx->cq_mask];
}
static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
if (!ctx->cq_ev_fd)
return false;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return false;
if (!ctx->eventfd_async)
return true;
return io_wq_current_is_worker();
}
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
if (io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
}
static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
{
if (list_empty(&ctx->cq_overflow_list)) {
clear_bit(0, &ctx->sq_check_overflow);
clear_bit(0, &ctx->cq_check_overflow);
ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
}
static inline bool io_match_files(struct io_kiocb *req,
struct files_struct *files)
{
if (!files)
return true;
if ((req->flags & REQ_F_WORK_INITIALIZED) &&
(req->work.flags & IO_WQ_WORK_FILES))
return req->work.identity->files == files;
return false;
}
/* Returns true if there are no backlogged entries after the flush */
static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
struct files_struct *files)
{
struct io_rings *rings = ctx->rings;
struct io_kiocb *req, *tmp;
struct io_uring_cqe *cqe;
unsigned long flags;
LIST_HEAD(list);
if (!force) {
if (list_empty_careful(&ctx->cq_overflow_list))
return true;
if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
rings->cq_ring_entries))
return false;
}
spin_lock_irqsave(&ctx->completion_lock, flags);
/* if force is set, the ring is going away. always drop after that */
if (force)
ctx->cq_overflow_flushed = 1;
cqe = NULL;
list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
if (tsk && req->task != tsk)
continue;
if (!io_match_files(req, files))
continue;
cqe = io_get_cqring(ctx);
if (!cqe && !force)
break;
list_move(&req->compl.list, &list);
if (cqe) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, req->result);
WRITE_ONCE(cqe->flags, req->compl.cflags);
} else {
ctx->cached_cq_overflow++;
WRITE_ONCE(ctx->rings->cq_overflow,
ctx->cached_cq_overflow);
}
}
io_commit_cqring(ctx);
io_cqring_mark_overflow(ctx);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
while (!list_empty(&list)) {
req = list_first_entry(&list, struct io_kiocb, compl.list);
list_del(&req->compl.list);
io_put_req(req);
}
return cqe != NULL;
}
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
trace_io_uring_complete(ctx, req->user_data, res);
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
cqe = io_get_cqring(ctx);
if (likely(cqe)) {
WRITE_ONCE(cqe->user_data, req->user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
} else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* then we cannot store the request for later flushing, we need
* to drop it on the floor.
*/
ctx->cached_cq_overflow++;
WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
} else {
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
io_clean_op(req);
req->result = res;
req->compl.cflags = cflags;
refcount_inc(&req->refs);
list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
}
}
static void io_cqring_fill_event(struct io_kiocb *req, long res)
{
__io_cqring_fill_event(req, res, 0);
}
static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
{
struct io_ring_ctx *