From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:04 -0700 Subject: kthread: add a mechanism to store cgroup info kthread usually runs jobs on behalf of other threads. The jobs should be charged to cgroup of original threads. But the jobs run in a kthread, where we lose the cgroup context of original threads. The patch adds a machanism to record cgroup info of original threads in kthread context. Later we can retrieve the cgroup info and attach the cgroup info to jobs. Since this mechanism is only required by kthread, we store the cgroup info in kthread data instead of generic task_struct. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kthread.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 82e197eeac91..bd4369c83dfb 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -3,6 +3,7 @@ /* Simple interface for creating and stopping kernel threads without mess. */ #include #include +#include __printf(4, 5) struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), @@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +#ifdef CONFIG_CGROUPS +void kthread_associate_blkcg(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *kthread_blkcg(void); +#else +static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } +static inline struct cgroup_subsys_state *kthread_blkcg(void) +{ + return NULL; +} +#endif #endif /* _LINUX_KTHREAD_H */ -- cgit v1.2.3 From af551fb3be26a22b7a6b345b3b7e7e6acfc41758 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:05 -0700 Subject: blkcg: delete unused APIs Nobody uses the APIs right now. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- include/linux/blk-cgroup.h | 12 ------------ 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 275c91c99516..9c75f58f6a50 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -522,13 +522,11 @@ do { \ #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); -int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } -static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9d92153dd856..0cfa8d2ef4d6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -235,12 +235,6 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) return task_blkcg(current); } -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return task_get_css(task, io_cgrp_id); -} - /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -735,12 +729,6 @@ struct blkcg_policy { #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) -static inline struct cgroup_subsys_state * -task_get_blkcg_css(struct task_struct *task) -{ - return NULL; -} - #ifdef CONFIG_BLOCK static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -- cgit v1.2.3 From 902ec5b6de0678ac2effba0faaafdd1c3e7fcf2e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:06 -0700 Subject: block: make blkcg aware of kthread stored original cgroup info bio_blkcg is the only API to get cgroup info for a bio right now. If bio_blkcg finds current task is a kthread and has original blkcg associated, it will use the css instead of associating the bio to current task. This makes it possible that kthread dispatches bios on behalf of other threads. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0cfa8d2ef4d6..f57e54d64529 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -19,6 +19,7 @@ #include #include #include +#include /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) @@ -223,16 +224,16 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) return css ? container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, io_cgrp_id)); -} - static inline struct blkcg *bio_blkcg(struct bio *bio) { + struct cgroup_subsys_state *css; + if (bio && bio->bi_css) return css_to_blkcg(bio->bi_css); - return task_blkcg(current); + css = kthread_blkcg(); + if (css) + return css_to_blkcg(css); + return css_to_blkcg(task_css(current, io_cgrp_id)); } /** -- cgit v1.2.3 From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2017 11:02:12 -0700 Subject: block: fix a build error The code is only for blkcg not for all cgroups Fixes: d4478e92d618 ("block/loop: make loop cgroup aware") Reported-by: kbuild test robot Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kthread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index bd4369c83dfb..fb201842c635 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -199,7 +199,7 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP void kthread_associate_blkcg(struct cgroup_subsys_state *css); struct cgroup_subsys_state *kthread_blkcg(void); #else -- cgit v1.2.3 From 640ab98fb3629c0f8417b9b2532eca596495f3bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 05:40:16 -0600 Subject: buffer: have alloc_page_buffers() use __GFP_NOFAIL Instead of adding weird retry logic in that function, utilize __GFP_NOFAIL to ensure that the vm takes care of handling any potential retries appropriately. This means we don't have to call free_more_memory() from here. Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae555eccf..ae2d25f01b98 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -156,7 +156,7 @@ void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); int try_to_free_buffers(struct page *); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - int retry); + bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 9ba4b2dfafaa711b41cc2102b0e9a529f3981218 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Sep 2017 08:58:25 -0600 Subject: fs: kill 'nr_pages' argument from wakeup_flusher_threads() Everybody is passing in 0 now, let's get rid of the argument. Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/writeback.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d5815794416c..1f9c6db5e29a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,7 +189,7 @@ bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); +void wakeup_flusher_threads(enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ -- cgit v1.2.3 From 47410d88f665486bf91f02242ab5d5692b8887ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:25:03 -0600 Subject: writeback: remove 'range_cyclic' argument for wb_start_writeback() All the callers pass in 'true' for range_cyclic, so kill the argument. Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 854e1bdd0b2a..0f63493de9e7 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -39,7 +39,7 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) } void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - bool range_cyclic, enum wb_reason reason); + enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From 595043e5f9ef1d8263bd9fda215cade489227491 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:26:59 -0600 Subject: writeback: provide a wakeup_flusher_threads_bdi() Similar to wakeup_flusher_threads(), except that we only wake up the flusher threads on the specified backing device. No functional changes in this patch. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/writeback.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1f9c6db5e29a..9c0091678af4 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -190,6 +190,8 @@ bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); +void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, + enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ -- cgit v1.2.3 From 9dfb176fae57a1dea68531fd25e867037e4d9bac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:28:55 -0600 Subject: writeback: make wb_start_writeback() static We don't have any callers outside of fs-writeback.c anymore, make it private. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0f63493de9e7..157e950a70dc 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,8 +38,6 @@ static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); } -void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); -- cgit v1.2.3 From aac8d41cd438f25bf3110fc6b98f1d16d7dbc169 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Sep 2017 11:31:55 -0600 Subject: writeback: only allow one inflight and pending full flush When someone calls wakeup_flusher_threads() or wakeup_flusher_threads_bdi(), they schedule writeback of all dirty pages in the system (or on that bdi). If we are tight on memory, we can get tons of these queued from kswapd/vmscan. This causes (at least) two problems: 1) We consume a ton of memory just allocating writeback work items. We've seen as much as 600 million of these writeback work items pending. That's a lot of memory to pointlessly hold hostage, while the box is under memory pressure. 2) We spend so much time processing these work items, that we introduce a softlockup in writeback processing. This is because each of the writeback work items don't end up doing any work (it's hard when you have millions of identical ones coming in to the flush machinery), so we just sit in a tight loop pulling work items and deleting/freeing them. Fix this by adding a 'start_all' bit to the writeback structure, and set that when someone attempts to flush all dirty pages. The bit is cleared when we start writeback on that work item. If the bit is already set when we attempt to queue !nr_pages writeback, then we simply ignore it. This provides us one full flush in flight, with one pending as well, and makes for more efficient handling of this type of writeback. Acked-by: Johannes Weiner Tested-by: Chris Mason Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 866c433e7d32..420de5c7c7f9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -24,6 +24,7 @@ enum wb_state { WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ + WB_start_all, /* nr_pages == 0 (all) work pending */ }; enum wb_congested_state { -- cgit v1.2.3 From eaefd5abf6b095bfc55eb745bdf7c42cf66790eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:42 -0700 Subject: nvme-fc: add uevent for auto-connect To support auto-connecting to FC-NVME devices upon their dynamic appearance, add a uevent that can kick off connection scripts. uevent is posted against the fc_udev device. patch set tested with the following rule to kick an nvme-cli connect-all for the FC initiator and FC target ports. This is just an example for testing and not intended for real life use. ACTION=="change", SUBSYSTEM=="fc", ENV{FC_EVENT}=="nvmediscovery", \ ENV{NVMEFC_HOST_TRADDR}=="*", ENV{NVMEFC_TRADDR}=="*", \ RUN+="/bin/sh -c '/usr/local/sbin/nvme connect-all --transport=fc --host-traddr=$env{NVMEFC_HOST_TRADDR} --traddr=$env{NVMEFC_TRADDR} >> /tmp/nvme_fc.log'" I will post proposed udev/systemd scripts for possible kernel support. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index a726f96010d5..4ea03b9a5c8c 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -446,6 +446,8 @@ int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); +void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); + /* -- cgit v1.2.3 From 85009b4f5f0399669a44f07cb9a5622c0e71d419 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 02:09:06 -0600 Subject: writeback: eliminate work item allocation in bd_start_writeback() Handle start-all writeback like we do periodic or kupdate style writeback - by marking the bdi_writeback as needing a full flush, and simply waking the thread. This eliminates the need to allocate and queue a specific work item just for this purpose. After this change, we truly only ever have one of them running at any point in time. We mark the need to start all flushes, and the writeback thread will clear it once it has processed the request. Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 23 +++++++++++++++++++++++ include/linux/writeback.h | 22 ---------------------- 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 420de5c7c7f9..b7c7be6f5986 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -44,6 +44,28 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * why some writeback work was initiated + */ +enum wb_reason { + WB_REASON_BACKGROUND, + WB_REASON_VMSCAN, + WB_REASON_SYNC, + WB_REASON_PERIODIC, + WB_REASON_LAPTOP_TIMER, + WB_REASON_FREE_MORE_MEM, + WB_REASON_FS_FREE_SPACE, + /* + * There is no bdi forker thread any more and works are done + * by emergency worker, however, this is TPs userland visible + * and we'll be exposing exactly the same information, + * so it has a mismatch name. + */ + WB_REASON_FORKER_THREAD, + + WB_REASON_MAX, +}; + /* * For cgroup writeback, multiple wb's may map to the same blkcg. Those * wb's can operate mostly independently but should share the congested @@ -116,6 +138,7 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + enum wb_reason start_all_reason; spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9c0091678af4..dd1d2c23f743 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -41,28 +41,6 @@ enum writeback_sync_modes { WB_SYNC_ALL, /* Wait on every mapping */ }; -/* - * why some writeback work was initiated - */ -enum wb_reason { - WB_REASON_BACKGROUND, - WB_REASON_VMSCAN, - WB_REASON_SYNC, - WB_REASON_PERIODIC, - WB_REASON_LAPTOP_TIMER, - WB_REASON_FREE_MORE_MEM, - WB_REASON_FS_FREE_SPACE, - /* - * There is no bdi forker thread any more and works are done - * by emergency worker, however, this is TPs userland visible - * and we'll be exposing exactly the same information, - * so it has a mismatch name. - */ - WB_REASON_FORKER_THREAD, - - WB_REASON_MAX, -}; - /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised -- cgit v1.2.3 From 5fdee2127faa77c9c91862ad5e001dfab7013e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 21:22:52 +0200 Subject: block: remove QUEUE_FLAG_STACKABLE We already have a queue_is_rq_based helper to check if a request_queue is request based, so we can remove the flag for it. Acked-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 02fa42d24b52..9fb71fc7d0e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -609,7 +609,6 @@ struct request_queue { #define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ #define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ @@ -633,12 +632,10 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_POLL)) @@ -722,8 +719,6 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) -#define blk_queue_stackable(q) \ - test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) -- cgit v1.2.3 From 775d3a35dc3e13de55ec0e061c59e36faa7dd7f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Oct 2017 08:15:15 -0600 Subject: backing-dev: kill unused pdflush_proc_obsolete() After commit b35bd0d9f8a8, pdflush_proc_obsolete() is no longer used. Kill the function and declaration. Reported-by: Rakesh Pandit Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 157e950a70dc..872afa41abc2 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -172,8 +172,6 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); -int pdflush_proc_obsolete(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { -- cgit v1.2.3 From 8264c3214f28b52b399d9e03bfa7feec275a0d71 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Mon, 9 Oct 2017 13:34:41 +0300 Subject: writeback: merge try_to_writeback_inodes_sb_nr() into caller Since commit 925a6efb8ff0c ("Btrfs: stop using try_to_writeback_inodes_sb_nr to flush delalloc") this function hasn't been used outside so stop exporting it. In addition we merge it into try_to_writeback_inodes_sb() which is the only caller. Also change return type of try_to_writeback_inodes_sb to void as the only user ext4 doesn't care. Reviewed-by: Jan Kara Signed-off-by: Rakesh Pandit Signed-off-by: Jens Axboe --- include/linux/writeback.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index dd1d2c23f743..e15ec14085ad 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -163,9 +163,7 @@ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); -bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); -bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, - enum wb_reason reason); +void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, -- cgit v1.2.3 From eca8b53a6769e60d6d8240d71202d73b0af81901 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 6 Oct 2017 17:55:59 -0700 Subject: blk-stat: delete useless code Fix two issues: - the per-cpu stat flush is unnecessary, nobody uses per-cpu stat except sum it to global stat. We can do the calculation there. The flush just wastes cpu time. - some fields are signed int/s64. I don't see the point. Reviewed-by: Omar Sandoval Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a2d2aa709cef..3385c89f402e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_rq_stat { - s64 mean; + u64 mean; u64 min; u64 max; - s32 nr_samples; - s32 nr_batch; + u32 nr_samples; u64 batch; }; -- cgit v1.2.3 From 7f66721a7d5bf6251f9c49aada9200c61ddcecc5 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Thu, 12 Oct 2017 19:58:10 +0300 Subject: fs/block_dev: remove vfs_msg() interface Replaced by pr_err usage in commit ef51042472f5 ("block, dax: move "select DAX" from BLOCK to FS_DAX") Signed-off-by: Rakesh Pandit Acked-by: Ross Zwisler Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9fb71fc7d0e8..72637028f3c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -917,17 +917,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) } #endif -#ifdef CONFIG_PRINTK -#define vfs_msg(sb, level, fmt, ...) \ - __vfs_msg(sb, level, fmt, ##__VA_ARGS__) -#else -#define vfs_msg(sb, level, fmt, ...) \ -do { \ - no_printk(fmt, ##__VA_ARGS__); \ - __vfs_msg(sb, "", " "); \ -} while (0) -#endif - extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); -- cgit v1.2.3 From 900148296b78c61aa8c443dc594c0da968c3be53 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:50 +0200 Subject: lightnvm: prevent target type module removal when in use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If target type module e.g. pblk here is unloaded (rmmod) while module is in use (after creating target) system crashes. We fix this by using module API refcnt. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7dfa56ebbc6d..7b80ac911d26 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -460,6 +460,7 @@ struct nvm_tgt_type { /* For internal use */ struct list_head list; + struct module *owner; }; extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); -- cgit v1.2.3 From ef56b9ce562753cacf518f081a4ff3227efdab25 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:30 +0200 Subject: lightnvm: remove unused argument from nvm_set_tgt_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vblk isn't being used anyway and if we ever have a usecase we can introduce this again. This makes the logic easier and removes unnecessary checks. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7b80ac911d26..32ec35b5e18b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -481,7 +481,7 @@ extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int, int); + const struct ppa_addr *, int); extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From eb6f168f97438bf1cac8b9b1301c662eace9e39f Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:31 +0200 Subject: lightnvm: remove stale extern and unused exported symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all exported symbols are being used outside core and there were some stale entries in lightnvm.h Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 32ec35b5e18b..4f0e4a0fd204 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -463,8 +463,6 @@ struct nvm_tgt_type { struct module *owner; }; -extern struct nvm_tgt_type *nvm_find_target_type(const char *, int); - extern int nvm_register_tgt_type(struct nvm_tgt_type *); extern void nvm_unregister_tgt_type(struct nvm_tgt_type *); @@ -480,9 +478,6 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, - const struct ppa_addr *, int); -extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); @@ -491,8 +486,6 @@ extern void nvm_end_io(struct nvm_rq *); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); -extern int nvm_dev_factory(struct nvm_dev *, int flags); - extern void nvm_part_to_tgt(struct nvm_dev *, sector_t *, int); #else /* CONFIG_NVM */ -- cgit v1.2.3 From 1a94b2d484677dc559c96251dd0e7c7b8811c378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:47 +0200 Subject: lightnvm: implement generic path for sync I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a generic path for sending sync I/O on LightNVM. This allows to reuse the standard synchronous path trough blk_execute_rq(), instead of implementing a wait_for_completion on the target side (e.g., pblk). Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4f0e4a0fd204..b7f111ff4d3b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -56,6 +56,7 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -69,6 +70,7 @@ struct nvm_dev_ops { nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; + nvm_submit_io_sync_fn *submit_io_sync; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; @@ -477,6 +479,7 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); +extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -- cgit v1.2.3 From 149e10f8ff71439014dff97987e90e87e2684a16 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:06 +0300 Subject: block: introduce blk_mq_tagset_iter Iterator helper to apply a function on all the tags in a given tagset. export it as it will be used outside the block layer later on. Reviewed-by: Bart Van Assche Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..6bc29f73a9aa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,6 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); +int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, + int (reinit_request)(void *, struct request *)); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, int (reinit_request)(void *, struct request *)); -- cgit v1.2.3 From dab7487bdf63c6f2d70aac604494d023b189a9fd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:08 +0300 Subject: block: remove blk_mq_reinit_tagset No callers left. Reviewed-by: Jens Axboe Reviewed-by: Bart Van Assche Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6bc29f73a9aa..cfd64e500d82 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -261,8 +261,6 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, int (reinit_request)(void *, struct request *)); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, - int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3 From 8ac0d9a81edf2ef4a2268b65b802a6b856dc77e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Oct 2017 12:35:02 -0600 Subject: elevator: allow name aliases Since we now lookup elevator types with the appropriate multiqueue capability, allow schedulers to register with an alias alongside the real name. This is in preparation for allowing 'mq-deadline' to register an alias of 'deadline' as well. Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 5bc8f8682a3e..6df8b14f1f6a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -144,6 +144,7 @@ struct elevator_type size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; char elevator_name[ELV_NAME_MAX]; + const char *elevator_alias; struct module *elevator_owner; bool uses_mq; #ifdef CONFIG_BLK_DEBUG_FS -- cgit v1.2.3 From ecad0d2cb8a7997afdc95031ee3328b997aba5c4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 23 Oct 2017 15:11:36 -0700 Subject: nvme-fc: remove NVME_FC_MAX_SEGMENTS The define is an arbitrary limit to the io size on the initiator, capping the io to 1MB-4KB. Remove the define from the transport. I/O size will solely be limited by the LLDD sg limits. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4ea03b9a5c8c..2be4db353937 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -102,8 +102,6 @@ enum nvmefc_fcp_datadir { }; -#define NVME_FC_MAX_SEGMENTS 256 - /** * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport * to LLDD in order to perform a NVME FCP IO operation. -- cgit v1.2.3 From 7930d0a00ff5dbcc80f793d1a7a6b8de4e591f1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:27 +0800 Subject: sbitmap: introduce __sbitmap_for_each_set() For blk-mq, we need to be able to iterate software queues starting from any queue in a round robin fashion, so introduce this helper. Reviewed-by: Omar Sandoval Cc: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 64 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a1904aadbc45..0dcc60e820de 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb); */ bool sbitmap_any_bit_clear(const struct sbitmap *sb); +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** - * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. @@ -222,35 +226,61 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ -static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, - void *data) +static inline void __sbitmap_for_each_set(struct sbitmap *sb, + unsigned int start, + sb_for_each_fn fn, void *data) { - unsigned int i; + unsigned int index; + unsigned int nr; + unsigned int scanned = 0; - for (i = 0; i < sb->map_nr; i++) { - struct sbitmap_word *word = &sb->map[i]; - unsigned int off, nr; + if (start >= sb->depth) + start = 0; + index = SB_NR_TO_INDEX(sb, start); + nr = SB_NR_TO_BIT(sb, start); - if (!word->word) - continue; + while (scanned < sb->depth) { + struct sbitmap_word *word = &sb->map[index]; + unsigned int depth = min_t(unsigned int, word->depth - nr, + sb->depth - scanned); - nr = 0; - off = i << sb->shift; + scanned += depth; + if (!word->word) + goto next; + + /* + * On the first iteration of the outer loop, we need to add the + * bit offset back to the size of the word for find_next_bit(). + * On all other iterations, nr is zero, so this is a noop. + */ + depth += nr; while (1) { - nr = find_next_bit(&word->word, word->depth, nr); - if (nr >= word->depth) + nr = find_next_bit(&word->word, depth, nr); + if (nr >= depth) break; - - if (!fn(sb, off + nr, data)) + if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } +next: + nr = 0; + if (++index >= sb->map_nr) + index = 0; } } -#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) -#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) +/** + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + __sbitmap_for_each_set(sb, 0, fn, data); +} static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) -- cgit v1.2.3 From de1482974080ec9ef414bf048b2646b246b63f6e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:29 +0800 Subject: blk-mq: introduce .get_budget and .put_budget in blk_mq_ops For SCSI devices, there is often a per-request-queue depth, which needs to be respected before queuing one request. Currently blk-mq always dequeues the request first, then calls .queue_rq() to dispatch the request to lld. One obvious issue with this approach is that I/O merging may not be successful, because when the per-request-queue depth can't be respected, .queue_rq() has to return BLK_STS_RESOURCE, and then this request has to stay in hctx->dispatch list. This means it never gets a chance to be merged with other IO. This patch introduces .get_budget and .put_budget callback in blk_mq_ops, then we can try to get reserved budget first before dequeuing request. If the budget for queueing I/O can't be satisfied, we don't need to dequeue request at all. Hence the request can be left in the IO scheduler queue, for more merging opportunities. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..901457df3d64 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -90,6 +90,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); +typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); @@ -111,6 +113,15 @@ struct blk_mq_ops { */ queue_rq_fn *queue_rq; + /* + * Reserve budget before queue request, once .queue_rq is + * run, it is driver's responsibility to release the + * reserved budget. Also we have to handle failure case + * of .get_budget for avoiding I/O deadlock. + */ + get_budget_fn *get_budget; + put_budget_fn *put_budget; + /* * Called on request timeout */ -- cgit v1.2.3 From b347689ffbca745ac457ee27400ce1affd571c6f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:30 +0800 Subject: blk-mq-sched: improve dispatching from sw queue SCSI devices use host-wide tagset, and the shared driver tag space is often quite big. However, there is also a queue depth for each lun( .cmd_per_lun), which is often small, for example, on both lpfc and qla2xxx, .cmd_per_lun is just 3. So lots of requests may stay in sw queue, and we always flush all belonging to same hw queue and dispatch them all to driver. Unfortunately it is easy to cause queue busy because of the small .cmd_per_lun. Once these requests are flushed out, they have to stay in hctx->dispatch, and no bio merge can happen on these requests, and sequential IO performance is harmed. This patch introduces blk_mq_dequeue_from_ctx for dequeuing a request from a sw queue, so that we can dispatch them in scheduler's way. We can then avoid dequeueing too many requests from sw queue, since we don't flush ->dispatch completely. This patch improves dispatching from sw queue by using the .get_budget and .put_budget callbacks. Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 901457df3d64..e5e6becd57d3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned int nr_ctx; -- cgit v1.2.3 From ac7fe82b6fcf77e757e88005c33b8147c1b7b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:15 -0700 Subject: nvme-fc: add a dev_loss_tmo field to the remoteport Add a dev_loss_tmo value, paralleling the SCSI FC transport, for device connectivity loss. The transport initializes the value in the nvme_fc_register_remoteport() call. If the value is not set, a default of 60s is set. Add a new routine to the api, nvme_fc_set_remoteport_devloss() routine, which allows the lldd to dynamically update the value on an existing remoteport. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 2be4db353937..496ff759f84c 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -40,6 +40,8 @@ * @node_name: FC WWNN for the port * @port_name: FC WWPN for the port * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. Used only on a remoteport. * * Initialization values for dynamic port fields: * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must @@ -50,6 +52,7 @@ struct nvme_fc_port_info { u64 port_name; u32 port_role; u32 port_id; + u32 dev_loss_tmo; }; @@ -200,6 +203,9 @@ enum nvme_fc_obj_state { * The length of the buffer corresponds to the local_priv_sz * value specified in the nvme_fc_port_template supplied by * the LLDD. + * @dev_loss_tmo: maximum delay for reconnects to an association on + * this device. To modify, lldd must call + * nvme_fc_set_remoteport_devloss(). * * Fields with dynamic values. Values may change base on link state. LLDD * may reference fields directly to change them. Initialized by the @@ -257,10 +263,9 @@ struct nvme_fc_remote_port { u32 port_role; u64 node_name; u64 port_name; - struct nvme_fc_local_port *localport; - void *private; + u32 dev_loss_tmo; /* dynamic fields */ u32 port_id; @@ -446,6 +451,8 @@ int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); void nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport); +int nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *remoteport, + u32 dev_loss_tmo); /* -- cgit v1.2.3 From 8977f563845bdd61fc61c4dfb399270b7d5667c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:48 +0300 Subject: block: move REQ_NOWAIT This flag should be before the operation-specific REQ_NOUNMAP bit. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3385c89f402e..c4c6c8bced2e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -224,11 +224,11 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + __REQ_NOWAIT, /* Don't wait if request will block */ /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ - __REQ_NOWAIT, /* Don't wait if request will block */ __REQ_NR_BITS, /* stops here */ }; @@ -245,9 +245,9 @@ enum req_flag_bits { #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) -#define REQ_NOWAIT (1ULL << __REQ_NOWAIT) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From 96222bcc732d0504363dc772637c50e53b4bd41e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:49 +0300 Subject: block: add REQ_DRV bit Set aside a bit in the request/bio flags for driver use. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c4c6c8bced2e..1b04085255fb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -229,6 +229,9 @@ enum req_flag_bits { /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ + /* for driver use */ + __REQ_DRV, + __REQ_NR_BITS, /* stops here */ }; @@ -249,6 +252,8 @@ enum req_flag_bits { #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) +#define REQ_DRV (1ULL << __REQ_DRV) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From f421e1d9ade4e1b88183e54425cf50e390d16a7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:50 +0300 Subject: block: provide a direct_make_request helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper allows reinserting a bio into a new queue without much overhead, but requires all queue limits to be the same for the upper and lower queues, and it does not provide any recursion preventions. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72637028f3c9..eda3d25c0f68 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -920,6 +920,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); +extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); -- cgit v1.2.3 From ef71de8b15d891b27b8c983a9a8972b11cb4576a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:51 +0300 Subject: block: add a blk_steal_bios helper This helpers allows to bounce steal the uncompleted bios from a request so that they can be reissued on another path. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eda3d25c0f68..fddda6a1f9b5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1094,6 +1094,8 @@ extern struct request *blk_peek_request(struct request_queue *q); extern void blk_start_request(struct request *rq); extern struct request *blk_fetch_request(struct request_queue *q); +void blk_steal_bios(struct bio_list *list, struct request *rq); + /* * Request completion related functions. * -- cgit v1.2.3 From 517bf3c306bad4fe0da631f90ae2ec40924dee2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:52 +0300 Subject: block: don't look at the struct device dev_t in disk_devt The hidden gendisks introduced in the next patch need to keep the dev field in their struct device empty so that udev won't try to create block device nodes for them. To support that rewrite disk_devt to look at the major and first_minor fields in the gendisk itself instead of looking into the struct device. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ea652bfcd675..5c0ed5db33c2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -234,7 +234,7 @@ static inline bool disk_part_scan_enabled(struct gendisk *disk) static inline dev_t disk_devt(struct gendisk *disk) { - return disk_to_dev(disk)->devt; + return MKDEV(disk->major, disk->first_minor); } static inline dev_t part_devt(struct hd_struct *part) -- cgit v1.2.3 From 8ddcd653257c18a669fcb75ee42c37054908e0d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:53 +0300 Subject: block: introduce GENHD_FL_HIDDEN With this flag a driver can create a gendisk that can be used for I/O submission inside the kernel, but which is not registered as user facing block device. This will be useful for the NVMe multipath implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5c0ed5db33c2..93aae3476f58 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -140,6 +140,7 @@ struct hd_struct { #define GENHD_FL_NATIVE_CAPACITY 128 #define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE 256 #define GENHD_FL_NO_PART_SCAN 512 +#define GENHD_FL_HIDDEN 1024 enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ -- cgit v1.2.3 From ea435e1b9392a33deceaea2a16ebaa3397bead93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:54 +0300 Subject: block: add a poll_fn callback to struct request_queue That we we can also poll non blk-mq queues. Mostly needed for the NVMe multipath code, but could also be useful elsewhere. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fddda6a1f9b5..225617dd0a3f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -266,6 +266,7 @@ struct blk_queue_ctx; typedef void (request_fn_proc) (struct request_queue *q); typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); +typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); @@ -408,6 +409,7 @@ struct request_queue { request_fn_proc *request_fn; make_request_fn *make_request_fn; + poll_q_fn *poll_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; softirq_done_fn *softirq_done_fn; @@ -975,7 +977,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { -- cgit v1.2.3 From 88022d7201e96b43f1754b0358fc6bcd8dbdcde1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 5 Nov 2017 02:21:12 +0800 Subject: blk-mq: don't handle failure in .get_budget It is enough to just check if we can get the budget via .get_budget(). And we don't need to deal with device state change in .get_budget(). For SCSI, one issue to be fixed is that we have to call scsi_mq_uninit_cmd() to free allocated ressources if SCSI device fails to handle the request. And it isn't enough to simply call blk_mq_end_request() to do that if this request is marked as RQF_DONTPREP. Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2e3079eecdd..674641527da7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -92,7 +92,7 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); -typedef blk_status_t (get_budget_fn)(struct blk_mq_hw_ctx *); +typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); -- cgit v1.2.3 From 84fef62d135b6e47b52f4e9280b5dbc5bb0050ba Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:32 -0700 Subject: nvme: check admin passthru command effects The NVMe standard provides a command effects log page so the host may be aware of special requirements it may need to do for a particular command. For example, the command may need to run with IO quiesced to prevent timeouts or undefined behavior, or it may change the logical block formats that determine how the host needs to construct future commands. This patch saves the nvme command effects log page if the controller supports it, and performs appropriate actions before and after an admin passthrough command is completed. If the controller does not support the command effects log page, the driver will define the effects for known opcodes. The nvme format and santize are the only commands in this patch with known effects. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9310ce77d8e1..fd1d4508a612 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -267,6 +267,7 @@ enum { NVME_CTRL_OACS_SEC_SUPP = 1 << 0, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, }; struct nvme_lbaf { @@ -395,6 +396,21 @@ struct nvme_fw_slot_info_log { __u8 rsvd64[448]; }; +enum { + NVME_CMD_EFFECTS_CSUPP = 1 << 0, + NVME_CMD_EFFECTS_LBCC = 1 << 1, + NVME_CMD_EFFECTS_NCC = 1 << 2, + NVME_CMD_EFFECTS_NIC = 1 << 3, + NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, +}; + +struct nvme_effects_log { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -681,6 +697,7 @@ enum nvme_admin_opcode { nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, }; enum { @@ -712,6 +729,7 @@ enum { NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CMD_EFFECTS = 0x05, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), -- cgit v1.2.3 From 83f5f7ed72f316eda4c4c3833beebfe6578926f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 Nov 2017 11:15:37 -0700 Subject: block: kill bio_kmap/kunmap_irq() There are no users of it anymore. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9c75f58f6a50..1d7e63d7505f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -573,17 +573,6 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) } #endif -static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter, - unsigned long *flags) -{ - return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags); -} -#define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) - -#define bio_kmap_irq(bio, flags) \ - __bio_kmap_irq((bio), (bio)->bi_iter, (flags)) -#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * -- cgit v1.2.3 From d004a5e7d4dd6335ce6e2044af42f5e0fbebb51d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Nov 2017 19:13:48 +0100 Subject: block: remove __bio_kmap_atomic This helper doesn't buy us much over calling kmap_atomic directly. In fact in the only caller it does a bit of useless work as the caller already has the bvec at hand, and said caller would even buggy for a multi-segment bio due to the use of this helper. So just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 1d7e63d7505f..d4eec19a6d3c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -128,18 +128,6 @@ static inline void *bio_data(struct bio *bio) */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) -/* - * queues that have highmem support enabled may still need to r