From 08e136ebd193eae7d5eff4c66d576c4a2dabdc3f Mon Sep 17 00:00:00 2001 From: Raphael Pinsonneault-Thibeault Date: Wed, 17 Dec 2025 14:00:40 -0500 Subject: loop: don't change loop device under exclusive opener in loop_set_status loop_set_status() is allowed to change the loop device while there are other openers of the device, even exclusive ones. In this case, it causes a KASAN: slab-out-of-bounds Read in ext4_search_dir(), since when looking for an entry in an inlined directory, e_value_offs is changed underneath the filesystem by loop_set_status(). Fix the problem by forbidding loop_set_status() from modifying the loop device while there are exclusive openers of the device. This is similar to the fix in loop_configure() by commit 33ec3e53e7b1 ("loop: Don't change loop device under exclusive opener") alongside commit ecbe6bc0003b ("block: use bd_prepare_to_claim directly in the loop driver"). Reported-by: syzbot+3ee481e21fd75e14c397@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=3ee481e21fd75e14c397 Tested-by: syzbot+3ee481e21fd75e14c397@syzkaller.appspotmail.com Tested-by: Yongpeng Yang Signed-off-by: Raphael Pinsonneault-Thibeault Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/block/loop.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 32a3a5b13802..ca74cc31bf07 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1225,13 +1225,24 @@ static int loop_clr_fd(struct loop_device *lo) } static int -loop_set_status(struct loop_device *lo, const struct loop_info64 *info) +loop_set_status(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, const struct loop_info64 *info) { int err; bool partscan = false; bool size_changed = false; unsigned int memflags; + /* + * If we don't hold exclusive handle for the device, upgrade to it + * here to avoid changing device under exclusive owner. 
+ */ + if (!(mode & BLK_OPEN_EXCL)) { + err = bd_prepare_to_claim(bdev, loop_set_status, NULL); + if (err) + goto out_reread_partitions; + } + err = mutex_lock_killable(&lo->lo_mutex); if (err) return err; @@ -1273,6 +1284,9 @@ out_unfreeze: } out_unlock: mutex_unlock(&lo->lo_mutex); + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(bdev, loop_set_status); +out_reread_partitions: if (partscan) loop_reread_partitions(lo); @@ -1352,7 +1366,9 @@ loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) } static int -loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) +loop_set_status_old(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, + const struct loop_info __user *arg) { struct loop_info info; struct loop_info64 info64; @@ -1360,17 +1376,19 @@ loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) if (copy_from_user(&info, arg, sizeof (struct loop_info))) return -EFAULT; loop_info64_from_old(&info, &info64); - return loop_set_status(lo, &info64); + return loop_set_status(lo, mode, bdev, &info64); } static int -loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg) +loop_set_status64(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, + const struct loop_info64 __user *arg) { struct loop_info64 info64; if (copy_from_user(&info64, arg, sizeof (struct loop_info64))) return -EFAULT; - return loop_set_status(lo, &info64); + return loop_set_status(lo, mode, bdev, &info64); } static int @@ -1549,14 +1567,14 @@ static int lo_ioctl(struct block_device *bdev, blk_mode_t mode, case LOOP_SET_STATUS: err = -EPERM; if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_status_old(lo, argp); + err = loop_set_status_old(lo, mode, bdev, argp); break; case LOOP_GET_STATUS: return loop_get_status_old(lo, argp); case LOOP_SET_STATUS64: err = -EPERM; if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) - err = loop_set_status64(lo, argp); + err = loop_set_status64(lo, mode, bdev, argp); break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); @@ -1650,8 +1668,9 @@ loop_info64_to_compat(const struct loop_info64 *info64, } static int -loop_set_status_compat(struct loop_device *lo, - const struct compat_loop_info __user *arg) +loop_set_status_compat(struct loop_device *lo, blk_mode_t mode, + struct block_device *bdev, + const struct compat_loop_info __user *arg) { struct loop_info64 info64; int ret; @@ -1659,7 +1678,7 @@ loop_set_status_compat(struct loop_device *lo, ret = loop_info64_from_compat(arg, &info64); if (ret < 0) return ret; - return loop_set_status(lo, &info64); + return loop_set_status(lo, mode, bdev, &info64); } static int @@ -1685,7 +1704,7 @@ static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, switch(cmd) { case LOOP_SET_STATUS: - err = loop_set_status_compat(lo, + err = loop_set_status_compat(lo, mode, bdev, (const struct compat_loop_info __user *)arg); break; case LOOP_GET_STATUS: -- cgit v1.2.3 From 7d121d701d58a92f26decb10da1d04a88b74519d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 6 Jan 2026 06:26:57 -0800 Subject: blk-rq-qos: Remove unlikely() hints from QoS checks The unlikely() annotations on QUEUE_FLAG_QOS_ENABLED checks are counterproductive. Writeback throttling (WBT) might be enabled by default, mainly because CONFIG_BLK_WBT_MQ defaults to 'y'. Branch profiling on Meta servers, which have WBT enabled, confirms 100% misprediction rates on these checks. 
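For reference, unlikely() wraps the compiler builtin __builtin_expect() and asks for code laid out on the assumption that the condition is almost always false. A minimal, self-contained C sketch of the effect (hypothetical function and variable names, not code from this patch):

#include <stdbool.h>
#include <stdio.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

/* Model of the hot-path check: when qos_enabled is true on most
 * systems (WBT enabled by default), the hint is contradicted on
 * every call, so the "fast" layout the compiler chose is never
 * the path actually taken. */
static void rq_qos_hook(bool qos_enabled, long *work)
{
	if (unlikely(qos_enabled))
		(*work)++;
}

int main(void)
{
	long work = 0;

	for (int i = 0; i < 1000000; i++)
		rq_qos_hook(true, &work);	/* hint wrong each time */
	printf("work=%ld\n", work);
	return 0;
}
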
Remove the unlikely() annotations to let the CPU's branch predictor learn the actual behavior, potentially improving I/O path performance. Signed-off-by: Breno Leitao Signed-off-by: Jens Axboe --- block/blk-rq-qos.h | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index b538f2c0febc..a747a504fe42 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -112,29 +112,26 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { - if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && - q->rq_qos) + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && - q->rq_qos && !blk_rq_is_passthrough(rq)) + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && + q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { - if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && - q->rq_qos) + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { - if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && - q->rq_qos) + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } @@ -162,8 +159,7 @@ static inline void rq_qos_done_bio(struct bio *bio) static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && - q->rq_qos) { + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } @@ -172,16 +168,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { - if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && - q->rq_qos) + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && - q->rq_qos) { + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } @@ -189,8 +183,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq, static inline void rq_qos_queue_depth_changed(struct request_queue *q) { - if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) && - q->rq_qos) + if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } -- cgit v1.2.3 From 6acd4ac5f8f0ec9b946875553e52907700bcfc77 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 6 Jan 2026 13:08:37 -0700 Subject: block: don't merge bios with different app_tags nvme_set_app_tag() uses the app_tag value from the bio_integrity_payload of the struct request's first bio. This assumes all the request's bios have the same app_tag. 
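A hedged sketch of the shape of that assumption (simplified userspace stand-ins for the kernel structures, not the real definitions from include/linux/bio.h):

#include <stdint.h>

struct bio_integrity_payload {
	uint16_t app_tag;
	unsigned short bip_flags;
};

struct bio {
	struct bio_integrity_payload *bi_integrity;
	struct bio *bi_next;	/* further bios merged into the request */
};

struct request {
	struct bio *bio;	/* first bio of the merged chain */
};

/* The app_tag programmed into the device command is read from the
 * first bio only, which is correct only if every bio merged into
 * the request carries the same tag. */
static uint16_t request_app_tag(const struct request *req)
{
	return req->bio->bi_integrity->app_tag;
}
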
However, it is possible for bios with different app_tag values to be merged into a single request. Add a check in blk_integrity_merge_{bio,rq}() to prevent the merging of bios/requests with different app_tag values if BIP_CHECK_APPTAG is set. Signed-off-by: Caleb Sander Mateos Fixes: 3d8b5a22d404 ("block: add support to pass user meta buffer") Signed-off-by: Jens Axboe --- block/blk-integrity.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 9b27963680dc..964eebbee14d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -140,14 +140,21 @@ EXPORT_SYMBOL_GPL(blk_rq_integrity_map_user); bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, struct request *next) { + struct bio_integrity_payload *bip, *bip_next; + if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0) return true; if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0) return false; - if (bio_integrity(req->bio)->bip_flags != - bio_integrity(next->bio)->bip_flags) + bip = bio_integrity(req->bio); + bip_next = bio_integrity(next->bio); + if (bip->bip_flags != bip_next->bip_flags) + return false; + + if (bip->bip_flags & BIP_CHECK_APPTAG && + bip->app_tag != bip_next->app_tag) return false; if (req->nr_integrity_segments + next->nr_integrity_segments > @@ -163,15 +170,21 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, struct bio *bio) { + struct bio_integrity_payload *bip, *bip_bio = bio_integrity(bio); int nr_integrity_segs; - if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL) + if (blk_integrity_rq(req) == 0 && bip_bio == NULL) return true; - if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL) + if (blk_integrity_rq(req) == 0 || bip_bio == NULL) + return false; + + bip = bio_integrity(req->bio); + if (bip->bip_flags != bip_bio->bip_flags) return false; - if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags) + if (bip->bip_flags & BIP_CHECK_APPTAG && + bip->app_tag != bip_bio->app_tag) return false; nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); -- cgit v1.2.3 From 2704024d83fa9eb8e5f16925aae340fd9d246694 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 7 Jan 2026 19:41:43 +0900 Subject: loop: add missing bd_abort_claiming in loop_set_status Commit 08e136ebd193 ("loop: don't change loop device under exclusive opener in loop_set_status") forgot to call bd_abort_claiming() when mutex_lock_killable() failed. 
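The fix follows the usual kernel unwind idiom: anything claimed before a failure point is released by jumping to a label that sits past the success path's own release. A condensed, hypothetical sketch of that shape in plain C (not the actual driver code):

#include <errno.h>

static int set_status_sketch(int have_excl, int lock_ok)
{
	int err = 0;

	if (!have_excl) {
		/* bd_prepare_to_claim() in the real driver */
	}

	if (!lock_ok) {		/* mutex_lock_killable() failed */
		err = -EINTR;
		goto out_abort_claiming;	/* still drop the claim */
	}

	/* ... change the device under the mutex ... */
	/* mutex_unlock() in the real driver */

out_abort_claiming:
	if (!have_excl) {
		/* bd_abort_claiming() in the real driver */
	}
	return err;
}

Returning directly from the failure branch, as the unpatched code did, skips the label and leaves the claim held forever.
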
Fixes: 08e136ebd193 ("loop: don't change loop device under exclusive opener in loop_set_status")
Signed-off-by: Tetsuo Handa
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ca74cc31bf07..bd59c0e9508b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1245,7 +1245,8 @@ loop_set_status(struct loop_device *lo, blk_mode_t mode,
 
 	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
-		return err;
+		goto out_abort_claiming;
+
 	if (lo->lo_state != Lo_bound) {
 		err = -ENXIO;
 		goto out_unlock;
@@ -1284,6 +1285,7 @@ out_unfreeze:
 	}
 out_unlock:
 	mutex_unlock(&lo->lo_mutex);
+out_abort_claiming:
 	if (!(mode & BLK_OPEN_EXCL))
 		bd_abort_claiming(bdev, loop_set_status);
 out_reread_partitions:
--
cgit v1.2.3


From 9670db22e7ab4aefe2b2619589a47fef9d3e0c7e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Tue, 6 Jan 2026 16:56:07 +0100
Subject: blk-mq: avoid stall during boot due to synchronize_rcu_expedited

On kernel 6.19-rc, I am experiencing a 15-second boot stall in a
virtual machine when probing a virtio-scsi disk:

[    1.011641] SCSI subsystem initialized
[    1.013972] virtio_scsi virtio6: 16/0/0 default/read/poll queues
[    1.015983] scsi host0: Virtio SCSI HBA
[    1.019578] ACPI: \_SB_.GSIA: Enabled at IRQ 16
[    1.020225] ahci 0000:00:1f.2: AHCI vers 0001.0000, 32 command slots, 1.5 Gbps, SATA mode
[    1.020228] ahci 0000:00:1f.2: 6/6 ports implemented (port mask 0x3f)
[    1.020230] ahci 0000:00:1f.2: flags: 64bit ncq only
[    1.024688] scsi host1: ahci
[    1.025432] scsi host2: ahci
[    1.025966] scsi host3: ahci
[    1.026511] scsi host4: ahci
[    1.028371] scsi host5: ahci
[    1.028918] scsi host6: ahci
[    1.029266] ata1: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23100 irq 16 lpm-pol 1
[    1.029305] ata2: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23180 irq 16 lpm-pol 1
[    1.029316] ata3: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23200 irq 16 lpm-pol 1
[    1.029327] ata4: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23280 irq 16 lpm-pol 1
[    1.029341] ata5: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23300 irq 16 lpm-pol 1
[    1.029356] ata6: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23380 irq 16 lpm-pol 1
[    1.118111] scsi 0:0:0:0: Direct-Access     QEMU     QEMU HARDDISK    2.5+ PQ: 0 ANSI: 5
[    1.348916] ata1: SATA link down (SStatus 0 SControl 300)
[    1.350713] ata2: SATA link down (SStatus 0 SControl 300)
[    1.351025] ata6: SATA link down (SStatus 0 SControl 300)
[    1.351160] ata5: SATA link down (SStatus 0 SControl 300)
[    1.351326] ata3: SATA link down (SStatus 0 SControl 300)
[    1.351536] ata4: SATA link down (SStatus 0 SControl 300)
[    1.449153] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input2
[   16.483477] sd 0:0:0:0: Power-on or device reset occurred
[   16.483691] sd 0:0:0:0: [sda] 2097152 512-byte logical blocks: (1.07 GB/1.00 GiB)
[   16.483762] sd 0:0:0:0: [sda] Write Protect is off
[   16.483877] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
[   16.569225] sd 0:0:0:0: [sda] Attached SCSI disk

I bisected it; the stall is caused by commit 89e1fb7ceffd, which
introduces calls to synchronize_rcu_expedited().

This commit replaces synchronize_rcu_expedited() and kfree() with a
call to kfree_rcu_mightsleep(), avoiding the 15-second delay.
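A hedged sketch of the two patterns, using the kernel RCU API named above (kernel-style fragment, not runnable standalone):

/* Before: block the caller until an expedited grace period has
 * elapsed on all CPUs, then free synchronously.  During boot this
 * wait is what stalled the probe path. */
synchronize_rcu_expedited();
kfree(hctxs);

/* After: hand the object to RCU and return; the memory is freed
 * once a grace period has passed.  kfree_rcu_mightsleep() may
 * sleep (under memory pressure it can fall back to a synchronous
 * grace period), but in the common case it does not wait. */
kfree_rcu_mightsleep(hctxs);

Concurrent readers of the old queue_hw_ctx array remain safe either way; only the writer's waiting behavior changes.
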
Signed-off-by: Mikulas Patocka Fixes: 89e1fb7ceffd ("blk-mq: fix potential uaf for 'queue_hw_ctx'") Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index eff4f72ce83b..a29d8ac9d3e3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4553,8 +4553,7 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, * Make sure reading the old queue_hw_ctx from other * context concurrently won't trigger uaf. */ - synchronize_rcu_expedited(); - kfree(hctxs); + kfree_rcu_mightsleep(hctxs); hctxs = new_hctxs; } -- cgit v1.2.3 From f0d385f6689f37a2828c686fb279121df006b4cb Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 9 Jan 2026 20:14:54 +0800 Subject: ublk: fix use-after-free in ublk_partition_scan_work A race condition exists between the async partition scan work and device teardown that can lead to a use-after-free of ub->ub_disk: 1. ublk_ctrl_start_dev() schedules partition_scan_work after add_disk() 2. ublk_stop_dev() calls ublk_stop_dev_unlocked() which does: - del_gendisk(ub->ub_disk) - ublk_detach_disk() sets ub->ub_disk = NULL - put_disk() which may free the disk 3. The worker ublk_partition_scan_work() then dereferences ub->ub_disk leading to UAF Fix this by using ublk_get_disk()/ublk_put_disk() in the worker to hold a reference to the disk during the partition scan. The spinlock in ublk_get_disk() synchronizes with ublk_detach_disk() ensuring the worker either gets a valid reference or sees NULL and exits early. Also change flush_work() to cancel_work_sync() to avoid running the partition scan work unnecessarily when the disk is already detached. Fixes: 7fc4da6a304b ("ublk: scan partition in async way") Reported-by: Ruikai Peng Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 837fedb02e0d..f6e5a0766721 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -255,20 +255,6 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); -static void ublk_partition_scan_work(struct work_struct *work) -{ - struct ublk_device *ub = - container_of(work, struct ublk_device, partition_scan_work); - - if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN, - &ub->ub_disk->state))) - return; - - mutex_lock(&ub->ub_disk->open_mutex); - bdev_disk_changed(ub->ub_disk, false); - mutex_unlock(&ub->ub_disk->open_mutex); -} - static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { @@ -1597,6 +1583,27 @@ static void ublk_put_disk(struct gendisk *disk) put_device(disk_to_dev(disk)); } +static void ublk_partition_scan_work(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, partition_scan_work); + /* Hold disk reference to prevent UAF during concurrent teardown */ + struct gendisk *disk = ublk_get_disk(ub); + + if (!disk) + return; + + if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN, + &disk->state))) + goto out; + + mutex_lock(&disk->open_mutex); + bdev_disk_changed(disk, false); + mutex_unlock(&disk->open_mutex); +out: + ublk_put_disk(disk); +} + /* * Use this function to ensure that ->canceling is consistently set for 
* the device and all queues. Do not set these flags directly. @@ -2041,7 +2048,7 @@ static void ublk_stop_dev(struct ublk_device *ub) mutex_lock(&ub->mutex); ublk_stop_dev_unlocked(ub); mutex_unlock(&ub->mutex); - flush_work(&ub->partition_scan_work); + cancel_work_sync(&ub->partition_scan_work); ublk_cancel_dev(ub); } -- cgit v1.2.3
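More generally, the pattern in this fix makes "load pointer, take reference" atomic with "clear pointer" in teardown by doing both under the same lock. A self-contained userspace model of the idea (hypothetical names, with a mutex standing in for the driver's spinlock):

#include <pthread.h>
#include <stdio.h>

struct disk {
	int refs;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct disk *ub_disk;	/* cleared by teardown */

/* Either pins the disk or observes that teardown already ran. */
static struct disk *get_disk(void)
{
	struct disk *d;

	pthread_mutex_lock(&lock);
	d = ub_disk;
	if (d)
		d->refs++;
	pthread_mutex_unlock(&lock);
	return d;
}

static void put_disk(struct disk *d)
{
	pthread_mutex_lock(&lock);
	d->refs--;
	pthread_mutex_unlock(&lock);
}

static void partition_scan_worker(void)
{
	struct disk *d = get_disk();

	if (!d)
		return;		/* disk already detached */
	/* ... scan partitions via d, safe against teardown ... */
	put_disk(d);
}

int main(void)
{
	struct disk d = { .refs = 1 };

	ub_disk = &d;
	partition_scan_worker();
	printf("refs=%d\n", d.refs);
	return 0;
}

The switch from flush_work() to cancel_work_sync() is complementary: a work item that has not started yet is cancelled outright instead of being waited on and run against an already-detached disk.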