From a260bd22a355bcdb74cedac6ab9b10739cd2c62c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Tue, 9 Dec 2025 22:09:03 +0100 Subject: media: mc: fix potential use-after-free in media_request_alloc() Commit 6f504cbf108a ("media: convert media_request_alloc() to FD_PREPARE()") moved the call to fd_install() (now hidden in fd_publish()) before the snprintf(), making the later write go to potentially already freed memory, as userland is free to call close() concurrently right after the call to fd_install() which may end up in the request_fops.release() handler freeing 'req'. Fixes: 6f504cbf108a ("media: convert media_request_alloc() to FD_PREPARE()") Signed-off-by: Mathias Krause Link: https://patch.msgid.link/20251209210903.603958-1-minipli@grsecurity.net Signed-off-by: Christian Brauner --- drivers/media/mc/mc-request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c index 2ac9ac0a740b..3cca9a0c7c97 100644 --- a/drivers/media/mc/mc-request.c +++ b/drivers/media/mc/mc-request.c @@ -315,12 +315,12 @@ int media_request_alloc(struct media_device *mdev, int *alloc_fd) fd_prepare_file(fdf)->private_data = req; - *alloc_fd = fd_publish(fdf); - snprintf(req->debug_str, sizeof(req->debug_str), "%u:%d", - atomic_inc_return(&mdev->request_id), *alloc_fd); + atomic_inc_return(&mdev->request_id), fd_prepare_fd(fdf)); dev_dbg(mdev->dev, "request: allocated %s\n", req->debug_str); + *alloc_fd = fd_publish(fdf); + return 0; err_free_req: -- cgit v1.2.3 From ed61378b4dc63efe76cb8c23a36b228043332da3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 8 Dec 2025 09:05:48 -0500 Subject: iomap: replace folio_batch allocation with stack allocation Zhang Yi points out that the dynamic folio_batch allocation in iomap_fill_dirty_folios() is problematic for the ext4 on iomap work that is under development because it doesn't sufficiently handle the allocation failure case (by allowing a retry, for 
example). We've also seen lockdep (via syzbot) complain recently about the scope of the allocation. The dynamic allocation was initially added for simplicity and to help indicate whether the batch was used or not by the calling fs. To address these issues, put the batch on the stack of iomap_zero_range() and use a flag to control whether the batch should be used in the iomap folio lookup path. This keeps things simple and eliminates allocation issues with lockdep and for ext4 on iomap. While here, also clean up the fill helper signature to be more consistent with the underlying filemap helper. Pass through the return value of the filemap helper (folio count) and update the lookup offset via an out param. Fixes: 395ed1ef0012 ("iomap: optional zero range dirty folio processing") Signed-off-by: Brian Foster Link: https://patch.msgid.link/20251208140548.373411-1-bfoster@redhat.com Acked-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 50 +++++++++++++++++++++++++++++++++++--------------- fs/iomap/iter.c | 6 +++--- fs/xfs/xfs_iomap.c | 11 ++++++----- include/linux/iomap.h | 8 ++++++-- 4 files changed, 50 insertions(+), 25 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e5c1ca440d93..fd9a2cf95620 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -832,7 +832,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, if (!mapping_large_folio_support(iter->inode->i_mapping)) len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); - if (iter->fbatch) { + if (iter->iomap.flags & IOMAP_F_FOLIO_BATCH) { struct folio *folio = folio_batch_next(iter->fbatch); if (!folio) @@ -929,7 +929,7 @@ static int iomap_write_begin(struct iomap_iter *iter, * process so return and let the caller iterate and refill the batch. 
*/ if (!folio) { - WARN_ON_ONCE(!iter->fbatch); + WARN_ON_ONCE(!(iter->iomap.flags & IOMAP_F_FOLIO_BATCH)); return 0; } @@ -1544,23 +1544,39 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, return status; } -loff_t +/** + * iomap_fill_dirty_folios - fill a folio batch with dirty folios + * @iter: Iteration structure + * @start: Start offset of range. Updated based on lookup progress. + * @end: End offset of range + * @iomap_flags: Flags to set on the associated iomap to track the batch. + * + * Returns the folio count directly. Also returns the associated control flag if + * the batch lookup is performed and the expected offset of a subsequent + * lookup via out params. The caller is responsible for setting the flag on the + * associated iomap. + */ +unsigned int iomap_fill_dirty_folios( struct iomap_iter *iter, - loff_t offset, - loff_t length) + loff_t *start, + loff_t end, + unsigned int *iomap_flags) { struct address_space *mapping = iter->inode->i_mapping; - pgoff_t start = offset >> PAGE_SHIFT; - pgoff_t end = (offset + length - 1) >> PAGE_SHIFT; + pgoff_t pstart = *start >> PAGE_SHIFT; + pgoff_t pend = (end - 1) >> PAGE_SHIFT; + unsigned int count; - iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL); - if (!iter->fbatch) - return offset + length; - folio_batch_init(iter->fbatch); + if (!iter->fbatch) { + *start = end; + return 0; + } - filemap_get_folios_dirty(mapping, &start, end, iter->fbatch); - return (start << PAGE_SHIFT); + count = filemap_get_folios_dirty(mapping, &pstart, pend, iter->fbatch); + *start = (pstart << PAGE_SHIFT); + *iomap_flags |= IOMAP_F_FOLIO_BATCH; + return count; } EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios); @@ -1569,17 +1585,21 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private) { + struct folio_batch fbatch; struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = 
IOMAP_ZERO, .private = private, + .fbatch = &fbatch, }; struct address_space *mapping = inode->i_mapping; int ret; bool range_dirty; + folio_batch_init(&fbatch); + /* * To avoid an unconditional flush, check pagecache state and only flush * if dirty and the fs returns a mapping that might convert on @@ -1590,11 +1610,11 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, while ((ret = iomap_iter(&iter, ops)) > 0) { const struct iomap *srcmap = iomap_iter_srcmap(&iter); - if (WARN_ON_ONCE(iter.fbatch && + if (WARN_ON_ONCE((iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && srcmap->type != IOMAP_UNWRITTEN)) return -EIO; - if (!iter.fbatch && + if (!(iter.iomap.flags & IOMAP_F_FOLIO_BATCH) && (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)) { s64 status; diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 8692e5e41c6d..c04796f6e57f 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -8,10 +8,10 @@ static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { - if (iter->fbatch) { + if (iter->iomap.flags & IOMAP_F_FOLIO_BATCH) { folio_batch_release(iter->fbatch); - kfree(iter->fbatch); - iter->fbatch = NULL; + folio_batch_reinit(iter->fbatch); + iter->iomap.flags &= ~IOMAP_F_FOLIO_BATCH; } iter->status = 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 04f39ea15898..37a1b33e9045 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1831,7 +1831,6 @@ xfs_buffered_write_iomap_begin( */ if (flags & IOMAP_ZERO) { xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - u64 end; if (isnullstartblock(imap.br_startblock) && offset_fsb >= eof_fsb) @@ -1851,12 +1850,14 @@ xfs_buffered_write_iomap_begin( */ if (imap.br_state == XFS_EXT_UNWRITTEN && offset_fsb < eof_fsb) { - loff_t len = min(count, - XFS_FSB_TO_B(mp, imap.br_blockcount)); + loff_t foffset = offset, fend; - end = iomap_fill_dirty_folios(iter, offset, len); + fend = offset + + min(count, XFS_FSB_TO_B(mp, imap.br_blockcount)); + 
iomap_fill_dirty_folios(iter, &foffset, fend, + &iomap_flags); end_fsb = min_t(xfs_fileoff_t, end_fsb, - XFS_B_TO_FSB(mp, end)); + XFS_B_TO_FSB(mp, foffset)); } xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 520e967cb501..6bb941707d12 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -88,6 +88,9 @@ struct vm_fault; /* * Flags set by the core iomap code during operations: * + * IOMAP_F_FOLIO_BATCH indicates that the folio batch mechanism is active + * for this operation, set by iomap_fill_dirty_folios(). + * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. * @@ -95,6 +98,7 @@ struct vm_fault; * range it covers needs to be remapped by the high level before the operation * can proceed. */ +#define IOMAP_F_FOLIO_BATCH (1U << 13) #define IOMAP_F_SIZE_CHANGED (1U << 14) #define IOMAP_F_STALE (1U << 15) @@ -352,8 +356,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops); -loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, - loff_t length); +unsigned int iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t *start, + loff_t end, unsigned int *iomap_flags); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -- cgit v1.2.3 From 392e317a20c32d45eebe4de8dc24408c6d1765d1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 4 Dec 2025 08:48:32 -0500 Subject: filelock: add lease_dispose_list() helper The lease-handling code paths always know they're disposing of leases, yet locks_dispose_list() checks flags at runtime to determine whether to call locks_free_lease() or locks_free_lock(). 
Split out a dedicated lease_dispose_list() helper for lease code paths. This makes the type handling explicit and prepares for the upcoming lease_manager enhancements where lease-specific operations are being consolidated. Reviewed-by: Chuck Lever Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251204-dir-deleg-ro-v2-1-22d37f92ce2c@kernel.org Signed-off-by: Christian Brauner --- fs/locks.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 9f565802a88c..be0b79286da8 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -369,10 +369,19 @@ locks_dispose_list(struct list_head *dispose) while (!list_empty(dispose)) { flc = list_first_entry(dispose, struct file_lock_core, flc_list); list_del_init(&flc->flc_list); - if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) - locks_free_lease(file_lease(flc)); - else - locks_free_lock(file_lock(flc)); + locks_free_lock(file_lock(flc)); + } +} + +static void +lease_dispose_list(struct list_head *dispose) +{ + struct file_lock_core *flc; + + while (!list_empty(dispose)) { + flc = list_first_entry(dispose, struct file_lock_core, flc_list); + list_del_init(&flc->flc_list); + locks_free_lease(file_lease(flc)); } } @@ -1620,7 +1629,7 @@ restart: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); - locks_dispose_list(&dispose); + lease_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->c.flc_wait, list_empty(&new_fl->c.flc_blocked_member), break_time); @@ -1643,7 +1652,7 @@ restart: out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); - locks_dispose_list(&dispose); + lease_dispose_list(&dispose); free_lock: locks_free_lease(new_fl); return error; @@ -1727,7 +1736,7 @@ static int __fcntl_getlease(struct file *filp, unsigned int flavor) spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); - locks_dispose_list(&dispose); + lease_dispose_list(&dispose); } return type; } @@ -1896,7 +1905,7 @@ out_setup: out: 
spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); - locks_dispose_list(&dispose); + lease_dispose_list(&dispose); if (is_deleg) inode_unlock(inode); if (!error && !my_fl) @@ -1932,7 +1941,7 @@ static int generic_delete_lease(struct file *filp, void *owner) error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); - locks_dispose_list(&dispose); + lease_dispose_list(&dispose); return error; } @@ -2727,7 +2736,7 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); - locks_dispose_list(&dispose); + lease_dispose_list(&dispose); } /* -- cgit v1.2.3 From 12965a190eaea614bb49e22041e8fc0d03d0310f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 4 Dec 2025 08:48:33 -0500 Subject: filelock: allow lease_managers to dictate what qualifies as a conflict Requesting a delegation on a file from the userland fcntl() interface currently succeeds when there are conflicting opens present. This is because the lease handling code ignores conflicting opens for FL_LAYOUT and FL_DELEG leases. This was a hack put in place long ago, because nfsd already checks for conflicts in its own way. The kernel needs to perform this check for userland delegations the same way it is done for leases, however. Make this dependent on the lease_manager by adding a new ->lm_open_conflict() lease_manager operation and have generic_add_lease() call that instead of check_conflicting_open(). Morph check_conflicting_open() into a ->lm_open_conflict() op that is only called for userland leases/delegations. Set the ->lm_open_conflict() operations for nfsd to trivial functions that always return 0. 
Reviewed-by: Chuck Lever Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251204-dir-deleg-ro-v2-2-22d37f92ce2c@kernel.org Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 1 + fs/locks.c | 90 ++++++++++++++++------------------- fs/nfsd/nfs4layouts.c | 23 ++++++++- fs/nfsd/nfs4state.c | 19 ++++++++ include/linux/filelock.h | 1 + 5 files changed, 84 insertions(+), 50 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 77704fde9845..04c7691e50e0 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -416,6 +416,7 @@ lm_change yes no no lm_breaker_owns_lease: yes no no lm_lock_expirable yes no no lm_expire_lock no no yes +lm_open_conflict yes no no ====================== ============= ================= ========= buffer_head diff --git a/fs/locks.c b/fs/locks.c index be0b79286da8..e75c8084d937 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -585,10 +585,50 @@ lease_setup(struct file_lease *fl, void **priv) __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0); } +/** + * lease_open_conflict - see if the given file points to an inode that has + * an existing open that would conflict with the + * desired lease. + * @filp: file to check + * @arg: type of lease that we're trying to acquire + * + * Check to see if there's an existing open fd on this file that would + * conflict with the lease we're trying to set. + */ +static int +lease_open_conflict(struct file *filp, const int arg) +{ + struct inode *inode = file_inode(filp); + int self_wcount = 0, self_rcount = 0; + + if (arg == F_RDLCK) + return inode_is_open_for_write(inode) ? -EAGAIN : 0; + else if (arg != F_WRLCK) + return 0; + + /* + * Make sure that only read/write count is from lease requestor. + * Note that this will result in denying write leases when i_writecount + * is negative, which is what we want. (We shouldn't grant write leases + * on files open for execution.) 
+ */ + if (filp->f_mode & FMODE_WRITE) + self_wcount = 1; + else if (filp->f_mode & FMODE_READ) + self_rcount = 1; + + if (atomic_read(&inode->i_writecount) != self_wcount || + atomic_read(&inode->i_readcount) != self_rcount) + return -EAGAIN; + + return 0; +} + static const struct lease_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, .lm_change = lease_modify, .lm_setup = lease_setup, + .lm_open_conflict = lease_open_conflict, }; /* @@ -1754,52 +1794,6 @@ int fcntl_getdeleg(struct file *filp, struct delegation *deleg) return 0; } -/** - * check_conflicting_open - see if the given file points to an inode that has - * an existing open that would conflict with the - * desired lease. - * @filp: file to check - * @arg: type of lease that we're trying to acquire - * @flags: current lock flags - * - * Check to see if there's an existing open fd on this file that would - * conflict with the lease we're trying to set. - */ -static int -check_conflicting_open(struct file *filp, const int arg, int flags) -{ - struct inode *inode = file_inode(filp); - int self_wcount = 0, self_rcount = 0; - - if (flags & FL_LAYOUT) - return 0; - if (flags & FL_DELEG) - /* We leave these checks to the caller */ - return 0; - - if (arg == F_RDLCK) - return inode_is_open_for_write(inode) ? -EAGAIN : 0; - else if (arg != F_WRLCK) - return 0; - - /* - * Make sure that only read/write count is from lease requestor. - * Note that this will result in denying write leases when i_writecount - * is negative, which is what we want. (We shouldn't grant write leases - * on files open for execution.) 
- */ - if (filp->f_mode & FMODE_WRITE) - self_wcount = 1; - else if (filp->f_mode & FMODE_READ) - self_rcount = 1; - - if (atomic_read(&inode->i_writecount) != self_wcount || - atomic_read(&inode->i_readcount) != self_rcount) - return -EAGAIN; - - return 0; -} - static int generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv) { @@ -1836,7 +1830,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(filp, arg, lease->c.flc_flags); + error = lease->fl_lmops->lm_open_conflict(filp, arg); if (error) goto out; @@ -1893,7 +1887,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr * precedes these checks. */ smp_mb(); - error = check_conflicting_open(filp, arg, lease->c.flc_flags); + error = lease->fl_lmops->lm_open_conflict(filp, arg); if (error) { locks_unlink_lock_ctx(&lease->c); goto out; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 683bd1130afe..ad7af8cfcf1f 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -764,9 +764,28 @@ nfsd4_layout_lm_change(struct file_lease *onlist, int arg, return lease_modify(onlist, arg, dispose); } +/** + * nfsd4_layout_lm_open_conflict - see if the given file points to an inode that has + * an existing open that would conflict with the + * desired lease. + * @filp: file to check + * @arg: type of lease that we're trying to acquire + * + * The kernel will call into this operation to determine whether there + * are conflicting opens that may prevent the layout from being granted. + * For nfsd, that check is done at a higher level, so this trivially + * returns 0. 
+ */ +static int +nfsd4_layout_lm_open_conflict(struct file *filp, int arg) +{ + return 0; +} + static const struct lease_manager_operations nfsd4_layouts_lm_ops = { - .lm_break = nfsd4_layout_lm_break, - .lm_change = nfsd4_layout_lm_change, + .lm_break = nfsd4_layout_lm_break, + .lm_change = nfsd4_layout_lm_change, + .lm_open_conflict = nfsd4_layout_lm_open_conflict, }; int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 808c24fb5c9a..19d6d6db107f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5552,10 +5552,29 @@ nfsd_change_deleg_cb(struct file_lease *onlist, int arg, return -EAGAIN; } +/** + * nfsd4_deleg_lm_open_conflict - see if the given file points to an inode that has + * an existing open that would conflict with the + * desired lease. + * @filp: file to check + * @arg: type of lease that we're trying to acquire + * + * The kernel will call into this operation to determine whether there + * are conflicting opens that may prevent the deleg from being granted. + * For nfsd, that check is done at a higher level, so this trivially + * returns 0. 
+ */ +static int +nfsd4_deleg_lm_open_conflict(struct file *filp, int arg) +{ + return 0; +} + static const struct lease_manager_operations nfsd_lease_mng_ops = { .lm_breaker_owns_lease = nfsd_breaker_owns_lease, .lm_break = nfsd_break_deleg_cb, .lm_change = nfsd_change_deleg_cb, + .lm_open_conflict = nfsd4_deleg_lm_open_conflict, }; static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4_stateowner *so, u32 seqid) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 54b824c05299..2f5e5588ee07 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -49,6 +49,7 @@ struct lease_manager_operations { int (*lm_change)(struct file_lease *, int, struct list_head *); void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); + int (*lm_open_conflict)(struct file *, int); }; struct lock_manager { -- cgit v1.2.3 From 570ad253a3455a520f03c2136af8714bc780186d Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 20 Dec 2025 12:31:40 +0000 Subject: netfs: Fix early read unlock of page with EOF in middle The read result collection for buffered reads seems to run ahead of the completion of subrequests under some circumstances, as can be seen in the following log snippet: 9p_client_res: client 18446612686390831168 response P9_TREAD tag 0 err 0 ... netfs_sreq: R=00001b55[1] DOWN TERM f=192 s=0 5fb2/5fb2 s=5 e=0 ... netfs_collect_folio: R=00001b55 ix=00004 r=4000-5000 t=4000/5fb2 netfs_folio: i=157f3 ix=00004-00004 read-done netfs_folio: i=157f3 ix=00004-00004 read-unlock netfs_collect_folio: R=00001b55 ix=00005 r=5000-5fb2 t=5000/5fb2 netfs_folio: i=157f3 ix=00005-00005 read-done netfs_folio: i=157f3 ix=00005-00005 read-unlock ... netfs_collect_stream: R=00001b55[0:] cto=5fb2 frn=ffffffff netfs_collect_state: R=00001b55 col=5fb2 cln=6000 n=c netfs_collect_stream: R=00001b55[0:] cto=5fb2 frn=ffffffff netfs_collect_state: R=00001b55 col=5fb2 cln=6000 n=8 ... 
netfs_sreq: R=00001b55[2] ZERO SUBMT f=000 s=5fb2 0/4e s=0 e=0 netfs_sreq: R=00001b55[2] ZERO TERM f=102 s=5fb2 4e/4e s=5 e=0 The 'cto=5fb2' indicates the collected file pos we've collected results to so far - but we still have 0x4e more bytes to go - so we shouldn't have collected folio ix=00005 yet. The 'ZERO' subreq that clears the tail happens after we unlock the folio, allowing the application to see the uncleared tail through mmap. The problem is that netfs_read_unlock_folios() will unlock a folio in which the amount of read results collected hits EOF position - but the ZERO subreq lies beyond that and so happens after. Fix this by changing the end check to always be the end of the folio and never the end of the file. In the future, I should look at clearing to the end of the folio here rather than adding a ZERO subreq to do this. On the other hand, the ZERO subreq can run in parallel with an async READ subreq. Further, the ZERO subreq may still be necessary to, say, handle extents in a ceph file that don't have any backing store and are thus implicitly all zeros. This can be reproduced by creating a file, the size of which doesn't align to a page boundary, e.g. 24998 (0x5fb2) bytes and then doing something like: xfs_io -c "mmap -r 0 0x6000" -c "madvise -d 0 0x6000" \ -c "mread -v 0 0x6000" /xfstest.test/x The last 0x4e bytes should all be 00, but if the tail hasn't been cleared yet, you may see rubbish there. This can be reproduced with kafs by modifying the kernel to disable the call to netfs_read_subreq_progress() and to stop afs_issue_read() from doing the async call for NETFS_READAHEAD. Reproduction can be made easier by inserting an mdelay(100) in netfs_issue_read() for the ZERO-subreq case. AFS and CIFS are normally unlikely to show this as they dispatch READ ops asynchronously, which allows the ZERO-subreq to finish first. 9P's READ op is completely synchronous, so the ZERO-subreq will always happen after. 
It isn't seen all the time, though, because the collection may be done in a worker thread. Reported-by: Christian Schoenebeck Link: https://lore.kernel.org/r/8622834.T7Z3S40VBb@weasel/ Signed-off-by: David Howells Link: https://patch.msgid.link/938162.1766233900@warthog.procyon.org.uk Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Tested-by: Christian Schoenebeck Acked-by: Dominique Martinet Suggested-by: Dominique Martinet cc: Dominique Martinet cc: Christian Schoenebeck cc: v9fs@lists.linux.dev cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index a95e7aadafd0..7a0ffa675fb1 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -137,7 +137,7 @@ static void netfs_read_unlock_folios(struct netfs_io_request *rreq, rreq->front_folio_order = order; fsize = PAGE_SIZE << order; fpos = folio_pos(folio); - fend = umin(fpos + fsize, rreq->i_size); + fend = fpos + fsize; trace_netfs_collect_folio(rreq, folio, fend, collected_to); -- cgit v1.2.3 From 46af9ae1305f1025fd9ff7d8945de98a6ec0a52b Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 20 Dec 2025 06:40:22 +0100 Subject: fs: make sure to fail try_to_unlazy() and try_to_unlazy_next() for LOOKUP_CACHED Otherwise the slowpath can be taken by the caller, defeating the flag. This regressed after calls to legitimize_links() started being conditionally elided and stems from the routine always failing after seeing the flag, regardless of whether there were any links. In order to address both the bug and the weird semantics make it illegal to call legitimize_links() with LOOKUP_CACHED and handle the problem at the two callsites. 
Fixes: 7c179096e77eca21 ("fs: add predicts based on nd->depth") Reported-by: Chris Mason Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251220054023.142134-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index bf0f66f0e9b9..f7a8b5b000c2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -830,11 +830,9 @@ static inline bool legitimize_path(struct nameidata *nd, static bool legitimize_links(struct nameidata *nd) { int i; - if (unlikely(nd->flags & LOOKUP_CACHED)) { - drop_links(nd); - nd->depth = 0; - return false; - } + + VFS_BUG_ON(nd->flags & LOOKUP_CACHED); + for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { @@ -883,6 +881,11 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (unlikely(nd->flags & LOOKUP_CACHED)) { + drop_links(nd); + nd->depth = 0; + goto out1; + } if (unlikely(nd->depth && !legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) @@ -918,6 +921,11 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); + if (unlikely(nd->flags & LOOKUP_CACHED)) { + drop_links(nd); + nd->depth = 0; + goto out2; + } if (unlikely(nd->depth && !legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); -- cgit v1.2.3 From fe33729d2907415ff953d84673caebca628cbd77 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 09:46:19 +0700 Subject: fs: Describe @isnew parameter in ilookup5_nowait() Sphinx reports kernel-doc warning: WARNING: ./fs/inode.c:1607 function parameter 'isnew' not described in 'ilookup5_nowait' Describe the parameter. 
Fixes: a27628f4363435 ("fs: rework I_NEW handling to operate without fences") Signed-off-by: Bagas Sanjaya Link: https://patch.msgid.link/20251219024620.22880-2-bagasdotme@gmail.com Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index 521383223d8a..379f4c19845c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1593,6 +1593,9 @@ EXPORT_SYMBOL(igrab); * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test + * @isnew: return argument telling whether I_NEW was set when + * the inode was found in hash (the caller needs to + * wait for I_NEW to clear) * * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented -- cgit v1.2.3 From 73a91ef328a9d728c7f3357f925980937f0d520c Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 09:46:20 +0700 Subject: VFS: fix __start_dirop() kernel-doc warnings Sphinx reports kernel-doc warnings: WARNING: ./fs/namei.c:2853 function parameter 'state' not described in '__start_dirop' WARNING: ./fs/namei.c:2853 expecting prototype for start_dirop(). Prototype was for __start_dirop() instead Fix them up. 
Fixes: ff7c4ea11a05c8 ("VFS: add start_creating_killable() and start_removing_killable()") Signed-off-by: Bagas Sanjaya Link: https://patch.msgid.link/20251219024620.22880-3-bagasdotme@gmail.com Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index f7a8b5b000c2..cf16b6822dd3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2844,10 +2844,11 @@ static int filename_parentat(int dfd, struct filename *name, } /** - * start_dirop - begin a create or remove dirop, performing locking and lookup + * __start_dirop - begin a create or remove dirop, performing locking and lookup * @parent: the dentry of the parent in which the operation will occur * @name: a qstr holding the name within that parent * @lookup_flags: intent and other lookup flags. + * @state: task state bitmask * * The lookup is performed and necessary locks are taken so that, on success, * the returned dentry can be operated on safely. -- cgit v1.2.3 From 3dd57ddec9e3a98387196a3f53b8c036977d8c0f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Dec 2025 08:19:39 +0000 Subject: get rid of bogus __user in struct xattr_args::value The first member of struct xattr_args is declared as __aligned_u64 __user value; which makes no sense whatsoever; __user is a qualifier and what that declaration says is "all struct xattr_args instances have .value _stored_ in user address space, no matter where the rest of the structure happens to be". Something like "int __user *p" stands for "value of p is a pointer to an instance of int that happens to live in user address space"; it says nothing about location of p itself, just as const char *p declares a pointer to unmodifiable char rather than an unmodifiable pointer to char. With xattr_args the intent clearly had been "the 64bit value represents a _pointer_ to object in user address space", but __user has nothing to do with that. 
All it gets us is a couple of bogus warnings in fs/xattr.c where (userland) instance of xattr_args is copied to local variable of that type (in kernel address space), followed by access to its members. Since we've told sparse that args.value must somehow be located in userland memory, we get warned that looking at that 64bit unsigned integer (in a variable already on kernel stack) is not allowed. Note that sparse has no way to express "this integer shall never be cast into a pointer to be dereferenced directly" and I don't see any way to assign a sane semantics to that. In any case, __user is not it. Signed-off-by: Al Viro Link: https://patch.msgid.link/20251216081939.GQ1712166@ZenIV Signed-off-by: Christian Brauner --- include/uapi/linux/xattr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index c7c85bb504ba..2e5aef48fa7e 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -23,7 +23,7 @@ #define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */ struct xattr_args { - __aligned_u64 __user value; + __aligned_u64 value; __u32 size; __u32 flags; }; -- cgit v1.2.3 From 5f9ad16bccd351321d9cd65726fd09390d34b06c Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 23 Dec 2025 13:41:52 -0600 Subject: ecryptfs: Fix improper mknod pairing of start_creating()/end_removing() The ecryptfs_start_creating_dentry() function must be paired with the end_creating() function. Fix ecryptfs_mknod() so that end_creating() is properly called in the return path, instead of end_removing(). 
Fixes: f046fbb4d81d ("ecryptfs: use new start_creating/start_removing APIs") Signed-off-by: Tyler Hicks Link: https://patch.msgid.link/20251223194153.2818445-2-code@tyhicks.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/ecryptfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 3978248247dc..e73d9de676a6 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -584,7 +584,7 @@ ecryptfs_mknod(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_attr_times(dir, lower_dir); fsstack_copy_inode_size(dir, lower_dir); out: - end_removing(lower_dentry); + end_creating(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; -- cgit v1.2.3 From 5c56afd204ad266d23889ee8823fb65b2c3b63da Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 23 Dec 2025 13:41:53 -0600 Subject: ecryptfs: Release lower parent dentry after creating dir Fix a mkdir-induced usage count imbalance that tripped a umount_check() BUG while unmounting the lower filesystem. Commit f046fbb4d81d ("ecryptfs: use new start_creating/start_removing APIs") added a new dget() of the lower parent dir, in ecryptfs_mkdir(), but did not dput() the dentry before returning from that function. The BUG output as seen while running the eCryptfs test suite: $ ./run_tests.sh -b 131072 -c safe,destructive -f ext4 -K -t lp-926292.sh ... 
Running eCryptfs filesystem tests on ext4 lp-926292 ------------[ cut here ]------------ BUG: Dentry ffff8e6692d11988{i=c,n=ECRYPTFS_FNEK_ENCRYPTED.FXZuRGZL7QAFtER.JeA46DtdKqkkQx9H2Vpmv234J5CU8YSsrUwZJK4AbXbrN5WkZ348wnqstovKKxA-} still in use (1) [unmount of ext4 loop0] WARNING: CPU: 7 PID: 950 at fs/dcache.c:1590 umount_check+0x5e/0x80 Modules linked in: md5 libmd5 ecryptfs encrypted_keys ext4 crc16 mbcache jbd2 CPU: 7 UID: 0 PID: 950 Comm: umount Not tainted 6.18.0-rc1-00013-gf046fbb4d81d #17 PREEMPT(full) Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: 0010:umount_check+0x5e/0x80 Code: 88 38 06 00 00 48 8b 40 28 4c 8b 08 48 8b 46 68 48 85 c0 74 04 48 8b 50 38 51 48 c7 c7 60 32 9c b5 48 89 f1 e8 43 5e ca ff 90 <0f> 0b 90 90 58 31 c0 e9 46 9d 6c 00 41 83 f8 01 75 b8 eb a3 66 66 RSP: 0018:ffffa19940c4bdd0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff8e6692fad4c0 RCX: 0000000000000000 RDX: 0000000000000004 RSI: ffffa19940c4bc70 RDI: 00000000ffffffff RBP: ffffffffb4eb5930 R08: 00000000ffffdfff R09: 0000000000000001 R10: 00000000ffffdfff R11: ffffffffb5c8a9e0 R12: ffff8e6692fad4c0 R13: ffff8e6692fad4c0 R14: ffff8e6692d11a40 R15: ffff8e6692d11988 FS: 00007f6b4b491800(0000) GS:ffff8e670506e000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6b4b5f8d40 CR3: 0000000114eb7001 CR4: 0000000000772ef0 PKRU: 55555554 Call Trace: d_walk+0xfd/0x370 shrink_dcache_for_umount+0x4d/0x140 generic_shutdown_super+0x20/0x160 kill_block_super+0x1a/0x40 ext4_kill_sb+0x22/0x40 [ext4] deactivate_locked_super+0x33/0xa0 cleanup_mnt+0xba/0x150 task_work_run+0x5c/0xa0 exit_to_user_mode_loop+0xac/0xb0 do_syscall_64+0x2ab/0xfa0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f6b4b6c2a2b Code: c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 f3 0f 1e fa 31 f6 e9 05 00 00 00 0f 1f 44 00 00 f3 0f 1e fa b8 a6 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 05 c3 0f 1f 40 00 48 8b 15 b9 83 0d 00 f7 d8 RSP: 002b:00007ffcd5b8b498 EFLAGS: 00000246 ORIG_RAX: 
00000000000000a6 RAX: 0000000000000000 RBX: 000055b84af0b9e0 RCX: 00007f6b4b6c2a2b RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000055b84af0bdf0 RBP: 00007ffcd5b8b570 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000103 R11: 0000000000000246 R12: 000055b84af0bae0 R13: 0000000000000000 R14: 000055b84af0bdf0 R15: 0000000000000000 ---[ end trace 0000000000000000 ]--- EXT4-fs (loop0): unmounting filesystem 00d9ea41-f61e-43d0-a449-6be03e7e8428. EXT4-fs (loop0): sb orphan head is 12 sb_info orphan list: inode loop0:12 at ffff8e66950e1df0: mode 40700, nlink 0, next 0 Assertion failure in ext4_put_super() at fs/ext4/super.c:1345: 'list_empty(&sbi->s_orphan)' Fixes: f046fbb4d81d ("ecryptfs: use new start_creating/start_removing APIs") Signed-off-by: Tyler Hicks Link: https://patch.msgid.link/20251223194153.2818445-3-code@tyhicks.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/ecryptfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e73d9de676a6..8ab014db3e03 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -533,6 +533,7 @@ static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, fsstack_copy_inode_size(dir, lower_dir); set_nlink(dir, lower_dir->i_nlink); out: + dput(lower_dir_dentry); end_creating(lower_dentry); if (d_really_is_negative(dentry)) d_drop(dentry); -- cgit v1.2.3 From 75ddaa4ddc86d31edb15e50152adf4ddee77a6ba Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Dec 2025 13:00:24 +0100 Subject: pidfs: protect PIDFD_GET_* ioctls() via ifdef We originally protected PIDFD_GET__NAMESPACE ioctls() through ifdefs and recent rework made it possible to drop them. There was an oversight though. When the relevant namespace is turned off ns->ops will be NULL so even though opening a file descriptor is perfectly legitimate it would fail during inode eviction when the file was closed. 
The simple fix would be to check ns->ops for NULL and continue to allow retrieving namespace fds from pidfds, but we don't allow retrieving them when the relevant namespace type is turned off. So keep the simplification but add the ifdefs back in. Link: https://lore.kernel.org/20251222214907.GA189632@quark Link: https://patch.msgid.link/20251224-ununterbrochen-gagen-ea949b83f8f2@brauner Fixes: a71e4f103aed ("pidfs: simplify PIDFD_GET__NAMESPACE ioctls") Tested-by: Brendan Jackman Tested-by: Eric Biggers Reported-by: Eric Biggers Signed-off-by: Christian Brauner --- fs/pidfs.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/pidfs.c b/fs/pidfs.c index dba703d4ce4a..1e20e36e0ed5 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -517,14 +517,18 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { /* Namespaces that hang of nsproxy. */ case PIDFD_GET_CGROUP_NAMESPACE: +#ifdef CONFIG_CGROUPS if (!ns_ref_get(nsp->cgroup_ns)) break; ns_common = to_ns_common(nsp->cgroup_ns); +#endif break; case PIDFD_GET_IPC_NAMESPACE: +#ifdef CONFIG_IPC_NS if (!ns_ref_get(nsp->ipc_ns)) break; ns_common = to_ns_common(nsp->ipc_ns); +#endif break; case PIDFD_GET_MNT_NAMESPACE: if (!ns_ref_get(nsp->mnt_ns)) @@ -532,32 +536,43 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ns_common = to_ns_common(nsp->mnt_ns); break; case PIDFD_GET_NET_NAMESPACE: +#ifdef CONFIG_NET_NS if (!ns_ref_get(nsp->net_ns)) break; ns_common = to_ns_common(nsp->net_ns); +#endif break; case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: +#ifdef CONFIG_PID_NS if (!ns_ref_get(nsp->pid_ns_for_children)) break; ns_common = to_ns_common(nsp->pid_ns_for_children); +#endif break; case PIDFD_GET_TIME_NAMESPACE: +#ifdef CONFIG_TIME_NS if (!ns_ref_get(nsp->time_ns)) break; ns_common = to_ns_common(nsp->time_ns); +#endif break; case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: +#ifdef CONFIG_TIME_NS if (!ns_ref_get(nsp->time_ns_for_children)) break; 
ns_common = to_ns_common(nsp->time_ns_for_children); +#endif break; case PIDFD_GET_UTS_NAMESPACE: +#ifdef CONFIG_UTS_NS if (!ns_ref_get(nsp->uts_ns)) break; ns_common = to_ns_common(nsp->uts_ns); +#endif break; /* Namespaces that don't hang of nsproxy. */ case PIDFD_GET_USER_NAMESPACE: +#ifdef CONFIG_USER_NS scoped_guard(rcu) { struct user_namespace *user_ns; @@ -566,8 +581,10 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; ns_common = to_ns_common(user_ns); } +#endif break; case PIDFD_GET_PID_NAMESPACE: +#ifdef CONFIG_PID_NS scoped_guard(rcu) { struct pid_namespace *pid_ns; @@ -576,6 +593,7 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; ns_common = to_ns_common(pid_ns); } +#endif break; default: return -ENOIOCTLCMD; -- cgit v1.2.3