author     Linus Torvalds <torvalds@linux-foundation.org>   2025-12-01 08:14:00 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2025-12-01 08:14:00 -0800
commit     1885cdbfbb51ede3637166c895d0b8040c9899cc (patch)
tree       257c3f32cff62bff6a3acfe8c76d39981cc52faa
parent     7d0a66e4bb9081d75c82ec4957c50034cb0ea449 (diff)
parent     7fd8720dff2d9c70cf5a1a13b7513af01952ec02 (diff)
Merge tag 'vfs-6.19-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull iomap updates from Christian Brauner:

 "FUSE iomap Support for Buffered Reads:

    This adds iomap support for FUSE buffered reads and readahead. This
    enables granular uptodate tracking with large folios so only
    non-uptodate portions need to be read. It also fixes a race condition
    with large folios + writeback cache that could cause data corruption
    on partial writes followed by reads.

     - Refactored iomap read/readahead bio logic into helpers
     - Added caller-provided callbacks for read operations
     - Moved buffered IO bio logic into a new file
     - FUSE now uses iomap for read_folio and readahead

  Zero Range Folio Batch Support:

    Add folio batch support for iomap_zero_range() to handle dirty folios
    over unwritten mappings. Fix raciness issues where dirty data could be
    lost during zero range operations.

     - filemap_get_folios_tag_range() helper for dirty folio lookup
     - Optional zero range dirty folio processing
     - XFS fills dirty folios on zero range of unwritten mappings
     - Removed old partial EOF zeroing optimization

  DIO Write Completions from Interrupt Context:

    Restore pre-iomap behavior where pure overwrite completions run inline
    rather than being deferred to a workqueue. This reduces context
    switches for high-performance workloads like ScyllaDB.

     - Removed unused IOCB_DIO_CALLER_COMP code
     - Error completions always run in user context (fixes zonefs)
     - Reworked REQ_FUA selection logic
     - Inverted IOMAP_DIO_INLINE_COMP to IOMAP_DIO_OFFLOAD_COMP

  Buffered IO Cleanups:

    Some performance and code clarity improvements:

     - Replace manual bitmap scanning with find_next_bit()
     - Simplify read skip logic for writes
     - Optimize pending async writeback accounting
     - Better variable naming
     - Documentation for iomap_finish_folio_write() requirements

  Misaligned Vectors for Zoned XFS:

    Enables sub-block aligned vectors in XFS always-COW mode for zoned
    devices via the new IOMAP_DIO_FSBLOCK_ALIGNED flag.

  Bug Fixes:

     - Allocate s_dio_done_wq for async reads (fixes syzbot report after
       the error completion changes)
     - Fix iomap_read_end() for already uptodate folios (regression fix)"

* tag 'vfs-6.19-rc1.iomap' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (40 commits)
  iomap: allocate s_dio_done_wq for async reads as well
  iomap: fix iomap_read_end() for already uptodate folios
  iomap: invert the polarity of IOMAP_DIO_INLINE_COMP
  iomap: support write completions from interrupt context
  iomap: rework REQ_FUA selection
  iomap: always run error completions in user context
  fs, iomap: remove IOCB_DIO_CALLER_COMP
  iomap: use find_next_bit() for uptodate bitmap scanning
  iomap: use find_next_bit() for dirty bitmap scanning
  iomap: simplify when reads can be skipped for writes
  iomap: simplify ->read_folio_range() error handling for reads
  iomap: optimize pending async writeback accounting
  docs: document iomap writeback's iomap_finish_folio_write() requirement
  iomap: account for unaligned end offsets when truncating read range
  iomap: rename bytes_pending/bytes_accounted to bytes_submitted/bytes_not_submitted
  xfs: support sub-block aligned vectors in always COW mode
  iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag
  xfs: error tag to force zeroing on debug kernels
  iomap: remove old partial eof zeroing optimization
  xfs: fill dirty folios on zero range of unwritten mappings
  ...
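For orientation, here is a rough, non-authoritative sketch of the new caller-provided read interface documented by this series (see the operations.rst hunk in the diff below). A filesystem doing synchronous reads could wire it up roughly as follows; the my_fs_* names are hypothetical, and only struct iomap_read_ops, struct iomap_read_folio_ctx, iomap_read_folio() and the iomap_finish_folio_read() contract are taken from the patches:

    #include <linux/iomap.h>
    #include <linux/pagemap.h>

    /* Hypothetical synchronous implementation of ->read_folio_range(). */
    static int my_fs_read_folio_range(const struct iomap_iter *iter,
                                      struct iomap_read_folio_ctx *ctx,
                                      size_t len)
    {
            struct folio *folio = ctx->cur_folio;
            size_t off = offset_in_folio(folio, iter->pos);
            int err;

            /* my_fs_do_read() is an assumed helper filling folio bytes [off, off + len). */
            err = my_fs_do_read(folio, off, len);
            if (!err)
                    /* Once the range has been read, it must be marked as such. */
                    iomap_finish_folio_read(folio, off, len, 0);
            return err;
    }

    static const struct iomap_read_ops my_fs_read_ops = {
            .read_folio_range = my_fs_read_folio_range,
            /* ->submit_read() is optional and not needed for synchronous reads. */
    };

    static int my_fs_read_folio(struct file *file, struct folio *folio)
    {
            struct iomap_read_folio_ctx ctx = {
                    .ops       = &my_fs_read_ops,
                    .cur_folio = folio,
            };

            /* my_fs_iomap_ops (->iomap_begin) is assumed to exist elsewhere. */
            iomap_read_folio(&my_fs_iomap_ops, &ctx);
            return 0;
    }

An asynchronous implementation, like the FUSE conversion in this merge, instead queues the range, calls iomap_finish_folio_read() from its completion handler, and batches submission through the optional ->submit_read() hook.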
-rw-r--r--  Documentation/filesystems/iomap/operations.rst  |  50
-rw-r--r--  block/fops.c                                     |   5
-rw-r--r--  fs/backing-file.c                                |   6
-rw-r--r--  fs/dax.c                                         |  30
-rw-r--r--  fs/erofs/data.c                                  |   5
-rw-r--r--  fs/fuse/dir.c                                    |   2
-rw-r--r--  fs/fuse/file.c                                   | 286
-rw-r--r--  fs/fuse/fuse_i.h                                 |   8
-rw-r--r--  fs/fuse/inode.c                                  |  13
-rw-r--r--  fs/gfs2/aops.c                                   |   6
-rw-r--r--  fs/iomap/Makefile                                |   3
-rw-r--r--  fs/iomap/bio.c                                   |  88
-rw-r--r--  fs/iomap/buffered-io.c                           | 636
-rw-r--r--  fs/iomap/direct-io.c                             | 230
-rw-r--r--  fs/iomap/internal.h                              |  12
-rw-r--r--  fs/iomap/ioend.c                                 |   2
-rw-r--r--  fs/iomap/iter.c                                  |  20
-rw-r--r--  fs/iomap/seek.c                                  |   8
-rw-r--r--  fs/iomap/trace.h                                 |   7
-rw-r--r--  fs/xfs/libxfs/xfs_errortag.h                     |   6
-rw-r--r--  fs/xfs/xfs_aops.c                                |   5
-rw-r--r--  fs/xfs/xfs_file.c                                |  50
-rw-r--r--  fs/xfs/xfs_iomap.c                               |  38
-rw-r--r--  fs/zonefs/file.c                                 |   5
-rw-r--r--  include/linux/fs.h                               |  43
-rw-r--r--  include/linux/iomap.h                            |  86
-rw-r--r--  include/linux/pagemap.h                          |   2
-rw-r--r--  io_uring/rw.c                                    |  16
-rw-r--r--  mm/filemap.c                                     |  58
29 files changed, 1093 insertions, 633 deletions
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 387fd9cc72ca..da982ca7e413 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -135,6 +135,27 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
* ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.
+``struct iomap_read_ops``
+--------------------------
+
+.. code-block:: c
+
+ struct iomap_read_ops {
+ int (*read_folio_range)(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t len);
+ void (*submit_read)(struct iomap_read_folio_ctx *ctx);
+ };
+
+iomap calls these functions:
+
+ - ``read_folio_range``: Called to read in the range. This must be provided
+ by the caller. If this succeeds, iomap_finish_folio_read() must be called
+ after the range is read in, regardless of whether the read succeeded or
+ failed.
+
+ - ``submit_read``: Submit any pending read requests. This function is
+ optional.
+
Internal per-Folio State
------------------------
@@ -182,6 +203,28 @@ The ``flags`` argument to ``->iomap_begin`` will be set to zero.
The pagecache takes whatever locks it needs before calling the
filesystem.
+Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct
+iomap_read_folio_ctx``:
+
+.. code-block:: c
+
+ struct iomap_read_folio_ctx {
+ const struct iomap_read_ops *ops;
+ struct folio *cur_folio;
+ struct readahead_control *rac;
+ void *read_ctx;
+ };
+
+``iomap_readahead`` must set:
+ * ``ops->read_folio_range()`` and ``rac``
+
+``iomap_read_folio`` must set:
+ * ``ops->read_folio_range()`` and ``cur_folio``
+
+``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to
+pass in any custom data the caller needs accessible in the ops callbacks for
+fulfilling reads.
+
Buffered Writes
---------------
@@ -317,6 +360,9 @@ The fields are as follows:
delalloc reservations to avoid having delalloc reservations for
clean pagecache.
This function must be supplied by the filesystem.
+ If this succeeds, iomap_finish_folio_write() must be called once writeback
+ completes for the range, regardless of whether the writeback succeeded or
+ failed.
- ``writeback_submit``: Submit the previous built writeback context.
Block based file systems should use the iomap_ioend_writeback_submit
@@ -444,10 +490,6 @@ These ``struct kiocb`` flags are significant for direct I/O with iomap:
Only meaningful for asynchronous I/O, and only if the entire I/O can
be issued as a single ``struct bio``.
- * ``IOCB_DIO_CALLER_COMP``: Try to run I/O completion from the caller's
- process context.
- See ``linux/fs.h`` for more details.
-
Filesystems should call ``iomap_dio_rw`` from ``->read_iter`` and
``->write_iter``, and set ``FMODE_CAN_ODIRECT`` in the ``->open``
function for the file.
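As an aside, here is a minimal sketch of the writeback-side requirement added in the documentation above: once ->writeback_range() has succeeded for a range, iomap_finish_folio_write() must be called for that range when writeback completes, whether or not the I/O succeeded. The my_fs_* request bookkeeping below is purely illustrative; only the iomap_finish_folio_write(inode, folio, len) call reflects the interface used elsewhere in this series (e.g. the FUSE writeback path):

    #include <linux/iomap.h>
    #include <linux/pagemap.h>

    /* Illustrative per-request tracking of folio ranges queued for writeback. */
    struct my_fs_wb_req {
            unsigned int    num_folios;
            struct folio    **folios;   /* folios queued by ->writeback_range() */
            size_t          *lens;      /* length submitted for each folio */
    };

    /*
     * Called when the backing I/O for the request finishes. Every range for
     * which ->writeback_range() succeeded must be finished here, even if the
     * write itself failed; error reporting is handled separately.
     */
    static void my_fs_writeback_done(struct inode *inode, struct my_fs_wb_req *req)
    {
            unsigned int i;

            for (i = 0; i < req->num_folios; i++)
                    iomap_finish_folio_write(inode, req->folios[i], req->lens[i]);
    }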
diff --git a/block/fops.c b/block/fops.c
index 5e3db9fead77..4dad9c2d5796 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -540,12 +540,13 @@ const struct address_space_operations def_blk_aops = {
#else /* CONFIG_BUFFER_HEAD */
static int blkdev_read_folio(struct file *file, struct folio *folio)
{
- return iomap_read_folio(folio, &blkdev_iomap_ops);
+ iomap_bio_read_folio(folio, &blkdev_iomap_ops);
+ return 0;
}
static void blkdev_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &blkdev_iomap_ops);
+ iomap_bio_readahead(rac, &blkdev_iomap_ops);
}
static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 15a7f8031084..2a86bb6fcd13 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -227,12 +227,6 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
- /*
- * Stacked filesystems don't support deferred completions, don't copy
- * this property in case it is set by the issuer.
- */
- flags &= ~IOCB_DIO_CALLER_COMP;
-
old_cred = override_creds(ctx->cred);
if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags);
diff --git a/fs/dax.c b/fs/dax.c
index 516f995a988c..38fae11ee419 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1507,7 +1507,7 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return iomap_iter_advance(iter, &length);
+ return iomap_iter_advance(iter, length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1536,10 +1536,10 @@ static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (ret < 0)
return ret;
- ret = iomap_iter_advance(iter, &length);
+ ret = iomap_iter_advance(iter, length);
if (ret)
return ret;
- } while (length > 0);
+ } while ((length = iomap_length(iter)) > 0);
if (did_zero)
*did_zero = true;
@@ -1597,7 +1597,7 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
done = iov_iter_zero(min(length, end - pos), iter);
- return iomap_iter_advance(iomi, &done);
+ return iomap_iter_advance(iomi, done);
}
}
@@ -1681,12 +1681,12 @@ static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
- length = xfer;
- ret = iomap_iter_advance(iomi, &length);
+ ret = iomap_iter_advance(iomi, xfer);
if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
+ length = iomap_length(iomi);
}
dax_read_unlock(id);
@@ -1919,10 +1919,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
ret |= VM_FAULT_MAJOR;
}
- if (!(ret & VM_FAULT_ERROR)) {
- u64 length = PAGE_SIZE;
- iter.status = iomap_iter_advance(&iter, &length);
- }
+ if (!(ret & VM_FAULT_ERROR))
+ iter.status = iomap_iter_advance(&iter, PAGE_SIZE);
}
if (iomap_errp)
@@ -2034,10 +2032,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
- if (ret != VM_FAULT_FALLBACK) {
- u64 length = PMD_SIZE;
- iter.status = iomap_iter_advance(&iter, &length);
- }
+ if (ret != VM_FAULT_FALLBACK)
+ iter.status = iomap_iter_advance(&iter, PMD_SIZE);
}
unlock_entry:
@@ -2163,7 +2159,6 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
- u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2196,10 +2191,9 @@ static int dax_range_compare_iter(struct iomap_iter *it_src,
dax_read_unlock(id);
advance:
- dest_len = len;
- ret = iomap_iter_advance(it_src, &len);
+ ret = iomap_iter_advance(it_src, len);
if (!ret)
- ret = iomap_iter_advance(it_dest, &dest_len);
+ ret = iomap_iter_advance(it_dest, len);
return ret;
out_unlock:
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 8ca29962a3dd..bb13c4cb8455 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
{
trace_erofs_read_folio(folio, true);
- return iomap_read_folio(folio, &erofs_iomap_ops);
+ iomap_bio_read_folio(folio, &erofs_iomap_ops);
+ return 0;
}
static void erofs_readahead(struct readahead_control *rac)
@@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac)
trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
readahead_count(rac), true);
- return iomap_readahead(rac, &erofs_iomap_ops);
+ iomap_bio_readahead(rac, &erofs_iomap_ops);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index ecaec0fea3a1..316922d5dd13 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1192,7 +1192,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
if (attr->blksize != 0)
blkbits = ilog2(attr->blksize);
else
- blkbits = fc->blkbits;
+ blkbits = inode->i_sb->s_blocksize_bits;
stat->blksize = 1 << blkbits;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f1ef77a0be05..7bcb650a9f26 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -834,23 +834,142 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio,
return 0;
}
+static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ iomap->type = IOMAP_MAPPED;
+ iomap->length = length;
+ iomap->offset = offset;
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+};
+
+struct fuse_fill_read_data {
+ struct file *file;
+
+ /* Fields below are used if sending the read request asynchronously */
+ struct fuse_conn *fc;
+ struct fuse_io_args *ia;
+ unsigned int nr_bytes;
+};
+
+/* forward declarations */
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
+ unsigned cur_bytes, bool write);
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
+ unsigned int count, bool async);
+
+static int fuse_handle_readahead(struct folio *folio,
+ struct readahead_control *rac,
+ struct fuse_fill_read_data *data, loff_t pos,
+ size_t len)
+{
+ struct fuse_io_args *ia = data->ia;
+ size_t off = offset_in_folio(folio, pos);
+ struct fuse_conn *fc = data->fc;
+ struct fuse_args_pages *ap;
+ unsigned int nr_pages;
+
+ if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes,
+ false)) {
+ fuse_send_readpages(ia, data->file, data->nr_bytes,
+ fc->async_read);
+ data->nr_bytes = 0;
+ data->ia = NULL;
+ ia = NULL;
+ }
+ if (!ia) {
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ return -EAGAIN;
+
+ nr_pages = min(fc->max_pages, readahead_count(rac));
+ data->ia = fuse_io_alloc(NULL, nr_pages);
+ if (!data->ia)
+ return -ENOMEM;
+ ia = data->ia;
+ }
+ folio_get(folio);
+ ap = &ia->ap;
+ ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].offset = off;
+ ap->descs[ap->num_folios].length = len;
+ data->nr_bytes += len;
+ ap->num_folios++;
+
+ return 0;
+}
+
+static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx,
+ size_t len)
+{
+ struct fuse_fill_read_data *data = ctx->read_ctx;
+ struct folio *folio = ctx->cur_folio;
+ loff_t pos = iter->pos;
+ size_t off = offset_in_folio(folio, pos);
+ struct file *file = data->file;
+ int ret;
+
+ if (ctx->rac) {
+ ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len);
+ } else {
+ /*
+ * for non-readahead read requests, do reads synchronously
+ * since it's not guaranteed that the server can handle
+ * out-of-order reads
+ */
+ ret = fuse_do_readfolio(file, folio, off, len);
+ if (!ret)
+ iomap_finish_folio_read(folio, off, len, ret);
+ }
+ return ret;
+}
+
+static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx)
+{
+ struct fuse_fill_read_data *data = ctx->read_ctx;
+
+ if (data->ia)
+ fuse_send_readpages(data->ia, data->file, data->nr_bytes,
+ data->fc->async_read);
+}
+
+static const struct iomap_read_ops fuse_iomap_read_ops = {
+ .read_folio_range = fuse_iomap_read_folio_range_async,
+ .submit_read = fuse_iomap_read_submit,
+};
+
static int fuse_read_folio(struct file *file, struct folio *folio)
{
struct inode *inode = folio->mapping->host;
- int err;
+ struct fuse_fill_read_data data = {
+ .file = file,
+ };
+ struct iomap_read_folio_ctx ctx = {
+ .cur_folio = folio,
+ .ops = &fuse_iomap_read_ops,
+ .read_ctx = &data,
- err = -EIO;
- if (fuse_is_bad(inode))
- goto out;
+ };
- err = fuse_do_readfolio(file, folio, 0, folio_size(folio));
- if (!err)
- folio_mark_uptodate(folio);
+ if (fuse_is_bad(inode)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
+ iomap_read_folio(&fuse_iomap_ops, &ctx);
fuse_invalidate_atime(inode);
- out:
- folio_unlock(folio);
- return err;
+ return 0;
}
static int fuse_iomap_read_folio_range(const struct iomap_iter *iter,
@@ -887,7 +1006,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_invalidate_atime(inode);
for (i = 0; i < ap->num_folios; i++) {
- folio_end_read(ap->folios[i], !err);
+ iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset,
+ ap->descs[i].length, err);
folio_put(ap->folios[i]);
}
if (ia->ff)
@@ -897,7 +1017,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
}
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
- unsigned int count)
+ unsigned int count, bool async)
{
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
@@ -919,7 +1039,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
ia->read.attr_ver = fuse_get_attr_version(fm->fc);
- if (fm->fc->async_read) {
+ if (async) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
@@ -936,81 +1056,20 @@ static void fuse_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- unsigned int max_pages, nr_pages;
- struct folio *folio = NULL;
+ struct fuse_fill_read_data data = {
+ .file = rac->file,
+ .fc = fc,
+ };
+ struct iomap_read_folio_ctx ctx = {
+ .ops = &fuse_iomap_read_ops,
+ .rac = rac,
+ .read_ctx = &data
+ };
if (fuse_is_bad(inode))
return;
- max_pages = min_t(unsigned int, fc->max_pages,
- fc->max_read / PAGE_SIZE);
-
- /*
- * This is only accurate the first time through, since readahead_folio()
- * doesn't update readahead_count() from the previous folio until the
- * next call. Grab nr_pages here so we know how many pages we're going
- * to have to process. This means that we will exit here with
- * readahead_count() == folio_nr_pages(last_folio), but we will have
- * consumed all of the folios, and read_pages() will call
- * readahead_folio() again which will clean up the rac.
- */
- nr_pages = readahead_count(rac);
-
- while (nr_pages) {
- struct fuse_io_args *ia;
- struct fuse_args_pages *ap;
- unsigned cur_pages = min(max_pages, nr_pages);
- unsigned int pages = 0;
-
- if (fc->num_background >= fc->congestion_threshold &&
- rac->ra->async_size >= readahead_count(rac))
- /*
- * Congested and only async pages left, so skip the
- * rest.
- */
- break;
-
- ia = fuse_io_alloc(NULL, cur_pages);
- if (!ia)
- break;
- ap = &ia->ap;
-
- while (pages < cur_pages) {
- unsigned int folio_pages;
-
- /*
- * This returns a folio with a ref held on it.
- * The ref needs to be held until the request is
- * completed, since the splice case (see
- * fuse_try_move_page()) drops the ref after it's
- * replaced in the page cache.
- */
- if (!folio)
- folio = __readahead_folio(rac);
-
- folio_pages = folio_nr_pages(folio);
- if (folio_pages > cur_pages - pages) {
- /*
- * Large folios belonging to fuse will never
- * have more pages than max_pages.
- */
- WARN_ON(!pages);
- break;
- }
-
- ap->folios[ap->num_folios] = folio;
- ap->descs[ap->num_folios].length = folio_size(folio);
- ap->num_folios++;
- pages += folio_pages;
- folio = NULL;
- }
- fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
- nr_pages -= pages;
- }
- if (folio) {
- folio_end_read(folio, false);
- folio_put(folio);
- }
+ iomap_readahead(&fuse_iomap_ops, &ctx);
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -1397,20 +1456,6 @@ static const struct iomap_write_ops fuse_iomap_write_ops = {
.read_folio_range = fuse_iomap_read_folio_range,
};
-static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
-{
- iomap->type = IOMAP_MAPPED;
- iomap->length = length;
- iomap->offset = offset;
- return 0;
-}
-
-static const struct iomap_ops fuse_iomap_ops = {
- .iomap_begin = fuse_iomap_begin,
-};
-
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -1834,7 +1879,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
- iomap_finish_folio_write(inode, ap->folios[i], 1);
+ iomap_finish_folio_write(inode, ap->folios[i],
+ ap->descs[i].length);
wake_up(&fi->page_waitq);
}
@@ -2047,7 +2093,7 @@ struct fuse_fill_wb_data {
struct fuse_file *ff;
unsigned int max_folios;
/*
- * nr_bytes won't overflow since fuse_writepage_need_send() caps
+ * nr_bytes won't overflow since fuse_folios_need_send() caps
* wb requests to never exceed fc->max_pages (which has an upper bound
* of U16_MAX).
*/
@@ -2092,14 +2138,15 @@ static void fuse_writepages_send(struct inode *inode,
spin_unlock(&fi->lock);
}
-static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
- unsigned len, struct fuse_args_pages *ap,
- struct fuse_fill_wb_data *data)
+static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
+ unsigned cur_bytes, bool write)
{
struct folio *prev_folio;
struct fuse_folio_desc prev_desc;
- unsigned bytes = data->nr_bytes + len;
+ unsigned bytes = cur_bytes + len;
loff_t prev_pos;
+ size_t max_bytes = write ? fc->max_write : fc->max_read;
WARN_ON(!ap->num_folios);
@@ -2107,8 +2154,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
return true;
- /* Reached max write bytes */
- if (bytes > fc->max_write)
+ if (bytes > max_bytes)
return true;
/* Discontinuity */
@@ -2118,11 +2164,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
if (prev_pos != pos)
return true;
- /* Need to grow the pages array? If so, did the expansion fail? */
- if (ap->num_folios == data->max_folios &&
- !fuse_pages_realloc(data, fc->max_pages))
- return true;
-
return false;
}
@@ -2146,10 +2187,24 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
return -EIO;
}
- if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) {
- fuse_writepages_send(inode, data);
- data->wpa = NULL;
- data->nr_bytes = 0;
+ if (wpa) {
+ bool send = fuse_folios_need_send(fc, pos, len, ap,
+ data->nr_bytes, true);
+
+ if (!send) {
+ /*
+ * Need to grow the pages array? If so, did the
+ * expansion fail?
+ */
+ send = (ap->num_folios == data->max_folios) &&
+ !fuse_pages_realloc(data, fc->max_pages);
+ }
+
+ if (send) {
+ fuse_writepages_send(inode, data);
+ data->wpa = NULL;
+ data->nr_bytes = 0;
+ }
}
if (data->wpa == NULL) {
@@ -2161,7 +2216,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
ap = &wpa->ia.ap;
}
- iomap_start_folio_write(inode, folio, 1);
fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
offset, len);
data->nr_bytes += len;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c2f2a48156d6..f616c1991fed 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -981,14 +981,6 @@ struct fuse_conn {
/* Request timeout (in jiffies). 0 = no timeout */
unsigned int req_timeout;
} timeout;
-
- /*
- * This is a workaround until fuse uses iomap for reads.
- * For fuseblk servers, this represents the blocksize passed in at
- * mount time and for regular fuse servers, this is equivalent to
- * inode->i_blkbits.
- */
- u8 blkbits;
};
/*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d1babf56f254..8ba29ca23244 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -291,7 +291,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
if (attr->blksize)
fi->cached_i_blkbits = ilog2(attr->blksize);
else
- fi->cached_i_blkbits = fc->blkbits;
+ fi->cached_i_blkbits = inode->i_sb->s_blocksize_bits;
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
@@ -1838,22 +1838,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -EINVAL;
if (!sb_set_blocksize(sb, ctx->blksize))
goto err;
- /*
- * This is a workaround until fuse hooks into iomap for reads.
- * Use PAGE_SIZE for the blocksize else if the writeback cache
- * is enabled, buffered writes go through iomap and a read may
- * overwrite partially written data if blocksize < PAGE_SIZE
- */
- fc->blkbits = sb->s_blocksize_bits;
- if (ctx->blksize != PAGE_SIZE &&
- !sb_set_blocksize(sb, PAGE_SIZE))
- goto err;
#endif
fc->sync_fs = 1;
} else {
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
- fc->blkbits = sb->s_blocksize_bits;
}
sb->s_subtype = ctx->subtype;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 47d74afd63ac..38d4f343187a 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -424,11 +424,11 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
struct inode *inode = folio->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- int error;
+ int error = 0;
if (!gfs2_is_jdata(ip) ||
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
- error = iomap_read_folio(folio, &gfs2_iomap_ops);
+ iomap_bio_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
error = stuffed_read_folio(ip, folio);
} else {
@@ -503,7 +503,7 @@ static void gfs2_readahead(struct readahead_control *rac)
else if (gfs2_is_jdata(ip))
mpage_readahead(rac, gfs2_block_map);
else
- iomap_readahead(rac, &gfs2_iomap_ops);
+ iomap_bio_readahead(rac, &gfs2_iomap_ops);
}
/**
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index f7e1c8534c46..a572b8808524 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -14,5 +14,6 @@ iomap-y += trace.o \
iomap-$(CONFIG_BLOCK) += direct-io.o \
ioend.o \
fiemap.o \
- seek.o
+ seek.o \
+ bio.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c
new file mode 100644
index 000000000000..fc045f2e4c45
--- /dev/null
+++ b/fs/iomap/bio.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+#include <linux/pagemap.h>
+#include "internal.h"
+#include "trace.h"
+
+static void iomap_read_end_io(struct bio *bio)
+{
+ int error = blk_status_to_errno(bio->bi_status);
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
+ bio_put(bio);
+}
+
+static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx)
+{
+ struct bio *bio = ctx->read_ctx;
+
+ if (bio)
+ submit_bio(bio);
+}
+
+static int iomap_bio_read_folio_range(const struct iomap_iter *iter,
+ struct iomap_read_folio_ctx *ctx, size_t plen)
+{
+ struct folio *folio = ctx->cur_folio;
+ const struct iomap *iomap = &iter->iomap;
+ loff_t pos = iter->pos;
+ size_t poff = offset_in_folio(folio, pos);
+ loff_t length = iomap_length(iter);