aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4/mballoc.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-12 10:19:58 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-12 10:19:58 -0800
commit5903c871e21498405d11c5699d06becd12acda24 (patch)
treefcf0e899b9e5cf106fa6f4d72a333d53b44b7010 /fs/ext4/mballoc.c
parent178574549e421755dbe1e4721e20ca6c10af4900 (diff)
parent4f5e8e6f012349a107531b02eed5b5ace6181449 (diff)
Merge tag 'ext4_for_linus-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: "New features and improvements for the ext4 file system - Avoid unnecessary cache invalidation in the extent status cache (es_cache) when adding extents to be cached in the es_cache and we are not changing the extent tree - Add a sysfs parameter, err_report_sec, to control how frequently to log a warning message that file system inconsistency has been detected (Previously we logged the warning message every 24 hours) - Avoid unnecessary forced ordered writes when appending to a file when delayed allocation is enabled - Defer splitting unwritten extents to I/O completion to improve write performance of concurrent direct I/O writes to multiple files - Refactor and add kunit tests to the extent splitting and conversion code paths Various Bug Fixes: - Fix a panic when the debugging DOUBLE_CHECK macro is defined - Avoid using fast commit for rare and complex file system operations to make fast commit easier to reason about. This can also avoid some corner cases that could result in file system inconsistency if there is a crash between the fast commit before a subsequent full commit - Fix memory leaks in error paths - Fix a false positive reports caused when running stress tests using mixed huge-page workloads caused by a race between page migration and bitmap updates - Fix a potential recursion into file system reclaim when evicting an inode when fast commit is enabled - Fix a warning caused by a potential double decrement to the dirty clusters counter when executing FS_IOC_SHUTDOWN when running a stress test - Enable mballoc optimized scanning regardless whether the inode is using indirect blocks or extent trees to map blocks" * tag 'ext4_for_linus-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (45 commits) et4: allow zeroout when doing written to unwritten split ext4: refactor split and convert extents ext4: refactor zeroout path and handle all cases ext4: propagate flags to ext4_convert_unwritten_extents_endio() ext4: propagate flags to convert_initialized_extent() ext4: add extent status cache support to kunit tests ext4: kunit tests for higher level extent manipulation functions ext4: kunit tests for extent splitting and conversion ext4: use optimized mballoc scanning regardless of inode format ext4: always allocate blocks only from groups inode can use ext4: fix dirtyclusters double decrement on fs shutdown ext4: fast commit: make s_fc_lock reclaim-safe ext4: fix e4b bitmap inconsistency reports ext4: remove redundant NULL check after __GFP_NOFAIL ext4: remove EXT4_GET_BLOCKS_IO_CREATE_EXT ext4: simplify the mapping query logic in ext4_iomap_begin() ext4: remove unused unwritten parameter in ext4_dio_write_iter() ext4: remove useless ext4_iomap_overwrite_ops ext4: avoid starting handle when dio writing an unwritten extent ext4: don't split extent before submitting I/O ...
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--fs/ext4/mballoc.c73
1 files changed, 36 insertions, 37 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e817a758801d..b99d1a7e580e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -892,6 +892,21 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
}
}
+static ext4_group_t ext4_get_allocation_groups_count(
+ struct ext4_allocation_context *ac)
+{
+ ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
+
+ /* non-extent files are limited to low blocks/groups */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = EXT4_SB(ac->ac_sb)->s_blockfile_groups;
+
+ /* Pairs with smp_wmb() in ext4_update_super() */
+ smp_rmb();
+
+ return ngroups;
+}
+
static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
struct xarray *xa,
ext4_group_t start, ext4_group_t end)
@@ -899,7 +914,7 @@ static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
enum criteria cr = ac->ac_criteria;
- ext4_group_t ngroups = ext4_get_groups_count(sb);
+ ext4_group_t ngroups = ext4_get_allocation_groups_count(ac);
unsigned long group = start;
struct ext4_group_info *grp;
@@ -951,7 +966,7 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
ext4_group_t start, end;
start = group;
- end = ext4_get_groups_count(ac->ac_sb);
+ end = ext4_get_allocation_groups_count(ac);
wrap_around:
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
@@ -1001,7 +1016,7 @@ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
ext4_group_t start, end;
start = group;
- end = ext4_get_groups_count(ac->ac_sb);
+ end = ext4_get_allocation_groups_count(ac);
wrap_around:
i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
@@ -1083,7 +1098,7 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
min_order = fls(ac->ac_o_ex.fe_len);
start = group;
- end = ext4_get_groups_count(ac->ac_sb);
+ end = ext4_get_allocation_groups_count(ac);
wrap_around:
for (i = order; i >= min_order; i--) {
int frag_order;
@@ -1133,8 +1148,6 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
return 0;
if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
return 0;
- if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
- return 0;
return 1;
}
@@ -1182,11 +1195,7 @@ static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
int ret = 0;
ext4_group_t start;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
-
- /* non-extent files are limited to low blocks/groups */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
- ngroups = sbi->s_blockfile_groups;
+ ext4_group_t ngroups = ext4_get_allocation_groups_count(ac);
/* searching for the right group start from the goal value specified */
start = ac->ac_g_ex.fe_group;
@@ -1706,16 +1715,17 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
/* Avoid locking the folio in the fast path ... */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
- if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) {
+ /*
+ * folio_test_locked is employed to detect ongoing folio
+ * migrations, since concurrent migrations can lead to
+ * bitmap inconsistency. And if we are not uptodate that
+ * implies somebody just created the folio but is yet to
+ * initialize it. We can drop the folio reference and
+ * try to get the folio with lock in both cases to avoid
+ * concurrency.
+ */
if (!IS_ERR(folio))
- /*
- * drop the folio reference and try
- * to get the folio with lock. If we
- * are not uptodate that implies
- * somebody just created the folio but
- * is yet to initialize it. So
- * wait for it to initialize.
- */
folio_put(folio);
folio = __filemap_get_folio(inode->i_mapping, pnum,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
@@ -1764,7 +1774,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
/* we need another folio for the buddy */
folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0);
- if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
+ if (IS_ERR(folio) || !folio_test_uptodate(folio) || folio_test_locked(folio)) {
if (!IS_ERR(folio))
folio_put(folio);
folio = __filemap_get_folio(inode->i_mapping, pnum,
@@ -4185,8 +4195,7 @@ out_err:
* Returns 0 if success or error code
*/
static noinline_for_stack int
-ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle, unsigned int reserv_clstrs)
+ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle)
{
struct ext4_group_desc *gdp;
struct ext4_sb_info *sbi;
@@ -4241,13 +4250,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
BUG_ON(changed != ac->ac_b_ex.fe_len);
#endif
percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
- /*
- * Now reduce the dirty block count also. Should not go negative
- */
- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
- /* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyclusters_counter,
- reserv_clstrs);
return err;
}
@@ -6331,7 +6333,7 @@ repeat:
ext4_mb_pa_put_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle);
if (*errp) {
ext4_discard_allocated_blocks(ac);
goto errout;
@@ -6362,12 +6364,9 @@ errout:
out:
if (inquota && ar->len < inquota)
dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
- if (!ar->len) {
- if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
- /* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyclusters_counter,
- reserv_clstrs);
- }
+ /* release any reserved blocks */
+ if (reserv_clstrs)
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs);
trace_ext4_allocate_blocks(ar, (unsigned long long)block);