aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c417
1 files changed, 277 insertions, 140 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 396dc3a5d16b..c2c2d6ac7f3d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -29,7 +29,7 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
-#include <linux/pagevec.h>
+#include <linux/folio_batch.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/namei.h>
@@ -128,6 +128,8 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
+ struct jbd2_inode *jinode = READ_ONCE(EXT4_I(inode)->jinode);
+
trace_ext4_begin_ordered_truncate(inode, new_size);
/*
* If jinode is zero, then we never opened the file for
@@ -135,10 +137,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
* jbd2_journal_begin_ordered_truncate() since there's no
* outstanding writes we need to flush.
*/
- if (!EXT4_I(inode)->jinode)
+ if (!jinode)
return 0;
return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
- EXT4_I(inode)->jinode,
+ jinode,
new_size);
}
@@ -184,8 +186,18 @@ void ext4_evict_inode(struct inode *inode)
if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
- truncate_inode_pages_final(&inode->i_data);
+ /*
+ * If there are dirty pages, truncating them will lead to
+ * data loss and the user could see stale data.
+ */
+ if (unlikely(!ext4_emergency_state(inode->i_sb) &&
+ mapping_tagged(&inode->i_data, PAGECACHE_TAG_DIRTY)))
+ ext4_warning_inode(inode, "data will be lost");
+ truncate_inode_pages_final(&inode->i_data);
+ /* Avoid mballoc special inode which has no proper iops */
+ if (!EXT4_SB(inode->i_sb)->s_journal)
+ mmb_sync(&EXT4_I(inode)->i_metadata_bhs);
goto no_delete;
}
@@ -262,7 +274,7 @@ void ext4_evict_inode(struct inode *inode)
err = ext4_truncate(inode);
if (err) {
ext4_error_err(inode->i_sb, -err,
- "couldn't truncate inode %lu (err %d)",
+ "couldn't truncate inode %llu (err %d)",
inode->i_ino, err);
goto stop_handle;
}
@@ -342,7 +354,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
spin_lock(&ei->i_block_reservation_lock);
trace_ext4_da_update_reserve_space(inode, used, quota_claim);
if (unlikely(used > ei->i_reserved_data_blocks)) {
- ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
+ ext4_warning(inode->i_sb, "%s: ino %llu, used %d "
"with only %d reserved data blocks",
__func__, inode->i_ino, used,
ei->i_reserved_data_blocks);
@@ -405,7 +417,10 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len);
if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
- return fscrypt_zeroout_range(inode, lblk, pblk, len);
+ return fscrypt_zeroout_range(inode,
+ (loff_t)lblk << inode->i_blkbits,
+ pblk << (inode->i_blkbits - SECTOR_SHIFT),
+ (u64)len << inode->i_blkbits);
ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
if (ret > 0)
@@ -475,7 +490,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
if (es_map->m_lblk != map->m_lblk ||
es_map->m_flags != map->m_flags ||
es_map->m_pblk != map->m_pblk) {
- printk("ES cache assertion failed for inode: %lu "
+ printk("ES cache assertion failed for inode: %llu "
"es_cached ex [%d/%d/%llu/%x] != "
"found ex [%d/%d/%llu/%x] retval %d flags %x\n",
inode->i_ino, es_map->m_lblk, es_map->m_len,
@@ -515,7 +530,7 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
if (unlikely(retval != map2.m_len)) {
ext4_warning(inode->i_sb,
"ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
+ "%llu: retval %d != map->m_len %d",
inode->i_ino, retval, map2.m_len);
WARN_ON(1);
}
@@ -563,7 +578,7 @@ int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
"ES len assertion failed for inode "
- "%lu: retval %d != map->m_len %d",
+ "%llu: retval %d != map->m_len %d",
inode->i_ino, retval, map->m_len);
WARN_ON(1);
}
@@ -630,7 +645,7 @@ int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
- "ES len assertion failed for inode %lu: "
+ "ES len assertion failed for inode %llu: "
"retval %d != map->m_len %d",
inode->i_ino, retval, map->m_len);
WARN_ON(1);
@@ -937,7 +952,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
{
int ret = 0;
- ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
+ ext4_debug("ext4_get_block_unwritten: inode %llu, create flag %d\n",
inode->i_ino, create);
ret = _ext4_get_block(inode, iblock, bh_result,
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
@@ -1420,9 +1435,6 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
/*
* We need to pick up the new inode size which generic_commit_write gave us
* `iocb` can be NULL - eg, when called from page_symlink().
- *
- * ext4 never places buffers on inode->i_mapping->i_private_list. metadata
- * buffers are managed internally.
*/
static int ext4_write_end(const struct kiocb *iocb,
struct address_space *mapping,
@@ -1456,10 +1468,9 @@ static int ext4_write_end(const struct kiocb *iocb,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity) {
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
- ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
- }
+
/*
* Don't mark the inode dirty under folio lock. First, it unnecessarily
* makes the holding time of folio lock longer. Second, it forces lock
@@ -1574,10 +1585,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity) {
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
- ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
- }
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1659,7 +1668,7 @@ void ext4_da_release_space(struct inode *inode, int to_free)
* harmless to return without any action.
*/
ext4_warning(inode->i_sb, "ext4_da_release_space: "
- "ino %lu, to_free %d with only %d reserved "
+ "ino %llu, to_free %d with only %d reserved "
"data blocks", inode->i_ino, to_free,
ei->i_reserved_data_blocks);
WARN_ON(1);
@@ -1747,8 +1756,22 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
if (invalidate) {
- if (folio_mapped(folio))
+ if (folio_mapped(folio)) {
folio_clear_dirty_for_io(folio);
+ /*
+ * Unmap folio from page
+ * tables to prevent
+ * subsequent accesses through
+ * stale PTEs. This ensures
+ * future accesses trigger new
+ * page faults rather than
+ * reusing the invalidated
+ * folio.
+ */
+ unmap_mapping_pages(folio->mapping,
+ folio->index,
+ folio_nr_pages(folio), false);
+ }
block_invalidate_folio(folio, 0,
folio_size(folio));
folio_clear_uptodate(folio);
@@ -2491,7 +2514,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
}
ext4_msg(sb, KERN_CRIT,
"Delayed block allocation failed for "
- "inode %lu at logical offset %llu with"
+ "inode %llu at logical offset %llu with"
" max blocks %u with error %d",
inode->i_ino,
(unsigned long long)map->m_lblk,
@@ -2535,7 +2558,7 @@ update_disksize:
err2 = ext4_mark_inode_dirty(handle, inode);
if (err2) {
ext4_error_err(inode->i_sb, -err2,
- "Failed to mark inode %lu dirty",
+ "Failed to mark inode %llu dirty",
inode->i_ino);
}
if (!err)
@@ -2909,7 +2932,7 @@ retry:
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
- "%ld pages, ino %lu; err %d", __func__,
+ "%ld pages, ino %llu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
/* Release allocated io_end */
ext4_put_io_end(mpd->io_submit.io_end);
@@ -3031,17 +3054,23 @@ static int ext4_writepages(struct address_space *mapping,
int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
+ loff_t range_start, range_end;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
- .range_start = jinode->i_dirty_start,
- .range_end = jinode->i_dirty_end,
};
struct mpage_da_data mpd = {
.inode = jinode->i_vfs_inode,
.wbc = &wbc,
.can_map = 0,
};
+
+ if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end))
+ return 0;
+
+ wbc.range_start = range_start;
+ wbc.range_end = range_end;
+
return ext4_do_writepages(&mpd);
}
@@ -3196,7 +3225,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool disksize_changed = false;
- loff_t new_i_size, zero_len = 0;
+ loff_t new_i_size;
handle_t *handle;
if (unlikely(!folio_buffers(folio))) {
@@ -3240,19 +3269,15 @@ static int ext4_da_do_write_end(struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- if (pos > old_size) {
+ if (pos > old_size)
pagecache_isize_extended(inode, old_size, pos);
- zero_len = pos - old_size;
- }
- if (!disksize_changed && !zero_len)
+ if (!disksize_changed)
return copied;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
- if (zero_len)
- ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -3437,7 +3462,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
}
/* Any metadata buffers to write? */
- if (!list_empty(&inode->i_mapping->i_private_list))
+ if (mmb_has_buffers(&EXT4_I(inode)->i_metadata_bhs))
return true;
return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
}
@@ -4002,12 +4027,11 @@ void ext4_set_aops(struct inode *inode)
* ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
* racing writeback can come later and flush the stale pagecache to disk.
*/
-static int __ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length)
+static struct buffer_head *ext4_load_tail_bh(struct inode *inode, loff_t from)
{
unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
- struct inode *inode = mapping->host;
+ struct address_space *mapping = inode->i_mapping;
struct buffer_head *bh;
struct folio *folio;
int err = 0;
@@ -4016,7 +4040,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping_gfp_constraint(mapping, ~__GFP_FS));
if (IS_ERR(folio))
- return PTR_ERR(folio);
+ return ERR_CAST(folio);
blocksize = inode->i_sb->s_blocksize;
@@ -4068,47 +4092,92 @@ static int __ext4_block_zero_page_range(handle_t *handle,
}
}
}
- if (ext4_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
- EXT4_JTR_NONE);
- if (err)
- goto unlock;
- }
- folio_zero_range(folio, offset, length);
+ return bh;
+
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return err ? ERR_PTR(err) : NULL;
+}
+
+static int ext4_block_do_zero_range(struct inode *inode, loff_t from,
+ loff_t length, bool *did_zero,
+ bool *zero_written)
+{
+ struct buffer_head *bh;
+ struct folio *folio;
+
+ bh = ext4_load_tail_bh(inode, from);
+ if (IS_ERR_OR_NULL(bh))
+ return PTR_ERR_OR_ZERO(bh);
+
+ folio = bh->b_folio;
+ folio_zero_range(folio, offset_in_folio(folio, from), length);
BUFFER_TRACE(bh, "zeroed end of block");
- if (ext4_should_journal_data(inode)) {
- err = ext4_dirty_journalled_data(handle, bh);
- } else {
- mark_buffer_dirty(bh);
- /*
- * Only the written block requires ordered data to prevent
- * exposing stale data.
- */
- if (!buffer_unwritten(bh) && !buffer_delay(bh) &&
- ext4_should_order_data(inode))
- err = ext4_jbd2_inode_add_write(handle, inode, from,
- length);
+ mark_buffer_dirty(bh);
+ if (did_zero)
+ *did_zero = true;
+ if (zero_written && !buffer_unwritten(bh) && !buffer_delay(bh))
+ *zero_written = true;
+
+ folio_unlock(folio);
+ folio_put(folio);
+ return 0;
+}
+
+static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from,
+ loff_t length, bool *did_zero)
+{
+ struct buffer_head *bh;
+ struct folio *folio;
+ handle_t *handle;
+ int err;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ bh = ext4_load_tail_bh(inode, from);
+ if (IS_ERR_OR_NULL(bh)) {
+ err = PTR_ERR_OR_ZERO(bh);
+ goto out_handle;
}
+ folio = bh->b_folio;
-unlock:
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ folio_zero_range(folio, offset_in_folio(folio, from), length);
+ BUFFER_TRACE(bh, "zeroed end of block");
+
+ err = ext4_dirty_journalled_data(handle, bh);
+ if (err)
+ goto out;
+
+ if (did_zero)
+ *did_zero = true;
+out:
folio_unlock(folio);
folio_put(folio);
+out_handle:
+ ext4_journal_stop(handle);
return err;
}
/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'. The range to be zero'd must
- * be contained with in one block. If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that corresponds to 'from'
+ * Zeros out a mapping of length 'length' starting from file offset
+ * 'from'. The range to be zeroed must be contained within one block.
+ * If the specified range exceeds the end of the block it will be
+ * shortened to end of the block that corresponds to 'from'.
*/
-static int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length)
+static int ext4_block_zero_range(struct inode *inode,
+ loff_t from, loff_t length, bool *did_zero,
+ bool *zero_written)
{
- struct inode *inode = mapping->host;
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned int max = blocksize - (from & (blocksize - 1));
@@ -4120,40 +4189,73 @@ static int ext4_block_zero_page_range(handle_t *handle,
length = max;
if (IS_DAX(inode)) {
- return dax_zero_range(inode, from, length, NULL,
+ return dax_zero_range(inode, from, length, did_zero,
&ext4_iomap_ops);
+ } else if (ext4_should_journal_data(inode)) {
+ return ext4_block_journalled_zero_range(inode, from, length,
+ did_zero);
}
- return __ext4_block_zero_page_range(handle, mapping, from, length);
+ return ext4_block_do_zero_range(inode, from, length, did_zero,
+ zero_written);
}
/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
+ * Zero out a mapping from file offset 'from' up to the end of the block
+ * which corresponds to 'from' or to the given 'end' inside this block.
+ * This is required during truncate up and when performing append writes. We need
+ * to physically zero the tail end of that block so it doesn't yield old
+ * data if the file is grown.
*/
-static int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from)
+int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
{
- unsigned length;
- unsigned blocksize;
- struct inode *inode = mapping->host;
+ unsigned int blocksize = i_blocksize(inode);
+ unsigned int offset;
+ loff_t length = end - from;
+ bool did_zero = false;
+ bool zero_written = false;
+ int err;
+ offset = from & (blocksize - 1);
+ if (!offset || from >= end)
+ return 0;
/* If we are processing an encrypted inode during orphan list handling */
if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
return 0;
- blocksize = i_blocksize(inode);
- length = blocksize - (from & (blocksize - 1));
+ if (length > blocksize - offset)
+ length = blocksize - offset;
+
+ err = ext4_block_zero_range(inode, from, length,
+ &did_zero, &zero_written);
+ if (err)
+ return err;
+ /*
+ * It's necessary to order the zeroed data before updating i_disksize
+ * when truncating up or performing an append write, because stale
+ * on-disk data might otherwise be exposed by a concurrent post-EOF
+ * mmap write racing with folio writeback.
+ */
+ if (ext4_should_order_data(inode) &&
+ did_zero && zero_written && !IS_DAX(inode)) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_jbd2_inode_add_write(handle, inode, from, length);
+ ext4_journal_stop(handle);
+ if (err)
+ return err;
+ }
- return ext4_block_zero_page_range(handle, mapping, from, length);
+ return 0;
}
-int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
- loff_t lstart, loff_t length)
+int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length,
+ bool *did_zero)
{
struct super_block *sb = inode->i_sb;
- struct address_space *mapping = inode->i_mapping;
unsigned partial_start, partial_end;
ext4_fsblk_t start, end;
loff_t byte_end = (lstart + length - 1);
@@ -4168,22 +4270,21 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
/* Handle partial zero within the single block */
if (start == end &&
(partial_start || (partial_end != sb->s_blocksize - 1))) {
- err = ext4_block_zero_page_range(handle, mapping,
- lstart, length);
+ err = ext4_block_zero_range(inode, lstart, length, did_zero,
+ NULL);
return err;
}
/* Handle partial zero out on the start of the range */
if (partial_start) {
- err = ext4_block_zero_page_range(handle, mapping,
- lstart, sb->s_blocksize);
+ err = ext4_block_zero_range(inode, lstart, sb->s_blocksize,
+ did_zero, NULL);
if (err)
return err;
}
/* Handle partial zero out on the end of the range */
if (partial_end != sb->s_blocksize - 1)
- err = ext4_block_zero_page_range(handle, mapping,
- byte_end - partial_end,
- partial_end + 1);
+ err = ext4_block_zero_range(inode, byte_end - partial_end,
+ partial_end + 1, did_zero, NULL);
return err;
}
@@ -4332,6 +4433,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
+ bool partial_zeroed = false;
int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
@@ -4358,17 +4460,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
end = max_end;
length = end - offset;
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of partial
- * block.
- */
- if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
- ret = ext4_inode_attach_jinode(inode);
- if (ret < 0)
- return ret;
- }
-
-
ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
return ret;
@@ -4378,8 +4469,18 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (ret)
return ret;
+ ret = ext4_zero_partial_blocks(inode, offset, length, &partial_zeroed);
+ if (ret)
+ return ret;
+ if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset,
+ end - 1);
+ if (ret)
+ return ret;
+ }
+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_chunk_trans_extent(inode, 2);
+ credits = ext4_chunk_trans_extent(inode, 0);
else
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
@@ -4389,10 +4490,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
return ret;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset, length);
- if (ret)
- goto out_handle;
-
/* If there are blocks to remove, do it */
start_lblk = EXT4_B_TO_LBLK(inode, offset);
end_lblk = end >> inode->i_blkbits;
@@ -4429,7 +4526,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out_handle;
ext4_update_inode_fsync_trans(handle, inode, 1);
- if (IS_SYNC(inode))
+ if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
@@ -4451,8 +4548,13 @@ int ext4_inode_attach_jinode(struct inode *inode)
spin_unlock(&inode->i_lock);
return -ENOMEM;
}
- ei->jinode = jinode;
- jbd2_journal_init_jbd_inode(ei->jinode, inode);
+ jbd2_journal_init_jbd_inode(jinode, inode);
+ /*
+ * Publish ->jinode only after it is fully initialized so that
+ * readers never observe a partially initialized jbd2_inode.
+ */
+ smp_wmb();
+ WRITE_ONCE(ei->jinode, jinode);
jinode = NULL;
}
spin_unlock(&inode->i_lock);
@@ -4495,7 +4597,6 @@ int ext4_truncate(struct inode *inode)
unsigned int credits;
int err = 0, err2;
handle_t *handle;
- struct address_space *mapping = inode->i_mapping;
/*
* There is a possibility that we're either freeing the inode
@@ -4525,6 +4626,11 @@ int ext4_truncate(struct inode *inode)
err = ext4_inode_attach_jinode(inode);
if (err)
goto out_trace;
+
+ /* Zero to the end of the block containing i_size */
+ err = ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX);
+ if (err)
+ goto out_trace;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4538,9 +4644,6 @@ int ext4_truncate(struct inode *inode)
goto out_trace;
}
- if (inode->i_size & (inode->i_sb->s_blocksize - 1))
- ext4_block_truncate_page(handle, mapping, inode->i_size);
-
/*
* We add the inode to the orphan list, so that if this
* truncate spans multiple transactions, and we crash, we will
@@ -5401,18 +5504,36 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- if (inode->i_size == 0 ||
- inode->i_size >= sizeof(ei->i_data) ||
- strnlen((char *)ei->i_data, inode->i_size + 1) !=
- inode->i_size) {
- ext4_error_inode(inode, function, line, 0,
- "invalid fast symlink length %llu",
- (unsigned long long)inode->i_size);
- ret = -EFSCORRUPTED;
- goto bad_inode;
+
+ /*
+ * Orphan cleanup can see inodes with i_size == 0
+ * and i_data uninitialized. Skip size checks in
+ * that case. This is safe because the first thing
+ * ext4_evict_inode() does for fast symlinks is
+ * clearing of i_data and i_size.
+ */
+ if ((EXT4_SB(sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ if (inode->i_nlink != 0) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid orphan symlink nlink %d",
+ inode->i_nlink);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+ } else {
+ if (inode->i_size == 0 ||
+ inode->i_size >= sizeof(ei->i_data) ||
+ strnlen((char *)ei->i_data, inode->i_size + 1) !=
+ inode->i_size) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid fast symlink length %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+ inode_set_cached_link(inode, (char *)ei->i_data,
+ inode->i_size);
}
- inode_set_cached_link(inode, (char *)ei->i_data,
- inode->i_size);
} else {
inode->i_op = &ext4_symlink_inode_operations;
}
@@ -5849,6 +5970,18 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (attr->ia_size == inode->i_size)
inc_ivers = false;
+ /*
+ * If file has inline data but new size exceeds inline capacity,
+ * convert to extent-based storage first to prevent inconsistent
+ * state (inline flag set but size exceeds inline capacity).
+ */
+ if (ext4_has_inline_data(inode) &&
+ attr->ia_size > EXT4_I(inode)->i_inline_size) {
+ error = ext4_convert_inline_data(inode);
+ if (error)
+ goto err_out;
+ }
+
if (shrink) {
if (ext4_should_order_data(inode)) {
error = ext4_begin_ordered_truncate(inode,
@@ -5880,15 +6013,6 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto out_mmap_sem;
}
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto out_mmap_sem;
- }
- if (ext4_handle_valid(handle) && shrink) {
- error = ext4_orphan_add(handle, inode);
- orphan = 1;
- }
/*
* Update c/mtime and tail zero the EOF folio on
* truncate up. ext4_truncate() handles the shrink case
@@ -5897,9 +6021,22 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (!shrink) {
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
- if (oldsize & (inode->i_sb->s_blocksize - 1))
- ext4_block_truncate_page(handle,
- inode->i_mapping, oldsize);
+ if (oldsize & (inode->i_sb->s_blocksize - 1)) {
+ error = ext4_block_zero_eof(inode,
+ oldsize, LLONG_MAX);
+ if (error)
+ goto out_mmap_sem;
+ }
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto out_mmap_sem;
+ }
+ if (ext4_handle_valid(handle) && shrink) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
}
if (shrink)