// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
* Written by Alex Tomas <alex@clusterfs.com>
*
* Architecture independence:
* Copyright (c) 2005, Bull S.A.
* Written by Pierre Peiffer <pierre.peiffer@bull.net>
*/
/*
* Extents support for EXT4
*
* TODO:
* - ext4*_error() should be used in some situations
* - analyze all BUG()/BUG_ON(), use -EIO where appropriate
* - smart tree reduction
*/
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/backing-dev.h>
#include <linux/iomap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"
#include <trace/events/ext4.h>
/*
* used by extent splitting.
*/
#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
EXT4_EXTENT_TAIL_OFFSET(eh));
return cpu_to_le32(csum);
}
static int ext4_extent_block_csum_verify(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_extent_tail *et;
if (!ext4_has_metadata_csum(inode->i_sb))
return 1;
et = find_ext4_extent_tail(eh);
if (et->et_checksum != ext4_extent_block_csum(inode, eh))
return 0;
return 1;
}
static void ext4_extent_block_csum_set(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_extent_tail *et;
if (!ext4_has_metadata_csum(inode->i_sb))
return;
et = find_ext4_extent_tail(eh);
et->et_checksum = ext4_extent_block_csum(inode, eh);
}
static int ext4_split_extent_at(handle_t *handle,
struct inode *inode,
struct ext4_ext_path **ppath,
ext4_lblk_t split,
int split_flag,
int flags);
static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
{
/*
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
* i_mutex. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
}
/*
* Make sure 'handle' has at least 'check_cred' credits. If not, restart
* transaction with 'restart_cred' credits. The function drops i_data_sem
* when restarting transaction and gets it after transaction is restarted.
*
* The function returns 0 on success, 1 if transaction had to be restarted,
* and < 0 in case of fatal error.
*/
int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
int check_cred, int restart_cred,
int revoke_cred)
{
int ret;
int dropped = 0;
ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
if (dropped)
down_write(&EXT4_I(inode)->i_data_sem);
return ret;
}
/*
* could return:
* - EROFS
* - ENOMEM
*/
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
return ext4_journal_get_write_access(handle, path->p_bh);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
return 0;
}
/*
* could return:
* - EROFS
* - ENOMEM
* - EIO
*/
static int __ext4_ext_dirty(const char *where, unsigned int line,
handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
int err;
WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
if (path->p_bh) {
ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
}
return err;
}
#define ext4_ext_dirty(handle, inode, path) \
__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t block)
{
if (path) {
int depth = path->p_depth;
struct ext4_extent *ex;
/*
* Try to predict block placement assuming that we are
* filling in a file which will eventually be
* non-sparse --- i.e., in the case of libbfd writing
* an ELF object sections out-of-order but in a way
* the eventually results in a contiguous object or
* executable file, or some database extending a table
* space file. However, this is actually somewhat
* non-ideal if we are writing a sparse file such as
* qemu or KVM writing a raw image file that is going
* to stay fairly sparse, since it will end up
* fragmenting the file system's free space. Maybe we
* should have some hueristics or some way to allow
* userspace to pass a hint to file system,
* especially if the latter case turns out to be
* common.
*/
ex = path[depth].p_ext;
if (ex) {
ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
if (block > ext_block)
return ext_pblk + (block - ext_block);
else
return ext_pblk - (ext_block - block);
}
/* it looks like index is empty;
* try to find starting block from index itself */
if (path[depth].p_bh)
return path[depth].p_bh->b_blocknr;
}
/* OK. use inode's group */
return ext4_inode_to_goal_block(inode);
}
/*
* Allocation for a meta data block
*/
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err, unsigned int flags)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
NULL, err);
return newblock;
}
static inline int ext4_ext_space_block(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
if (!check && size > 6)
size = 6;
#endif
return size;
}
static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
int size;
size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
if (!check && size > 5)
size = 5;
#endif
return size;
}
static inline int ext4_ext_space_root(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
if (!check && size > 3)
size = 3;
#endif
return size;
}
static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
int size;
size = sizeof(EXT4_I(inode)->i_data);
size -= sizeof(struct ext4_extent_header);
size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
if (!check && size > 4)
size = 4;
#endif
return size;
}
static inline int
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
struct ext4_ext_path **ppath, ext4_lblk_t lblk,
int nofail)
{
struct ext4_ext_path *path = *ppath;
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
flags);
}
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
int max;
if (depth == ext_depth(inode)) {
if (depth == 0)
max = ext4_ext_space_root(inode, 1);
else
max = ext4_ext_space_root_idx(inode, 1);
} else {
if (depth == 0)
max = ext4_ext_space_block(inode, 1);
else
max = ext4_ext_space_block_idx(inode, 1);
}
return max;
}
static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
ext4_fsblk_t block = ext4_ext_pblock(ext);
int len = ext4_ext_get_actual_len(ext);
ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
/*
* We allow neither:
* - zero length
* - overflow/wrap-around
*/
if (lblock + len <= lblock)
return 0;
return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}
static int ext4_valid_extent_idx(struct inode *inode,
struct ext4_extent_idx *ext_idx)
{
ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}
static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent_header *eh,
ext4_fsblk_t *pblk, int depth)
{
unsigned short entries;
if (eh->eh_entries == 0)
return 1;
entries = le16_to_cpu(eh->eh_entries);
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
ext4_lblk_t lblock = 0;
ext4_lblk_t prev = 0;
int len = 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
len = ext4_ext_get_actual_len(ext);
if ((lblock <= prev) && prev) {
*pblk = ext4_ext_pblock(ext);
return 0;
}
ext++;
entries--;
prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
ext_idx++;
entries--;
}
}
return 1;
}
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
int max = 0, err = -EFSCORRUPTED;
if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
error_msg = "invalid magic";
goto corrupted;
}
if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
error_msg = "unexpected eh_depth";
goto corrupted;
}
if (unlikely(eh->eh_max == 0)) {
error_msg = "invalid eh_max";
goto corrupted;
}
max = ext4_ext_max_entries(inode, depth);
if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
error_msg = "too large eh_max";
goto corrupted;
}
if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
error_msg = "invalid eh_entries";
goto corrupted;
}
if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
if (unlikely(depth > 32)) {
error_msg = "too large eh_depth";
goto corrupted;
}
/* Verify checksum on non-root extent tree nodes */
if (ext_depth(inode) != depth &&
!ext4_extent_block_csum_verify(inode, eh)) {
error_msg = "extent tree corrupted";
err = -EFSBADCRC;
goto corrupted;
}
return 0;
corrupted:
ext4_error_inode_err(inode, function, line, 0, -err,
"pblk %llu bad header/extent: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
(unsigned long long) pblk, error_msg,
le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries),
le16_to_cpu(eh->eh_max),
max, le16_to_cpu(eh->eh_depth), depth);
return err;
}
#define ext4_ext_check(inode, eh, depth, pblk) \
__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
int ext4_ext_check_inode(struct inode *inode)
{
return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}
static void ext4_cache_extents(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
ext4_lblk_t prev = 0;
int i;
for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
unsigned int status = EXTENT_STATUS_WRITTEN;
ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
int len = ext4_ext_get_actual_len(ex);
if (prev && (prev != lblk))
ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
EXTENT_STATUS_HOLE);
if (ext4_ext_is_unwritten(ex))
status = EXTENT_STATUS_UNWRITTEN;
ext4_es_cache_extent(inode, lblk, len,
ext4_ext_pblock(ex), status);
prev = lblk + len;
}
}
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
struct inode *inode, ext4_fsblk_t pblk, int depth,
int flags)
{
struct buffer_head *bh;
int err;
gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
err = bh_submit_read(bh);
if (err < 0)
goto errout;
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
if (!ext4_has_feature_journal(inode->i_sb) ||
(inode->i_ino !=
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
err = __ext4_ext_check(function, line, inode,
ext_block_hdr(bh), depth, pblk);
if (err)
goto errout;
}
set_buffer_verified(bh);
/*
* If this is a leaf block, cache all of its entries
*/
if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
struct ext4_extent_header *eh = ext_block_hdr(bh);
ext4_cache_extents(inode, eh);
}
return bh;
errout:
put_bh(bh);
return ERR_PTR(err);
}
#define read_extent_tree_block(inode, pblk, depth, flags) \
__read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
(depth), (flags))
/*
* This function is called to cache a file's extent information in the
* extent status tree
*/
int ext4_ext_precache(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_ext_path *path = NULL;
struct buffer_head *bh;
int i = 0, depth, ret = 0;
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return 0; /* not an extent-mapped inode */
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
/* Don't cache anything if there are no external extent blocks */
if (!depth) {
up_read(&ei->i_data_sem);
return ret;
}
path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
GFP_NOFS);
if (path == NULL) {
up_read(&ei->i_data_sem);
return -ENOMEM;
}
path[0].p_hdr = ext_inode_hdr(inode);
ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
if (ret)
goto out;
path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
while (i >= 0) {
/*
* If this is a leaf block or we've reached the end of
* the index block, go up
*/
if ((i == depth) ||
path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
brelse(path[i].p_bh);
path[i].p_bh = NULL;
i--;
continue;
}
bh = read_extent_tree_block(inode,
ext4_idx_pblock(path[i].p_idx++),
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
break;
}
i++;
path[i].p_bh = bh;
path[i].p_hdr = ext_block_hdr(bh);
path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
}
ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
up_read(&ei->i_data_sem);
ext4_ext_drop_refs(path);
kfree(path);
return ret;
}
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
int k, l = path->p_depth;
ext_debug(inode, "path:");
for (k = 0; k <= l; k++, path++) {
if (path->p_idx) {
ext_debug(inode, " %d->%llu",
le32_to_cpu(path->p_idx->ei_block),
ext4_idx_pblock(path->p_idx));
} else if (path->p_ext) {
ext_debug(inode, " %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext4_ext_pblock(path->p_ext));
} else
ext_debug(inode, " []");
}
ext_debug(inode, "\n");
}
static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
int depth = ext_depth(inode);
struct ext4_extent_header *eh;
struct ext4_extent *ex;
int i;
if (!path)
return;
eh = path[depth].p_hdr;
ex = EXT_FIRST_EXTENT(eh);
ext_debug(inode, "Displaying leaf extents\n");
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
}
ext_debug(inode, "\n");
}
static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
ext4_fsblk_t newblock, int level)
{
int depth = ext_depth(inode);
struct ext4_extent *ex;
if (depth != level) {
struct ext4_extent_idx *idx;
idx = path[level].p_idx;
while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
level, le32_to_cpu(idx->ei_block),
ext4_idx_pblock(idx), newblock);
idx++;
}
return;
}
ex = path[depth].p_ext;
while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(ex->ee_block),
ext4_ext_pblock(ex),
ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
newblock);
ex++;
}
}
#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif
void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
int depth, i;
if (!path)
return;
depth = path->p_depth;
for (i = 0; i <= depth; i++, path++) {
if (path->p_bh) {
brelse(path->p_bh);
path->p_bh = NULL;
}
}
}
/*
* ext4_ext_binsearch_idx:
* binary search for the closest index of the given block
* the header must be checked before calling this
*/
static void
ext4_ext_binsearch_idx(struct inode *inode,
struct ext4_ext_path *path, ext4_lblk_t block)
{
struct ext4_extent_header *eh = path->p_hdr;
struct ext4_extent_idx *r, *l, *m;
ext_debug(inode, "binsearch for %u(idx): ", block);
l = EXT_FIRST_INDEX(eh) + 1;
r = EXT_LAST_INDEX(eh);
while (l <= r) {
m = l + (r - l) / 2;
if (block < le32_to_cpu(m->ei_block))
r = m - 1;
else
l = m + 1;
ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
r, le32_to_cpu(r->ei_block));
}
path->p_idx = l - 1;
ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
ext4_idx_pblock(path->p_idx));
#ifdef CHECK_BINSEARCH
{
struct ext4_extent_idx *chix, *ix;
int k;
chix = ix = EXT_FIRST_INDEX(eh);
for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
if (k != 0 && le32_to_cpu(ix->ei_block) <=
le32_to_cpu(ix[-1].ei_block)) {
printk(KERN_DEBUG "k=%d, ix=0x%p, "
"first=0x%p\n", k,
ix, EXT_FIRST_INDEX(eh));
printk(KERN_DEBUG "%u <= %u\n",
le32_to_cpu(ix->ei_block),
le32_to_cpu(ix[-1].ei_block));
}
BUG_ON(k && le32_to_cpu(ix->ei_block)
<= le32_to_cpu(ix[-1].ei_block));
if (block < le32_to_cpu(ix->ei_block))
break;
chix = ix;
}
BUG_ON(chix != path->p_idx);
}
#endif
}
/*
* ext4_ext_binsearch:
* binary search for closest extent of the given block
* the header must be checked before calling this
*/
static void
ext4_ext_binsearch(struct inode *inode,
struct ext4_ext_path *path, ext4_lblk_t block)
{
struct ext4_extent_header *eh = path->p_hdr;
struct ext4_extent *r, *l, *m;
if (eh->eh_entries == 0) {
/*
* this leaf is empty:
* we get such a leaf in split/add case
*/
return;
}
ext_debug(inode, "binsearch for %u: ", block);
l = EXT_FIRST_EXTENT(eh) + 1;
r = EXT_LAST_EXTENT(eh);
while (l <= r) {
m = l + (r - l) / 2;
if (block < le32_to_cpu(m->ee_block))
r = m - 1;
else
l = m + 1;
ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
r, le32_to_cpu(r->ee_block));
}
path->p_ext = l - 1;
ext_debug(inode, " -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_pblock(path->p_ext),
ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
{
struct ext4_extent *chex, *ex;
int k;
chex = ex = EXT_FIRST_EXTENT(eh);
for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
BUG_ON(k && le32_to_cpu(ex->ee_block)
<= le32_to_cpu(ex[-1].ee_block));
if (block < le32_to_cpu(ex->ee_block))
break;
chex = ex;
}
BUG_ON(chex != path->p_ext);
}
#endif
}
void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
struct ext4_extent_header *eh;
eh = ext_inode_hdr(inode);
eh->eh_depth = 0;
eh->eh_entries = 0;
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
}
struct ext4_ext_path *
ext4_find_extent(struct inode *inode, ext4_lblk_t block,
struct ext4_ext_path **orig_path, int flags)
{
struct ext4_extent_header *eh;
struct buffer_head *bh;
struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
short int depth, i, ppos = 0;
int ret;
gfp_t gfp_flags = GFP_NOFS;
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
depth);
ret = -EFSCORRUPTED;
goto err;
}
if (path) {
ext4_ext_drop_refs(path);
if (depth > path[0].p_maxdepth) {
kfree(path);
*orig_path = path = NULL;
}
}
if (!path) {
/* account possible depth increase */
path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
gfp_flags);
if (unlikely(!path))
return ERR_PTR(-ENOMEM);
path[0].p_maxdepth = depth + 1;
}
path[0].p_hdr = eh;
path[0].p_bh = NULL;
i = depth;
if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
ext4_cache_extents(inode, eh);
/* walk through the tree */
while (i) {
ext_debug(inode, "depth %d: num %d, max %d\n",
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
ext4_ext_binsearch_idx(inode, path + ppos, block);
path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
flags);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
}
eh = ext_block_hdr(bh);
ppos++;
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
}
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
/* if not an empty leaf */
if (path[ppos].p_ext)
path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
return path;
err:
ext4_ext_drop_refs(path);
kfree(path);
if (orig_path)
*orig_path = NULL;
return ERR_PTR(ret);
}
/*
* ext4_ext_insert_index:
* insert new index [@logical;@ptr] into the block at @curp;
* check where to insert: before @curp or after @curp
*/
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
struct ext4_ext_path *curp,
int logical, ext4_fsblk_t ptr)
{
struct ext4_extent_idx *ix;
int len, err;
err = ext4_ext_get_access(handle, inode, curp);
if (err)
return err;
if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
EXT4_ERROR_INODE(inode,
"logical %d == ei_block %d!",
logical, le32_to_cpu(curp->p_idx->ei_block));
return -EFSCORRUPTED;
}
if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
>= le16_to_cpu(curp->p_hdr->eh_max))) {
EXT4_ERROR_INODE(inode,
"eh_entries %d >= eh_max %d!",
le16_to_cpu(curp->p_hdr->eh_entries),
le16_to_cpu(curp->p_hdr->eh_max));
return -EFSCORRUPTED;
}
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
/* insert after */
ext_debug(inode, "insert new index %d after: %llu\n",
logical, ptr);
ix = curp->p_idx + 1;
} else {
/* insert before */
ext_debug(inode, "insert new index %d before: %llu\n",
logical, ptr);
ix = curp->p_idx;
}
len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
BUG_ON(len < 0);
if (len > 0) {
ext_debug(inode, "insert new index %d: "
"move %d indices from 0x%p to 0x%p\n",
logical, len, ix, ix + 1);
memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
}
if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
return -EFSCORRUPTED;
}
ix->ei_block = cpu_to_le32(logical);
ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1);
if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
return -EFSCORRUPTED;
}
err = ext4_ext_dirty(handle, inode, curp);
ext4_std_error(inode->i_sb, err);
return err;
}
/*
* ext4_ext_split:
* inserts new subtree into the path, using free index entry
* at depth @at:
* - allocates all needed blocks (new leaf and all intermediate index blocks)
* - makes decision where to split
* - moves remaining extents and index entries (right to the split point)
* into the newly allocated blocks
* - initializes subtree
*/
static int ext4_ext_split(handle_t *handle, struct inode *inode,
unsigned int flags,
struct ext4_ext_path *path,
struct ext4_extent *newext, int at)
{
struct buffer_head *bh = NULL;
int depth = ext_depth(inode);
struct ext4_extent_header *neh;
struct ext4_extent_idx *fidx;
int i = at, k, m, a;
ext4_fsblk_t newblock, oldblock;
__le32 border;
ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
gfp_t gfp_flags = GFP_NOFS;
int err = 0;
size_t ext_size = 0;
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
/* make decision: where to split? */
/* FIXME: now decision is simplest: at current extent */
/* if current leaf will be split, then we should use
* border from split point */
if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
return -EFSCORRUPTED;
}
if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
border = path[depth].p_ext[1].ee_block;
ext_debug(inode, "leaf will be split."
" next leaf starts at %d\n",
le32_to_cpu(border));
} else {
border = newext->ee_block;
ext_debug(inode, "leaf will be added."
" next leaf starts at %d\n",
le32_to_cpu(border));
}
/*
* If error occurs, then we break processing
* and mark filesystem read-only. index won't
* be inserted and tree will be in consistent
* state. Next mount will repair buffers too.
*/
/*
* Get array to track all allocated blocks.
* We need this to handle errors and free blocks
* upon them.
*/
ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
if (!ablocks)
return -ENOMEM;
/* allocate all needed blocks */
ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
newblock = ext4_ext_new_meta_block(handle, inode, path,
newext, &err, flags);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
}
/* initialize new leaf */
newblock = ablocks[--a];
if (unlikely(newblock == 0)) {
EXT4_ERROR_INODE(inode, "newblock == 0!");
err = -EFSCORRUPTED;
goto cleanup;
}
bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
if (unlikely(!bh)) {
err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
if (err)
goto cleanup;
neh = ext_block_hdr(bh);
neh->eh_entries = 0;
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
/* move remainder of path[depth] to the new leaf */
if (unlikely(path[depth].p_hdr->eh_entries !=
path[depth].p_hdr->eh_max)) {
EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
path[depth].p_hdr->eh_entries,
path[depth].p_hdr->eh_max);
err = -EFSCORRUPTED;
goto cleanup;
}
/* start copy from next extent */
m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
ext4_ext_show_move(inode, path, newblock, depth);
if (m) {
struct ext4_extent *ex;
ex = EXT_FIRST_EXTENT(neh);
memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
le16_add_cpu(&neh->eh_entries, m);
}
/* zero out unused area in the extent block */
ext_size = sizeof(struct ext4_extent_header) +
sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto cleanup;
brelse(bh);
bh = NULL;
/* correct old leaf */
if (m) {
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto cleanup;
le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
err = ext4_ext_dirty(handle, inode, path + depth);
if (err)
goto cleanup;
}
/* create intermediate indexes */
k = depth - at - 1;
if (unlikely(k < 0)) {
EXT4_ERROR_INODE(inode, "k %d < 0!", k);
err = -EFSCORRUPTED;
goto cleanup;
}
if (k)
ext_debug(inode, "create %d intermediate indices\n", k);
/* insert new index into current index block */
/* current depth stored in i var */
i = depth - 1;
while (k--) {
oldblock = newblock;
newblock = ablocks[--a];
bh = sb_getblk(inode->i_sb, newblock);
if (unlikely(!bh)) {
err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
if (err)
goto cleanup;
neh = ext_block_hdr(bh);
neh->eh_entries = cpu_to_le16(1);
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
neh->eh_depth = cpu_to_le16(depth - i);
fidx = EXT_FIRST_INDEX(neh);
fidx->ei_block = border;
ext4_idx_store_pblock(fidx, oldblock);
ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
i, newblock, le32_to_cpu(border), oldblock);
/* move remainder of path[i] to the new index block */
if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
EXT_LAST_INDEX(path[i].p_hdr))) {
EXT4_ERROR_INODE(inode,
"EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
le32_to_cpu(path[i].p_ext->ee_block));
err = -EFSCORRUPTED;
goto cleanup;
}
/* start copy indexes */
m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
EXT_MAX_INDEX(path[i].p_hdr));
ext4_ext_show_move(inode, path, newblock, i);
if (m) {
memmove(++fidx, path[i].p_idx,
sizeof(struct ext4_extent_idx) * m);
le16_add_cpu(&neh->eh_entries, m);
}
/* zero out unused area in the extent block */
ext_size = sizeof(struct ext4_extent_header) +
(sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
memset(bh->b_data + ext_size, 0,
inode->i_sb->s_blocksize - ext_size);
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto cleanup;
brelse(bh);
bh = NULL;
/* correct old index */
if (m) {
err = ext4_ext_get_access(handle, inode, path + i);
if (err)
goto cleanup;
le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
err = ext4_ext_dirty(handle, inode, path + i);
if (err)
goto cleanup;
}
i--;
}
/* insert new index */
err = ext4_ext_insert_index(handle, inode, path + at,
le32_to_cpu(border), newblock);
cleanup:
if (bh) {
if (buffer_locked(bh))
unlock_buffer(bh);
brelse(bh);
}
if (err) {
/* free all allocated blocks in error case */
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
EXT4_FREE_BLOCKS_METADATA);
}
}
kfree(ablocks);
return err;
}
/*
* ext4_ext_grow_indepth:
* implements tree growing procedure:
* - allocates new block
* - moves top-level data (index block or leaf) into the new block
* - initializes new top-level, creating index that points to the
* just created block
*/
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
unsigned int flags)
{
struct ext4_extent_header *neh;
struct buffer_head *bh;
ext4_fsblk_t newblock, goal = 0;
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
int err = 0;
size_t ext_size = 0;
/* Try to prepend new index to old one */
if (ext_depth(inode))
goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
if (goal > le32_to_cpu(es->s_first_data_block)) {
flags |= EXT4_MB_HINT_TRY_GOAL;
goal--;
} else
goal = ext4_inode_to_goal_block(inode);
newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
NULL, &err);
if (newblock == 0)
return err;
bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
if (unlikely(!bh))
return -ENOMEM;
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
if (err) {
unlock_buffer(bh);
goto out;
}
ext_size = sizeof(EXT4_I(inode)->i_data);
/* move top-level index/leaf into new block */
memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
/* zero out unused area in the extent block */
memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
/* set size of new block */
neh = ext_block_hdr(bh);
/* old root could have indexes or leaves
* so calculate e_max right way */
if (ext_depth(inode))
neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
else
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
ext4_extent_block_csum_set(inode, neh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, inode, bh);
if (err)
goto out;
/* Update top-level index: num,max,pointer */
neh = ext_inode_hdr(inode);
neh->eh_entries = cpu_to_le16(1);
ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
if (neh->eh_depth == 0) {
/* Root extent block becomes index block */
neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
EXT_FIRST_INDEX(neh)->ei_block =
EXT_FIRST_EXTENT(neh)->ee_block;
}
ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
le16_add_cpu(&neh->eh_depth, 1);
err = ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
return err;
}
/*
* ext4_ext_create_new_leaf:
* finds empty index and adds new leaf.
* if no free index is found, then it requests in-depth growing.
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
unsigned int mb_flags,
unsigned int gb_flags,
struct ext4_ext_path **ppath,
struct ext4_extent *newext)
{
struct ext4_ext_path *path = *ppath;
struct ext4_ext_path *curp;
int depth, i, err = 0;
repeat:
i = depth = ext_depth(inode);
/* walk up to the tree and look for free index entry */
curp = path + depth;
while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
i--;
curp--;
}
/* we use already allocated block for index block,
* so subsequent data blocks should be contiguous */
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
if (err)
goto out;
/* refill path */
path = ext4_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
ppath, gb_flags);
if (IS_ERR(path))
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
err = ext4_ext_grow_indepth(handle, inode, mb_flags);
if (err)
goto out;
/* refill path */
path = ext4_find_extent(inode,
(ext4_lblk_t)le32_to_cpu(newext->ee_block),
ppath, gb_flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
}
/*
* only first (depth 0 -> 1) produces free space;
* in all other cases we have to split the grown tree
*/
depth = ext_depth(inode);
if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
/* now we need to split */
goto repeat;
}
}
out:
return err;
}
/*
* search the closest allocated block to the left for *logical
* and returns it at @logical + it's physical address at @phys
* if *logical is the smallest allocated block, the function
* returns 0 at @phys
* return value contains 0 (success) or error code
*/
static int ext4_ext_search_left(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
int depth, ee_len;
if (unlikely(path == NULL)) {
EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
return -EFSCORRUPTED;
}
depth = path->p_depth;
*phys = 0;
if (depth == 0 && path->p_ext == NULL)
return 0;
/* usually extent in the path covers blocks smaller
* then *logical, but it can be that extent is the
* first one in the file */
ex = path[depth].p_ext;
ee_len = ext4_ext_get_actual_len(ex);
if (*logical < le32_to_cpu(ex->ee_block)) {
if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
EXT4_ERROR_INODE(inode,
"EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
*logical, le32_to_cpu(ex->ee_block));
return -EFSCORRUPTED;
}
while (--depth >= 0) {
ix = path[depth].p_idx;
if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
EXT4_ERROR_INODE(inode,
"ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!"