diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-18 17:27:43 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-18 17:27:43 -0700 |
| commit | 720261cfc7329406a50c2a8536e0039b9dd9a4e5 (patch) | |
| tree | 660152d34a1e0fdfd747aae4b9c9485b826654e1 | |
| parent | 4f40c636b291deeae7d1f4c9fb5db5f0aac54267 (diff) | |
| parent | a97b43fac5b9b3ffca71b8a917a249789902fce9 (diff) | |
Merge tag 'bcachefs-2024-07-18.2' of https://evilpiepirate.org/git/bcachefs
Pull bcachefs updates from Kent Overstreet:
- Metadata version 1.8: Stripe sectors accounting, BCH_DATA_unstriped
This splits out the accounting of dirty sectors and stripe sectors in
alloc keys; this lets us see stripe buckets that still have unstriped
data in them.
This is needed for ensuring that erasure coding is working correctly,
as well as completing stripe creation after a crash.
- Metadata version 1.9: Disk accounting rewrite
The previous disk accounting scheme relied heavily on percpu counters
that were also sharded by outstanding journal buffer; it was fast but
not extensible or scalable, and meant that all accounting counters
were recorded in every journal entry.
The new disk accounting scheme stores accounting as normal btree
keys; updates are deltas until they are flushed by the btree write
buffer.
This means we have no practical limit on the number of counters, and
a new tagged union format that's easy to extend.
We now have counters for compression type/ratio, per-snapshot-id
usage, per-btree-id usage, and pending rebalance work.
- Self healing on read IO/checksum error
Data is now automatically rewritten if we get a read error and then a
successful retry
- Mount API conversion (thanks to Thomas Bertschinger)
- Better lockdep coverage
Previously, btree node locks were tracked individually by lockdep,
like any other lock. But we may take _many_ btree node locks
simultaneously, we easily blow through the limit of 48 locks that
lockdep can track, leading to lockdep turning itself off.
Tracking each btree node lock individually isn't really necessary
since we have our own cycle detector for deadlock avoidance and
centralized tracking of btree node locks, so we now have a single
lockdep_map in btree_trans for "any btree nodes are locked".
- Some more small incremental work towards online check_allocations
- Lots more debugging improvements
- Fixes, including:
- undefined behaviour fixes, originally noted as breaking userspace
LTO builds
- fix a spurious warning in fsck_err, reported by Marcin
- fix an integer overflow on trans->nr_updates, also reported by
Marcin; this broke during deletion of highly fragmented indirect
extents
* tag 'bcachefs-2024-07-18.2' of https://evilpiepirate.org/git/bcachefs: (120 commits)
lockdep: Add comments for lockdep_set_no{validate,track}_class()
bcachefs: Fix integer overflow on trans->nr_updates
bcachefs: silence silly kdoc warning
bcachefs: Fix fsck warning about btree_trans not passed to fsck error
bcachefs: Add an error message for insufficient rw journal devs
bcachefs: varint: Avoid left-shift of a negative value
bcachefs: darray: Don't pass NULL to memcpy()
bcachefs: Kill bch2_assert_btree_nodes_not_locked()
bcachefs: Rename BCH_WRITE_DONE -> BCH_WRITE_SUBMITTED
bcachefs: __bch2_read(): call trans_begin() on every loop iter
bcachefs: show none if label is not set
bcachefs: drop packed, aligned from bkey_inode_buf
bcachefs: btree node scan: fall back to comparing by journal seq
bcachefs: Add lockdep support for btree node locks
lockdep: lockdep_set_notrack_class()
bcachefs: Improve copygc_wait_to_text()
bcachefs: Convert clock code to u64s
bcachefs: Improve startup message
bcachefs: Self healing on read IO error
bcachefs: Make read_only a mount option again, but hidden
...
115 files changed, 3908 insertions, 2683 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index f66231d4f953..63af77fcdd32 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3720,7 +3720,6 @@ F: drivers/md/bcache/ BCACHEFS M: Kent Overstreet <kent.overstreet@linux.dev> -R: Brian Foster <bfoster@redhat.com> L: linux-bcachefs@vger.kernel.org S: Supported C: irc://irc.oftc.net/bcache diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 66ca0bbee639..0ab533a2b03b 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -29,10 +29,11 @@ bcachefs-y := \ clock.o \ compress.o \ darray.o \ + data_update.o \ debug.o \ dirent.o \ + disk_accounting.o \ disk_groups.o \ - data_update.o \ ec.o \ errcode.o \ error.o \ diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 250d6c6d3a3a..a7b425d3c8a0 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -346,7 +346,6 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl; @@ -354,6 +353,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, int ret; mutex_lock(&inode->ei_update_lock); + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); acl = _acl; @@ -394,8 +394,8 @@ btree_err: set_cached_acl(&inode->v, type, acl); err: - mutex_unlock(&inode->ei_update_lock); bch2_trans_put(trans); + mutex_unlock(&inode->ei_update_lock); return ret; } diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 658f11aebda1..d9c5a92fa708 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -15,6 +15,7 @@ #include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" +#include "disk_accounting.h" #include "ec.h" #include "error.h" #include "lru.h" @@ -268,27 +269,41 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, i == READ ? "read" : "write", a.v->io_time[i], LRU_TIME_MAX); + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > + offsetof(struct bch_alloc_v4, stripe_sectors) + ? a.v->stripe_sectors + : 0; + switch (a.v->data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: - bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe, + bkey_fsck_err_on(stripe_sectors || + a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe, c, err, alloc_key_empty_but_have_data, - "empty data type free but have data"); + "empty data type free but have data %u.%u.%u %u", + stripe_sectors, + a.v->dirty_sectors, + a.v->cached_sectors, + a.v->stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), + bkey_fsck_err_on(!a.v->dirty_sectors && + !stripe_sectors, c, err, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_type_str(a.v->data_type)); break; case BCH_DATA_cached: bkey_fsck_err_on(!a.v->cached_sectors || - bch2_bucket_sectors_dirty(*a.v) || + a.v->dirty_sectors || + stripe_sectors || a.v->stripe, c, err, alloc_key_cached_inconsistency, "data type inconsistency"); @@ -319,6 +334,7 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); a->fragmentation_lru = swab64(a->fragmentation_lru); + a->stripe_sectors = swab32(a->stripe_sectors); bps = alloc_v4_backpointers(a); for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { @@ -343,6 +359,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); prt_printf(out, "dirty_sectors %u\n", a->dirty_sec |
