dm vdo: add slab structure, slab journal and reference counters

Most of the vdo volume belongs to the slab depot. The depot contains a collection of slabs. The slabs can be up to 32GB, and are divided into three sections. Most of a slab consists of a linear sequence of 4K blocks. These blocks are used either to store data, or to hold portions of the block map (see subsequent patches). In addition to the data blocks, each slab has a set of reference counters, using 1 byte for each data block. Finally each slab has a journal. Reference updates are written to the slab journal, which is written out one block at a time as each block fills. A copy of the reference counters is kept in memory, and are written out a block at a time, in oldest-dirtied-order whenever there is a need to reclaim slab journal space. The journal is used both to ensure that the main recovery journal (see subsequent patches) can regularly free up space, and also to amortize the cost of updating individual reference blocks. This patch adds the slab structure as well as the slab journal and reference counters. Co-developed-by: J. corwin Coburn <corwin@hurlbutnet.net> Signed-off-by: J. corwin Coburn <corwin@hurlbutnet.net> Co-developed-by: Michael Sclafani <dm-devel@lists.linux.dev> Signed-off-by: Michael Sclafani <dm-devel@lists.linux.dev> Co-developed-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me> Signed-off-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me> Signed-off-by: Matthew Sakai <msakai@redhat.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
author: Matthew Sakai <msakai@redhat.com> 2023-11-16 20:56:30 -0500
committer: Mike Snitzer <snitzer@kernel.org> 2024-02-20 13:43:15 -0500
commit: 09eff388df02ac0fff7d726ca2136d623d1db907 (patch)
tree: 001428cff6a97de69617c61bd5c61d7068f66555 /drivers/md
parent: 58a55a59160cc2f59168431859b6ae154b154246 (diff)
2 files changed, 2656 insertions, 0 deletions
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
new file mode 100644
index 000000000000..af3136f83e4f
--- /dev/null
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -0,0 +1,2404 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "slab-depot.h"
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+#include <linux/log2.h>
+#include <linux/min_heap.h>
+#include <linux/minmax.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "physical-zone.h"
+#include "priority-table.h"
+#include "recovery-journal.h"
+#include "repair.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+static const u64 BYTES_PER_WORD = sizeof(u64);
+static const bool NORMAL_OPERATION = true;
+
+/**
+ * get_lock() - Get the lock object for a slab journal block by sequence number.
+ * @journal: vdo_slab journal to retrieve from.
+ * @sequence_number: Sequence number of the block.
+ *
+ * Return: The lock object for the given sequence number.
+ */
+static inline struct journal_lock * __must_check get_lock(struct slab_journal *journal,
+							  sequence_number_t sequence_number)
+{
+	return &journal->locks[sequence_number % journal->size];
+}
+
+static bool is_slab_open(struct vdo_slab *slab)
+{
+	return (!vdo_is_state_quiescing(&slab->state) &&
+		!vdo_is_state_quiescent(&slab->state));
+}
+
+/**
+ * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush.
+ * @journal: The journal to check.
+ *
+ * Return: true if there are no entry waiters, or if the slab is unrecovered.
+ */
+static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal)
+{
+	return ((journal->slab->status != VDO_SLAB_REBUILDING) &&
+		vdo_has_waiters(&journal->entry_waiters));
+}
+
+/**
+ * is_reaping() - Check whether a reap is currently in progress.
+ * @journal: The journal which may be reaping.
+ *
+ * Return: true if the journal is reaping.
+ */
+static inline bool __must_check is_reaping(struct slab_journal *journal)
+{
+	return (journal->head != journal->unreapable);
+}
+
+/**
+ * initialize_tail_block() - Initialize tail block as a new block.
+ * @journal: The journal whose tail block is being initialized.
+ */
+static void initialize_tail_block(struct slab_journal *journal)
+{
+	struct slab_journal_block_header *header = &journal->tail_header;
+
+	header->sequence_number = journal->tail;
+	header->entry_count = 0;
+	header->has_block_map_increments = false;
+}
+
+/**
+ * initialize_journal_state() - Set all journal fields appropriately to start journaling.
+ * @journal: The journal to be reset, based on its tail sequence number.
+ */
+static void initialize_journal_state(struct slab_journal *journal)
+{
+	journal->unreapable = journal->head;
+	journal->reap_lock = get_lock(journal, journal->unreapable);
+	journal->next_commit = journal->tail;
+	journal->summarized = journal->last_summarized = journal->tail;
+	initialize_tail_block(journal);
+}
+
+/**
+ * block_is_full() - Check whether a journal block is full.
+ * @journal: The slab journal for the block.
+ *
+ * Return: true if the tail block is full.
+ */
+static bool __must_check block_is_full(struct slab_journal *journal)
+{
+	journal_entry_count_t count = journal->tail_header.entry_count;
+
+	return (journal->tail_header.has_block_map_increments ?
+		(journal->full_entries_per_block == count) :
+		(journal->entries_per_block == count));
+}
+
+static void add_entries(struct slab_journal *journal);
+static void update_tail_block_location(struct slab_journal *journal);
+static void release_journal_locks(struct waiter *waiter, void *context);
+
+/**
+ * is_slab_journal_blank() - Check whether a slab's journal is blank.
+ *
+ * A slab journal is blank if it has never had any entries recorded in it.
+ *
+ * Return: true if the slab's journal has never been modified.
+ */
+static bool is_slab_journal_blank(const struct vdo_slab *slab)
+{
+	return ((slab->journal.tail == 1) &&
+		(slab->journal.tail_header.entry_count == 0));
+}
+
+/**
+ * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
+ *                             order.
+ * @journal: The journal to be marked dirty.
+ * @lock: The recovery journal lock held by the slab journal.
+ */
+static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock)
+{
+	struct slab_journal *dirty_journal;
+	struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;
+
+	ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");
+
+	journal->recovery_lock = lock;
+	list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) {
+		if (dirty_journal->recovery_lock <= journal->recovery_lock)
+			break;
+	}
+
+	list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next);
+}
+
+static void mark_slab_journal_clean(struct slab_journal *journal)
+{
+	journal->recovery_lock = 0;
+	list_del_init(&journal->dirty_entry);
+}
+
+static void check_if_slab_drained(struct vdo_slab *slab)
+{
+	bool read_only;
+	struct slab_journal *journal = &slab->journal;
+	const struct admin_state_code *code;
+
+	if (!vdo_is_state_draining(&slab->state) ||
+	    must_make_entries_to_flush(journal) ||
+	    is_reaping(journal) ||
+	    journal->waiting_to_commit ||
+	    !list_empty(&journal->uncommitted_blocks) ||
+	    journal->updating_slab_summary ||
+	    (slab->active_count > 0))
+		return;
+
+	/* When not suspending or recovering, the slab must be clean. */
+	code = vdo_get_admin_state_code(&slab->state);
+	read_only = vdo_is_read_only(slab->allocator->depot->vdo);
+	if (!read_only &&
+	    vdo_has_waiters(&slab->dirty_blocks) &&
+	    (code != VDO_ADMIN_STATE_SUSPENDING) &&
+	    (code != VDO_ADMIN_STATE_RECOVERING))
+		return;
+
+	vdo_finish_draining_with_result(&slab->state,
+					(read_only ? VDO_READ_ONLY : VDO_SUCCESS));
+}
+
+/**
+ * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are
+ *                    complete.
+ * @journal: The journal to be reaped.
+ */
+static void finish_reaping(struct slab_journal *journal)
+{
+	journal->head = journal->unreapable;
+	add_entries(journal);
+	check_if_slab_drained(journal->slab);
+}
+
+static void reap_slab_journal(struct slab_journal *journal);
+
+/**
+ * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try
+ *                      reaping again in case we deferred reaping due to an outstanding vio.
+ * @completion: The flush vio.
+ */
+static void complete_reaping(struct vdo_completion *completion)
+{
+	struct slab_journal *journal = completion->parent;
+
+	return_vio_to_pool(journal->slab->allocator->vio_pool,
+			   vio_as_pooled_vio(as_vio(uds_forget(completion))));
+	finish_reaping(journal);
+	reap_slab_journal(journal);
+}
+
+/**
+ * handle_flush_error() - Handle an error flushing the lower layer.
+ * @completion: The flush vio.
+ */
+static void handle_flush_error(struct vdo_completion *completion)
+{
+	vio_record_metadata_io_error(as_vio(completion));
+	vdo_enter_read_only_mode(completion->vdo, completion->result);
+	complete_reaping(completion);
+}
+
+static void flush_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio, complete_reaping,
+			      journal->slab->allocator->thread_id);
+}
+
+/**
+ * flush_for_reaping() - A waiter callback for getting a vio with which to flush the lower layer
+ *                       prior to reaping.
+ * @waiter: The journal as a flush waiter.
+ * @context: The newly acquired flush vio.
+ */
+static void flush_for_reaping(struct waiter *waiter, void *context)
+{
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, flush_waiter);
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+
+	vio->completion.parent = journal;
+	submit_flush_vio(vio, flush_endio, handle_flush_error);
+}
+
+/**
+ * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks.
+ * @journal: The slab journal.
+ */
+static void reap_slab_journal(struct slab_journal *journal)
+{
+	bool reaped = false;
+
+	if (is_reaping(journal)) {
+		/* We already have a reap in progress so wait for it to finish. */
+		return;
+	}
+
+	if ((journal->slab->status != VDO_SLAB_REBUILT) ||
+	    !vdo_is_state_normal(&journal->slab->state) ||
+	    vdo_is_read_only(journal->slab->allocator->depot->vdo)) {
+		/*
+		 * We must not reap in the first two cases, and there's no point in read-only mode.
+		 */
+		return;
+	}
+
+	/*
+	 * Start reclaiming blocks only when the journal head has no references. Then stop when a
+	 * block is referenced or reap reaches the most recently written block, referenced by the
+	 * slab summary, which has the sequence number just before the tail.
+	 */
+	while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) {
+		reaped = true;
+		journal->unreapable++;
+		journal->reap_lock++;
+		if (journal->reap_lock == &journal->locks[journal->size])
+			journal->reap_lock = &journal->locks[0];
+	}
+
+	if (!reaped)
+		return;
+
+	/*
+	 * It is never safe to reap a slab journal block without first issuing a flush, regardless
+	 * of whether a user flush has been received or not. In the absence of the flush, the
+	 * reference block write which released the locks allowing the slab journal to reap may not
+	 * be persisted. Although slab summary writes will eventually issue flushes, multiple slab
+	 * journal block writes can be issued while previous slab summary updates have not yet been
+	 * made. Even though those slab journal block writes will be ignored if the slab summary
+	 * update is not persisted, they may still overwrite the to-be-reaped slab journal block
+	 * resulting in a loss of reference count updates (VDO-2912).
+	 */
+	journal->flush_waiter.callback = flush_for_reaping;
+	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
+			      &journal->flush_waiter);
+}
+
+/**
+ * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block.
+ * @journal: The slab journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ * @adjustment: Amount to adjust the reference counter.
+ *
+ * Note that when the adjustment is negative, the slab journal will be reaped.
+ */
+static void adjust_slab_journal_block_reference(struct slab_journal *journal,
+						sequence_number_t sequence_number,
+						int adjustment)
+{
+	struct journal_lock *lock;
+
+	if (sequence_number == 0)
+		return;
+
+	if (journal->slab->status == VDO_SLAB_REPLAYING) {
+		/* Locks should not be used during offline replay. */
+		return;
+	}
+
+	ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
+	lock = get_lock(journal, sequence_number);
+	if (adjustment < 0) {
+		ASSERT_LOG_ONLY((-adjustment <= lock->count),
+				"adjustment %d of lock count %u for slab journal block %llu must not underflow",
+				adjustment, lock->count,
+				(unsigned long long) sequence_number);
+	}
+
+	lock->count += adjustment;
+	if (lock->count == 0)
+		reap_slab_journal(journal);
+}
+
+/**
+ * release_journal_locks() - Callback invoked after a slab summary update completes.
+ * @waiter: The slab summary waiter that has just been notified.
+ * @context: The result code of the update.
+ *
+ * Registered in the constructor on behalf of update_tail_block_location().
+ *
+ * Implements waiter_callback_fn.
+ */
+static void release_journal_locks(struct waiter *waiter, void *context)
+{
+	sequence_number_t first, i;
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, slab_summary_waiter);
+	int result = *((int *) context);
+
+	if (result != VDO_SUCCESS) {
+		if (result != VDO_READ_ONLY) {
+			/*
+			 * Don't bother logging what might be lots of errors if we are already in
+			 * read-only mode.
+			 */
+			uds_log_error_strerror(result, "failed slab summary update %llu",
+					       (unsigned long long) journal->summarized);
+		}
+
+		journal->updating_slab_summary = false;
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		check_if_slab_drained(journal->slab);
+		return;
+	}
+
+	if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) {
+		journal->partial_write_in_progress = false;
+		add_entries(journal);
+	}
+
+	first = journal->last_summarized;
+	journal->last_summarized = journal->summarized;
+	for (i = journal->summarized - 1; i >= first; i--) {
+		/*
+		 * Release the lock the summarized block held on the recovery journal. (During
+		 * replay, recovery_start will always be 0.)
+		 */
+		if (journal->recovery_journal != NULL) {
+			zone_count_t zone_number = journal->slab->allocator->zone_number;
+			struct journal_lock *lock = get_lock(journal, i);
+
+			vdo_release_recovery_journal_block_reference(journal->recovery_journal,
+								     lock->recovery_start,
+								     VDO_ZONE_TYPE_PHYSICAL,
+								     zone_number);
+		}
+
+		/*
+		 * Release our own lock against reaping for blocks that are committed. (This
+		 * function will not change locks during replay.)
+		 */
+		adjust_slab_journal_block_reference(journal, i, -1);
+	}
+
+	journal->updating_slab_summary = false;
+
+	reap_slab_journal(journal);
+
+	/* Check if the slab summary needs to be updated again. */
+	update_tail_block_location(journal);
+}
+
+/**
+ * update_tail_block_location() - Update the tail block location in the slab summary, if necessary.
+ * @journal: The slab journal that is updating its tail block location.
+ */
+static void update_tail_block_location(struct slab_journal *journal)
+{
+	block_count_t free_block_count;
+	struct vdo_slab *slab = journal->slab;
+
+	if (journal->updating_slab_summary ||
+	    vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
+	    (journal->last_summarized >= journal->next_commit)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	if (slab->status != VDO_SLAB_REBUILT) {
+		u8 hint = slab->allocator->summary_entries[slab->slab_number].fullness_hint;
+
+		free_block_count = ((block_count_t) hint) << slab->allocator->depot->hint_shift;
+	} else {
+		free_block_count = slab->free_blocks;
+	}
+
+	journal->summarized = journal->next_commit;
+	journal->updating_slab_summary = true;
+
+	/*
+	 * Update slab summary as dirty.
+	 * vdo_slab journal can only reap past sequence number 1 when all the ref counts for this
+	 * slab have been written to the layer. Therefore, indicate that the ref counts must be
+	 * loaded when the journal head has reaped past sequence number 1.
+	 */
+	update_slab_summary_entry(slab, &journal->slab_summary_waiter,
+				  journal->summarized % journal->size,
+				  (journal->head > 1), false, free_block_count);
+}
+
+/**
+ * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries.
+ */
+static void reopen_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	sequence_number_t block;
+
+	ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
+			"vdo_slab journal's active block empty before reopening");
+	journal->head = journal->tail;
+	initialize_journal_state(journal);
+
+	/* Ensure no locks are spuriously held on an empty journal. */
+	for (block = 1; block <= journal->size; block++) {
+		ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
+				"Scrubbed journal's block %llu is not locked",
+				(unsigned long long) block);
+	}
+
+	add_entries(journal);
+}
+
+static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio)
+{
+	const struct packed_slab_journal_block *block =
+		(const struct packed_slab_journal_block *) vio->vio.data;
+
+	return __le64_to_cpu(block->header.sequence_number);
+}
+
+/**
+ * complete_write() - Handle post-commit processing.
+ * @completion: The write vio as a completion.
+ *
+ * This is the callback registered by write_slab_journal_block().
+ */
+static void complete_write(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion));
+	struct slab_journal *journal = completion->parent;
+	sequence_number_t committed = get_committing_sequence_number(pooled);
+
+	list_del_init(&pooled->list_entry);
+	return_vio_to_pool(journal->slab->allocator->vio_pool, uds_forget(pooled));
+
+	if (result != VDO_SUCCESS) {
+		vio_record_metadata_io_error(as_vio(completion));
+		uds_log_error_strerror(result, "cannot write slab journal block %llu",
+				       (unsigned long long) committed);
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		check_if_slab_drained(journal->slab);
+		return;
+	}
+
+	WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1);
+
+	if (list_empty(&journal->uncommitted_blocks)) {
+		/* If no blocks are outstanding, then the commit point is at the tail. */
+		journal->next_commit = journal->tail;
+	} else {
+		/* The commit point is always the beginning of the oldest incomplete block. */
+		pooled = container_of(journal->uncommitted_blocks.next,
+				      struct pooled_vio, list_entry);
+		journal->next_commit = get_committing_sequence_number(pooled);
+	}
+
+	update_tail_block_location(journal);
+}
+
+static void write_slab_journal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio, complete_write, journal->slab->allocator->thread_id);
+}
+
+/**
+ * write_slab_journal_block() - Write a slab journal block.
+ * @waiter: The vio pool waiter which was just notified.
+ * @context: The vio pool entry for the write.
+ *
+ * Callback from acquire_vio_from_pool() registered in commit_tail().
+ */
+static void write_slab_journal_block(struct waiter *waiter, void *context)
+{
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, resource_waiter);
+	struct slab_journal_block_header *header = &journal->tail_header;
+	int unused_entries = journal->entries_per_block - header->entry_count;
+	physical_block_number_t block_number;
+	const struct admin_state_code *operation;
+
+	header->head = journal->head;
+	list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks);
+	vdo_pack_slab_journal_block_header(header, &journal->block->header);
+
+	/* Copy the tail block into the vio. */
+	memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE);
+
+	ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull");
+	if (unused_entries > 0) {
+		/*
+		 * Release the per-entry locks for any unused entries in the block we are about to
+		 * write.
+		 */
+		adjust_slab_journal_block_reference(journal, header->sequence_number,
+						    -unused_entries);
+		journal->partial_write_in_progress = !block_is_full(journal);
+	}
+
+	block_number = journal->slab->journal_origin +
+		(header->sequence_number % journal->size);
+	vio->completion.parent = journal;
+
+	/*
+	 * This block won't be read in recovery until the slab summary is updated to refer to it.
+	 * The slab summary update does a flush which is sufficient to protect us from VDO-2331.
+	 */
+	submit_metadata_vio(uds_forget(vio), block_number, write_slab_journal_endio,
+			    complete_write, REQ_OP_WRITE);
+
+	/* Since the write is submitted, the tail block structure can be reused. */
+	journal->tail++;
+	initialize_tail_block(journal);
+	journal->waiting_to_commit = false;
+
+	operation = vdo_get_admin_state_code(&journal->slab->state);
+	if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) {
+		vdo_finish_operation(&journal->slab->state,
+				     (vdo_is_read_only(journal->slab->allocator->depot->vdo) ?
+				      VDO_READ_ONLY : VDO_SUCCESS));
+		return;
+	}
+
+	add_entries(journal);
+}
+
+/**
+ * commit_tail() - Commit the tail block of the slab journal.
+ * @journal: The journal whose tail block should be committed.
+ */
+static void commit_tail(struct slab_journal *journal)
+{
+	if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) {
+		/*
+		 * There are no entries at the moment, but there are some waiters, so defer
+		 * initiating the flush until those entries are ready to write.
+		 */
+		return;
+	}
+
+	if (vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
+	    journal->waiting_to_commit ||
+	    (journal->tail_header.entry_count == 0)) {
+		/*
+		 * There is nothing to do since the tail block is empty, or writing, or the journal
+		 * is in read-only mode.
+		 */
+		return;
+	}
+
+	/*
+	 * Since we are about to commit the tail block, this journal no longer needs to be on the
+	 * ring of journals which the recovery journal might ask to commit.
+	 */
+	mark_slab_journal_clean(journal);
+
+	journal->waiting_to_commit = true;
+
+	journal->resource_waiter.callback = write_slab_journal_block;
+	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
+			      &journal->resource_waiter);
+}
+
+/**
+ * encode_slab_journal_entry() - Encode a slab journal entry.
+ * @tail_header: The unpacked header for the block.
+ * @payload: The journal block payload to hold the entry.
+ * @sbn: The slab block number of the entry to encode.
+ * @operation: The type of the entry.
+ * @increment: True if this is an increment.
+ *
+ * Exposed for unit tests.
+ */
+static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header,
+				      slab_journal_payload *payload,
+				      slab_block_number sbn,
+				      enum journal_operation operation,
+				      bool increment)
+{
+	journal_entry_count_t entry_number = tail_header->entry_count++;
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		if (!tail_header->has_block_map_increments) {
+			memset(payload->full_entries.entry_types, 0,
+			       VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE);
+			tail_header->has_block_map_increments = true;
+		}
+
+		payload->full_entries.entry_types[entry_number / 8] |=
+			((u8)1 << (entry_number % 8));
+	}
+
+	vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, increment);
+}
+
+/**
+ * expand_journal_point() - Convert a recovery journal journal_point which refers to both an
+ *                          increment and a decrement to a single point which refers to one or the
+ *                          other.
+ * @recovery_point: The journal point to convert.
+ * @increment: Whether the current entry is an increment.
+ *
+ * Return: The expanded journal point
+ *
+ * Because each data_vio has but a single recovery journal point, but may need to make both
+ * increment and decrement entries in the same slab journal. In order to distinguish the two
+ * entries, the entry count of the expanded journal point is twice the actual recovery journal
+ * entry count for increments, and one more than that for decrements.
+ */
+static struct journal_point expand_journal_point(struct journal_point recovery_point,
+						 bool increment)
+{
+	recovery_point.entry_count *= 2;
+	if (!increment)
+		recovery_point.entry_count++;
+
+	return recovery_point;
+}
+
+/**
+ * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a
+ *               block becomes full.
+ * @journal: The slab journal to append to.
+ * @pbn: The pbn being adjusted.
+ * @operation: The type of entry to make.
+ * @increment: True if this is an increment.
+ * @recovery_point: The expanded recovery point.
+ *
+ * This function is synchronous.
+ */
+static void add_entry(struct slab_journal *journal, physical_block_number_t pbn,
+		      enum journal_operation operation, bool increment,
+		      struct journal_point recovery_point)
+{
+	struct packed_slab_journal_block *block = journal->block;
+	int result;
+
+	result = ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point,
+						 &recovery_point),
+			"recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u",
+			(unsigned long long) recovery_point.sequence_number,
+			recovery_point.entry_count,
+			(unsigned long long) journal->tail_header.recovery_point.sequence_number,
+			journal->tail_header.recovery_point.entry_count);
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		return;
+	}
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		result = ASSERT((journal->tail_header.entry_count <
+				 journal->full_entries_per_block),
+				"block has room for full entries");
+		if (result != VDO_SUCCESS) {
+			vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo,
+						 result);
+			return;
+		}
+	}
+
+	encode_slab_journal_entry(&journal->tail_header, &block->payload,
+				  pbn - journal->slab->start, operation, increment);
+	journal->tail_header.recovery_point = recovery_point;
+	if (block_is_full(journal))
+		commit_tail(journal);
+}
+
+static inline block_count_t journal_length(const struct slab_journal *journal)
+{
+	return journal->tail - journal->head;
+}
+
+/**
+ * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal.
+ * @slab: The slab to play into.
+ * @pbn: The PBN for the entry.
+ * @operation: The type of entry to add.
+ * @increment: True if this entry is an increment.
+ * @recovery_point: The recovery journal point corresponding to this entry.
+ * @parent: The completion to notify when there is space to add the entry if the entry could not be
+ *          added immediately.
+ *
+ * Return: true if the entry was added immediately.
+ */
+bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn,
+				  enum journal_operation operation, bool increment,
+				  struct journal_point *recovery_point,
+				  struct vdo_completion *parent)
+{
+	struct slab_journal *journal = &slab->journal;
+	struct slab_journal_block_header *header = &journal->tail_header;
+	struct journal_point expanded = expand_journal_point(*recovery_point, increment);
+
+	/* Only accept entries after the current recovery point. */
+	if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded))
+		return true;
+
+	if ((header->entry_count >= journal->full_entries_per_block) &&
+	    (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) {
+		/*
+		 * The tail block does not have room for the entry we are attempting to add so
+		 * commit the tail block now.
+		 */
+		commit_tail(journal);
+	}
+
+	if (journal->waiting_to_commit) {
+		vdo_start_operation_with_waiter(&journal->slab->state,
+						VDO_ADMIN_STATE_WAITING_FOR_RECOVERY,
+						parent, NULL);
+		return false;
+	}
+
+	if (journal_length(journal) >= journal->size) {
+		/*
+		 * We must have reaped the current head before the crash, since the blocked
+		 * threshold keeps us from having more entries than fit in a slab journal; hence we
+		 * can just advance the head (and unreapable block), as needed.
+		 */
+		journal->head++;
+		journal->unreapable++;
+	}
+
+	if (journal->slab->status == VDO_SLAB_REBUILT)
+		journal->slab->status = VDO_SLAB_REPLAYING;
+
+	add_entry(journal, pbn, operation, increment, expanded);
+	return true;
+}
+
+/**
+ * requires_reaping() - Check whether the journal must be reaped before adding new entries.
+ * @journal: The journal to check.
+ *
+ * Return: true if the journal must be reaped.
+ */
+static bool requires_reaping(const struct slab_journal *journal)
+{
+	return (journal_length(journal) >= journal->blocking_threshold);
+}
+
+/** finish_summary_update() - A waiter callback that resets the writing state of a slab. */
+static void finish_summary_update(struct waiter *waiter, void *context)
+{
+	struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter);
+	int result = *((int *) context);
+
+	slab->active_count--;
+
+	if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
+		uds_log_error_strerror(result, "failed to update slab summary");
+		vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
+	}
+
+	check_if_slab_drained(slab);
+}
+
+static void write_reference_block(struct waiter *waiter, void *context);
+
+/**
+ * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring
+ *                                  a VIO for it from the pool.
+ * @waiter: The waiter of the block which is starting to write.
+ * @context: The parent slab of the block.
+ *
+ * This can be asynchronous since the writer will have to wait if all VIOs in the pool are
+ * currently in use.
+ */
+static void launch_reference_block_write(struct waiter *waiter, void *context)
+{
+	struct vdo_slab *slab = context;
+
+	if (vdo_is_read_only(slab->allocator->depot->vdo))
+		return;
+
+	slab->active_count++;
+	container_of(waiter, struct reference_block, waiter)->is_writing = true;
+	waiter->callback = write_reference_block;
+	acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+}
+
+static void save_dirty_reference_blocks(struct vdo_slab *slab)
+{
+	vdo_notify_all_waiters(&slab->dirty_blocks, launch_reference_block_write, slab);
+	check_if_slab_drained(slab);
+}
+
+/**
+ * finish_reference_block_write() - After a reference block has written, clean it, release its
+ *                                  locks, and return its VIO to the pool.
+ * @completion: The VIO that just finished writing.
+ */
+static void finish_reference_block_write(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+	struct reference_block *block = completion->parent;
+	struct vdo_slab *slab = block->slab;
+	tail_block_offset_t offset;
+
+	slab->active_count--;
+
+	/* Release the slab journal lock. */
+	adjust_slab_journal_block_reference(&slab->journal,
+					    block->slab_journal_lock_to_release, -1);
+	return_vio_to_pool(slab->allocator->vio_pool, pooled);
+
+	/*
+	 * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
+	 * us to be dirtied again, but we don't want to double enqueue.
+	 */
+	block->is_writing = false;
+
+	if (vdo_is_read_only(completion->vdo)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	/* Re-queue the block if it was re-dirtied while it was writing. */
+	if (block->is_dirty) {
+		vdo_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
+		if (vdo_is_state_draining(&slab->state)) {
+			/* We must be saving, and this block will otherwise not be relaunched. */
+			save_dirty_reference_blocks(slab);
+		}
+
+		return;
+	}
+
+	/*
+	 * Mark the slab as clean in the slab summary if there are no dirty or writing blocks
+	 * and no summary update in progress.
+	 */
+	if ((slab->active_count > 0) || vdo_has_waiters(&slab->dirty_blocks)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+	slab->active_count++;
+	slab->summary_waiter.callback = finish_summary_update;
+	update_slab_summary_entry(slab, &slab->summary_waiter, offset,
+				  true, true, slab->free_blocks);
+}
+
+/**
+ * get_reference_counters_for_block() - Find the reference counters for a given block.
+ * @block: The reference_block in question.
+ *
+ * Return: A pointer to the reference counters for this block.
+ */
+static vdo_refcount_t * __must_check get_reference_counters_for_block(struct reference_block *block)
+{
+	size_t block_index = block - block->slab->reference_blocks;
+
+	return &block->slab->counters[block_index * COUNTS_PER_BLOCK];
+}
+
+/**
+ * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out.
+ * @block: The block to copy.
+ * @buffer: The char buffer to fill with the packed block.
+ */
+static void pack_reference_block(struct reference_block *block, void *buffer)
+{
+	struct packed_reference_block *packed = buffer;
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+	sector_count_t i;
+	struct packed_journal_point commit_point;
+
+	vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point);
+
+	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
+		packed->sectors[i].commit_point = commit_point;
+		memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR),
+		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
+	}
+}
+
+static void write_reference_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct reference_block *block = vio->completion.parent;
+	thread_id_t thread_id = block->slab->allocator->thread_id;
+
+	continue_vio_after_io(vio, finish_reference_block_write, thread_id);
+}
+
+/**
+ * handle_io_error() - Handle an I/O error reading or writing a reference count block.
+ * @completion: The VIO doing the I/O as a completion.
+ */
+static void handle_io_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+
author	Matthew Sakai <msakai@redhat.com>	2023-11-16 20:56:30 -0500
committer	Mike Snitzer <snitzer@kernel.org>	2024-02-20 13:43:15 -0500
commit	09eff388df02ac0fff7d726ca2136d623d1db907 (patch)
tree	001428cff6a97de69617c61bd5c61d7068f66555 /drivers/md
parent	58a55a59160cc2f59168431859b6ae154b154246 (diff)