aboutsummaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/damon.h18
-rw-r--r--include/linux/dax.h4
-rw-r--r--include/linux/folio_batch.h (renamed from include/linux/pagevec.h)16
-rw-r--r--include/linux/folio_queue.h8
-rw-r--r--include/linux/fs.h14
-rw-r--r--include/linux/huge_mm.h13
-rw-r--r--include/linux/hugetlb.h31
-rw-r--r--include/linux/hugetlb_inline.h4
-rw-r--r--include/linux/hyperv.h4
-rw-r--r--include/linux/iomap.h2
-rw-r--r--include/linux/kasan.h8
-rw-r--r--include/linux/kho/abi/kexec_handover.h144
-rw-r--r--include/linux/kho/abi/memfd.h18
-rw-r--r--include/linux/kho_radix_tree.h70
-rw-r--r--include/linux/ksm.h10
-rw-r--r--include/linux/leafops.h39
-rw-r--r--include/linux/maple_tree.h42
-rw-r--r--include/linux/memcontrol.h2
-rw-r--r--include/linux/memfd.h12
-rw-r--r--include/linux/memory-tiers.h2
-rw-r--r--include/linux/memory.h3
-rw-r--r--include/linux/memory_hotplug.h18
-rw-r--r--include/linux/mm.h716
-rw-r--r--include/linux/mm_inline.h16
-rw-r--r--include/linux/mm_types.h91
-rw-r--r--include/linux/mman.h49
-rw-r--r--include/linux/mmu_notifier.h130
-rw-r--r--include/linux/mmzone.h82
-rw-r--r--include/linux/page-flags.h163
-rw-r--r--include/linux/page_ref.h18
-rw-r--r--include/linux/page_reporting.h1
-rw-r--r--include/linux/pagewalk.h8
-rw-r--r--include/linux/pgtable.h139
-rw-r--r--include/linux/sunrpc/svc.h2
-rw-r--r--include/linux/swap.h30
-rw-r--r--include/linux/types.h2
-rw-r--r--include/linux/uio_driver.h4
-rw-r--r--include/linux/userfaultfd_k.h3
-rw-r--r--include/linux/vm_event_item.h13
-rw-r--r--include/linux/vmalloc.h3
-rw-r--r--include/linux/writeback.h2
41 files changed, 1311 insertions, 643 deletions
diff --git a/include/linux/damon.h b/include/linux/damon.h
index be3d198043ff..d9a3babbafc1 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -55,6 +55,8 @@ struct damon_size_range {
* @list: List head for siblings.
* @age: Age of this region.
*
+ * For any use case, @ar should have a non-zero (positive) size.
+ *
* @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be
* increased for every &damon_attrs->sample_interval if an access to the region
* during the last sampling interval is found. The update of this field should
@@ -214,11 +216,22 @@ struct damos_quota_goal {
};
/**
+ * enum damos_quota_goal_tuner - Goal-based quota tuning logic.
+ * @DAMOS_QUOTA_GOAL_TUNER_CONSIST: Aim for a long-term consistent quota.
+ * @DAMOS_QUOTA_GOAL_TUNER_TEMPORAL: Aim for zero quota as soon as possible.
+ */
+enum damos_quota_goal_tuner {
+ DAMOS_QUOTA_GOAL_TUNER_CONSIST,
+ DAMOS_QUOTA_GOAL_TUNER_TEMPORAL,
+};
+
+/**
* struct damos_quota - Controls the aggressiveness of the given scheme.
* @reset_interval: Charge reset interval in milliseconds.
* @ms: Maximum milliseconds that the scheme can use.
* @sz: Maximum bytes of memory that the action can be applied.
* @goals: Head of quota tuning goals (&damos_quota_goal) list.
+ * @goal_tuner: Goal-based @esz tuning algorithm to use.
* @esz: Effective size quota in bytes.
*
* @weight_sz: Weight of the region's size for prioritization.
@@ -260,6 +273,7 @@ struct damos_quota {
unsigned long ms;
unsigned long sz;
struct list_head goals;
+ enum damos_quota_goal_tuner goal_tuner;
unsigned long esz;
unsigned int weight_sz;
@@ -647,8 +661,7 @@ struct damon_operations {
void (*prepare_access_checks)(struct damon_ctx *context);
unsigned int (*check_accesses)(struct damon_ctx *context);
int (*get_scheme_score)(struct damon_ctx *context,
- struct damon_target *t, struct damon_region *r,
- struct damos *scheme);
+ struct damon_region *r, struct damos *scheme);
unsigned long (*apply_scheme)(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme, unsigned long *sz_filter_passed);
@@ -981,6 +994,7 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
int damon_set_region_biggest_system_ram_default(struct damon_target *t,
unsigned long *start, unsigned long *end,
+ unsigned long addr_unit,
unsigned long min_region_sz);
#endif /* CONFIG_DAMON */
diff --git a/include/linux/dax.h b/include/linux/dax.h
index bf103f317cac..10a7cc79aea5 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -69,7 +69,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc,
const struct inode *inode,
struct dax_device *dax_dev)
{
- if (!vma_desc_test_flags(desc, VMA_SYNC_BIT))
+ if (!vma_desc_test(desc, VMA_SYNC_BIT))
return true;
if (!IS_DAX(inode))
return false;
@@ -115,7 +115,7 @@ static inline bool daxdev_mapping_supported(const struct vm_area_desc *desc,
const struct inode *inode,
struct dax_device *dax_dev)
{
- return !vma_desc_test_flags(desc, VMA_SYNC_BIT);
+ return !vma_desc_test(desc, VMA_SYNC_BIT);
}
static inline size_t dax_recovery_write(struct dax_device *dax_dev,
pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
diff --git a/include/linux/pagevec.h b/include/linux/folio_batch.h
index 63be5a451627..b45946adc50b 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/folio_batch.h
@@ -1,18 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * include/linux/pagevec.h
+ * include/linux/folio_batch.h
*
* In many places it is efficient to batch an operation up against multiple
* folios. A folio_batch is a container which is used for that.
*/
-#ifndef _LINUX_PAGEVEC_H
-#define _LINUX_PAGEVEC_H
+#ifndef _LINUX_FOLIO_BATCH_H
+#define _LINUX_FOLIO_BATCH_H
#include <linux/types.h>
/* 31 pointers + header align the folio_batch structure to a power of two */
-#define PAGEVEC_SIZE 31
+#define FOLIO_BATCH_SIZE 31
struct folio;
@@ -29,7 +29,7 @@ struct folio_batch {
unsigned char nr;
unsigned char i;
bool percpu_pvec_drained;
- struct folio *folios[PAGEVEC_SIZE];
+ struct folio *folios[FOLIO_BATCH_SIZE];
};
/**
@@ -58,7 +58,7 @@ static inline unsigned int folio_batch_count(const struct folio_batch *fbatch)
static inline unsigned int folio_batch_space(const struct folio_batch *fbatch)
{
- return PAGEVEC_SIZE - fbatch->nr;
+ return FOLIO_BATCH_SIZE - fbatch->nr;
}
/**
@@ -93,7 +93,7 @@ static inline struct folio *folio_batch_next(struct folio_batch *fbatch)
return fbatch->folios[fbatch->i++];
}
-void __folio_batch_release(struct folio_batch *pvec);
+void __folio_batch_release(struct folio_batch *fbatch);
static inline void folio_batch_release(struct folio_batch *fbatch)
{
@@ -102,4 +102,4 @@ static inline void folio_batch_release(struct folio_batch *fbatch)
}
void folio_batch_remove_exceptionals(struct folio_batch *fbatch);
-#endif /* _LINUX_PAGEVEC_H */
+#endif /* _LINUX_FOLIO_BATCH_H */
diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h
index adab609c972e..f6d5f1f127c9 100644
--- a/include/linux/folio_queue.h
+++ b/include/linux/folio_queue.h
@@ -14,7 +14,7 @@
#ifndef _LINUX_FOLIO_QUEUE_H
#define _LINUX_FOLIO_QUEUE_H
-#include <linux/pagevec.h>
+#include <linux/folio_batch.h>
#include <linux/mm.h>
/*
@@ -29,12 +29,12 @@
*/
struct folio_queue {
struct folio_batch vec; /* Folios in the queue segment */
- u8 orders[PAGEVEC_SIZE]; /* Order of each folio */
+ u8 orders[FOLIO_BATCH_SIZE]; /* Order of each folio */
struct folio_queue *next; /* Next queue segment or NULL */
struct folio_queue *prev; /* Previous queue segment of NULL */
unsigned long marks; /* 1-bit mark per folio */
unsigned long marks2; /* Second 1-bit mark per folio */
-#if PAGEVEC_SIZE > BITS_PER_LONG
+#if FOLIO_BATCH_SIZE > BITS_PER_LONG
#error marks is not big enough
#endif
unsigned int rreq_id;
@@ -70,7 +70,7 @@ static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id)
*/
static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
{
- return PAGEVEC_SIZE;
+ return FOLIO_BATCH_SIZE;
}
/**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b5b01bb22d12..e1d257e6da68 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2058,16 +2058,24 @@ static inline bool can_mmap_file(struct file *file)
return true;
}
-int __compat_vma_mmap(const struct file_operations *f_op,
- struct file *file, struct vm_area_struct *vma);
+void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file,
+ const struct vm_area_struct *vma);
+int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma);
int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
+int __vma_check_mmap_hook(struct vm_area_struct *vma);
static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
{
+ int err;
+
if (file->f_op->mmap_prepare)
return compat_vma_mmap(file, vma);
- return file->f_op->mmap(file, vma);
+ err = file->f_op->mmap(file, vma);
+ if (err)
+ return err;
+
+ return __vma_check_mmap_hook(vma);
}
static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a4d9f964dfde..2949e5acff35 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -27,8 +27,8 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr, unsigned long next);
-int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr);
+bool zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr);
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud,
unsigned long addr);
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
@@ -83,7 +83,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
* file is never split and the MAX_PAGECACHE_ORDER limit does not apply to
* it. Same to PFNMAPs where there's neither page* nor pagecache.
*/
-#define THP_ORDERS_ALL_SPECIAL \
+#define THP_ORDERS_ALL_SPECIAL_DAX \
(BIT(PMD_ORDER) | BIT(PUD_ORDER))
#define THP_ORDERS_ALL_FILE_DEFAULT \
((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
@@ -92,7 +92,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
* Mask of all large folio orders supported for THP.
*/
#define THP_ORDERS_ALL \
- (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT)
+ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
enum tva_type {
TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */
@@ -771,6 +771,11 @@ static inline bool pmd_is_huge(pmd_t pmd)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline bool is_pmd_order(unsigned int order)
+{
+ return order == HPAGE_PMD_ORDER;
+}
+
static inline int split_folio_to_list_to_order(struct folio *folio,
struct list_head *list, int new_order)
{
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fc5462fe943f..93418625d3c5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -778,10 +778,6 @@ static inline unsigned long huge_page_size(const struct hstate *h)
return (unsigned long)PAGE_SIZE << h->order;
}
-extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);
-
-extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);
-
static inline unsigned long huge_page_mask(struct hstate *h)
{
return h->mask;
@@ -797,6 +793,23 @@ static inline unsigned huge_page_shift(struct hstate *h)
return h->order + PAGE_SHIFT;
}
+/**
+ * hugetlb_linear_page_index() - linear_page_index() but in hugetlb
+ * page size granularity.
+ * @vma: the hugetlb VMA
+ * @address: the virtual address within the VMA
+ *
+ * Return: the page offset within the mapping in huge page units.
+ */
+static inline pgoff_t hugetlb_linear_page_index(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct hstate *h = hstate_vma(vma);
+
+ return ((address - vma->vm_start) >> huge_page_shift(h)) +
+ (vma->vm_pgoff >> huge_page_order(h));
+}
+
static inline bool order_is_gigantic(unsigned int order)
{
return order > MAX_PAGE_ORDER;
@@ -1178,16 +1191,6 @@ static inline unsigned long huge_page_mask(struct hstate *h)
return PAGE_MASK;
}
-static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
-{
- return PAGE_SIZE;
-}
-
-static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
-{
- return PAGE_SIZE;
-}
-
static inline unsigned int huge_page_order(struct hstate *h)
{
return 0;
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 593f5d4e108b..565b473fd135 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -13,7 +13,7 @@ static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags)
{
- return vma_flags_test(flags, VMA_HUGETLB_BIT);
+ return vma_flags_test_any(flags, VMA_HUGETLB_BIT);
}
#else
@@ -30,7 +30,7 @@ static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags)
#endif
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(const struct vm_area_struct *vma)
{
return is_vm_hugetlb_flags(vma->vm_flags);
}
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index dfc516c1c719..a26fb8e7cedf 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1015,8 +1015,8 @@ struct vmbus_channel {
/* The max size of a packet on this channel */
u32 max_pkt_size;
- /* function to mmap ring buffer memory to the channel's sysfs ring attribute */
- int (*mmap_ring_buffer)(struct vmbus_channel *channel, struct vm_area_struct *vma);
+ /* function to mmap ring buffer memory to the channel's sysfs ring attribute */
+ int (*mmap_prepare_ring_buffer)(struct vmbus_channel *channel, struct vm_area_desc *desc);
/* boolean to control visibility of sysfs for ring buffer */
bool ring_sysfs_visible;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 531f9ebdeeae..2c5685adf3a9 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -9,7 +9,7 @@
#include <linux/types.h>
#include <linux/mm_types.h>
#include <linux/blkdev.h>
-#include <linux/pagevec.h>
+#include <linux/folio_batch.h>
struct address_space;
struct fiemap_extent_info;
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 338a1921a50a..bf233bde68c7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -352,8 +352,8 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip);
* kasan_mempool_unpoison_object().
*
* This function operates on all slab allocations including large kmalloc
- * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
- * size > KMALLOC_MAX_SIZE).
+ * allocations (i.e. the ones backed directly by the buddy allocator rather
+ * than kmalloc slab caches).
*
* Return: true if the allocation can be safely reused; false otherwise.
*/
@@ -381,8 +381,8 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip);
* original tags based on the pointer value.
*
* This function operates on all slab allocations including large kmalloc
- * allocations (the ones returned by kmalloc_large() or by kmalloc() with the
- * size > KMALLOC_MAX_SIZE).
+ * allocations (i.e. the ones backed directly by the buddy allocator rather
+ * than kmalloc slab caches).
*/
static __always_inline void kasan_mempool_unpoison_object(void *ptr,
size_t size)
diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h
index 2201a0d2c159..6b7d8ef550f9 100644
--- a/include/linux/kho/abi/kexec_handover.h
+++ b/include/linux/kho/abi/kexec_handover.h
@@ -10,8 +10,13 @@
#ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H
#define _LINUX_KHO_ABI_KEXEC_HANDOVER_H
+#include <linux/bits.h>
+#include <linux/log2.h>
+#include <linux/math.h>
#include <linux/types.h>
+#include <asm/page.h>
+
/**
* DOC: Kexec Handover ABI
*
@@ -29,32 +34,32 @@
* compatibility is only guaranteed for kernels supporting the same ABI version.
*
* FDT Structure Overview:
- * The FDT serves as a central registry for physical
- * addresses of preserved data structures and sub-FDTs. The first kernel
- * populates this FDT with references to memory regions and other FDTs that
- * need to persist across the kexec transition. The subsequent kernel then
- * parses this FDT to locate and restore the preserved data.::
+ * The FDT serves as a central registry for physical addresses of preserved
+ * data structures. The first kernel populates this FDT with references to
+ * memory regions and other metadata that need to persist across the kexec
+ * transition. The subsequent kernel then parses this FDT to locate and
+ * restore the preserved data.::
*
* / {
- * compatible = "kho-v1";
+ * compatible = "kho-v2";
*
* preserved-memory-map = <0x...>;
*
* <subnode-name-1> {
- * fdt = <0x...>;
+ * preserved-data = <0x...>;
* };
*
* <subnode-name-2> {
- * fdt = <0x...>;
+ * preserved-data = <0x...>;
* };
* ... ...
* <subnode-name-N> {
- * fdt = <0x...>;
+ * preserved-data = <0x...>;
* };
* };
*
* Root KHO Node (/):
- * - compatible: "kho-v1"
+ * - compatible: "kho-v2"
*
* Indentifies the overall KHO ABI version.
*
@@ -69,20 +74,20 @@
* is provided by the subsystem that uses KHO for preserving its
* data.
*
- * - fdt: u64
+ * - preserved-data: u64
*
- * Physical address pointing to a subnode FDT blob that is also
+ * Physical address pointing to a subnode data blob that is also
* being preserved.
*/
/* The compatible string for the KHO FDT root node. */
-#define KHO_FDT_COMPATIBLE "kho-v1"
+#define KHO_FDT_COMPATIBLE "kho-v2"
/* The FDT property for the preserved memory map. */
#define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map"
-/* The FDT property for sub-FDTs. */
-#define KHO_FDT_SUB_TREE_PROP_NAME "fdt"
+/* The FDT property for preserved data blobs. */
+#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data"
/**
* DOC: Kexec Handover ABI for vmalloc Preservation
@@ -160,4 +165,113 @@ struct kho_vmalloc {
unsigned short order;
};
+/**
+ * DOC: KHO persistent memory tracker
+ *
+ * KHO tracks preserved memory using a radix tree data structure. Each node of
+ * the tree is exactly a single page. The leaf nodes are bitmaps where each set
+ * bit is a preserved page of any order. The intermediate nodes are tables of
+ * physical addresses that point to a lower level node.
+ *
+ * The tree hierarchy is shown below::
+ *
+ * root
+ * +-------------------+
+ * | Level 5 | (struct kho_radix_node)
+ * +-------------------+
+ * |
+ * v
+ * +-------------------+
+ * | Level 4 | (struct kho_radix_node)
+ * +-------------------+
+ * |
+ * | ... (intermediate levels)
+ * |
+ * v
+ * +-------------------+
+ * | Level 0 | (struct kho_radix_leaf)
+ * +-------------------+
+ *
+ * The tree is traversed using a key that encodes the page's physical address
+ * (pa) and its order into a single unsigned long value. The encoded key value
+ * is composed of two parts: the 'order bit' in the upper part and the
+ * 'shifted physical address' in the lower part.::
+ *
+ * +------------+-----------------------------+--------------------------+
+ * | Page Order | Order Bit | Shifted Physical Address |
+ * +------------+-----------------------------+--------------------------+
+ * | 0 | ...000100 ... (at bit 52) | pa >> (PAGE_SHIFT + 0) |
+ * | 1 | ...000010 ... (at bit 51) | pa >> (PAGE_SHIFT + 1) |
+ * | 2 | ...000001 ... (at bit 50) | pa >> (PAGE_SHIFT + 2) |
+ * | ... | ... | ... |
+ * +------------+-----------------------------+--------------------------+
+ *
+ * Shifted Physical Address:
+ * The 'shifted physical address' is the physical address normalized for its
+ * order. It effectively represents the PFN shifted right by the order.
+ *
+ * Order Bit:
+ * The 'order bit' encodes the page order by setting a single bit at a
+ * specific position. The position of this bit itself represents the order.
+ *
+ * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
+ * maximum range for the shifted physical address (for order 0) is 52 bits
+ * (64 - 12). This address occupies bits [0-51]. For order 0, the order bit is
+ * set at position 52.
+ *
+ * The following diagram illustrates how the encoded key value is split into
+ * indices for the tree levels, with PAGE_SIZE of 4KB::
+ *
+ * 63:60 59:51 50:42 41:33 32:24 23:15 14:0
+ * +---------+--------+--------+--------+--------+--------+-----------------+
+ * | 0 | Lv 5 | Lv 4 | Lv 3 | Lv 2 | Lv 1 | Lv 0 (bitmap) |
+ * +---------+--------+--------+--------+--------+--------+-----------------+
+ *
+ * The radix tree stores pages of all orders in a single 6-level hierarchy. It
+ * efficiently shares higher tree levels, especially due to common zero top
+ * address bits, allowing a single, efficient algorithm to manage all
+ * pages. This bitmap approach also offers memory efficiency; for example, a
+ * 512KB bitmap can cover a 16GB memory range for 0-order pages with PAGE_SIZE =
+ * 4KB.
+ *
+ * The data structures defined here are part of the KHO ABI. Any modification
+ * to these structures that breaks backward compatibility must be accompanied by
+ * an update to the "compatible" string. This ensures that a newer kernel can
+ * correctly interpret the data passed by an older kernel.
+ */
+
+/*
+ * Defines constants for the KHO radix tree structure, used to track preserved
+ * memory. These constants govern the indexing, sizing, and depth of the tree.
+ */
+enum kho_radix_consts {
+	/*
+	 * The bit position of the order bit (and also the length of the
+	 * shifted physical address) for an order-0 page.
+	 */
+	KHO_ORDER_0_LOG2 = 64 - PAGE_SHIFT,
+
+	/* Size of the u64 table in kho_radix_node (one page), in log2 */
+	KHO_TABLE_SIZE_LOG2 = const_ilog2(PAGE_SIZE / sizeof(u64)),
+
+	/* Number of bits in the kho_radix_leaf bitmap, in log2 */
+	KHO_BITMAP_SIZE_LOG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
+
+	/*
+	 * The total tree depth is the number of intermediate levels
+	 * plus 1 bitmap level.
+	 */
+	KHO_TREE_MAX_DEPTH =
+		DIV_ROUND_UP(KHO_ORDER_0_LOG2 - KHO_BITMAP_SIZE_LOG2,
+			     KHO_TABLE_SIZE_LOG2) + 1,
+};
+
+struct kho_radix_node {
+ u64 table[1 << KHO_TABLE_SIZE_LOG2];
+};
+
+struct kho_radix_leaf {
+ DECLARE_BITMAP(bitmap, 1 << KHO_BITMAP_SIZE_LOG2);
+};
+
#endif /* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */
diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h
index 68cb6303b846..08b10fea2afc 100644
--- a/include/linux/kho/abi/memfd.h
+++ b/include/linux/kho/abi/memfd.h
@@ -56,10 +56,24 @@ struct memfd_luo_folio_ser {
u64 index;
} __packed;
+/*
+ * The set of seals this version supports preserving. If support for any new
+ * seals is needed, add it here and bump version.
+ */
+#define MEMFD_LUO_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE | \
+ F_SEAL_FUTURE_WRITE | \
+ F_SEAL_EXEC)
+
/**
* struct memfd_luo_ser - Main serialization structure for a memfd.
* @pos: The file's current position (f_pos).
* @size: The total size of the file in bytes (i_size).
+ * @seals: The seals present on the memfd. The seals are uABI so it is safe
+ * to directly use them in the ABI.
+ * @flags: Flags for the file. Unused flag bits must be set to 0.
* @nr_folios: Number of folios in the folios array.
* @folios: KHO vmalloc descriptor pointing to the array of
* struct memfd_luo_folio_ser.
@@ -67,11 +81,13 @@ struct memfd_luo_folio_ser {
struct memfd_luo_ser {
u64 pos;
u64 size;
+ u32 seals;
+ u32 flags;
u64 nr_folios;
struct kho_vmalloc folios;
} __packed;
/* The compatibility string for memfd file handler */
-#define MEMFD_LUO_FH_COMPATIBLE "memfd-v1"
+#define MEMFD_LUO_FH_COMPATIBLE "memfd-v2"
#endif /* _LINUX_KHO_ABI_MEMFD_H */
diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
new file mode 100644
index 000000000000..84e918b96e53
--- /dev/null
+++ b/include/linux/kho_radix_tree.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_KHO_RADIX_TREE_H
+#define _LINUX_KHO_RADIX_TREE_H
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/mutex_types.h>
+#include <linux/types.h>
+
+/**
+ * DOC: Kexec Handover Radix Tree
+ *
+ * This is a radix tree implementation for tracking physical memory pages
+ * across kexec transitions. It was developed for the KHO mechanism but is
+ * designed for broader use by any subsystem that needs to preserve pages.
+ *
+ * The radix tree is a multi-level tree where leaf nodes are bitmaps
+ * representing individual pages. To allow pages of different sizes (orders)
+ * to be stored efficiently in a single tree, it uses a unique key encoding
+ * scheme. Each key is an unsigned long that combines a page's physical
+ * address and its order.
+ *
+ * Client code is responsible for allocating the root node of the tree,
+ * initializing the mutex lock, and managing its lifecycle. It must use the
+ * tree data structures defined in the KHO ABI,
+ * `include/linux/kho/abi/kexec_handover.h`.
+ */
+
+struct kho_radix_node;
+
+struct kho_radix_tree {
+ struct kho_radix_node *root;
+ struct mutex lock; /* protects the tree's structure and root pointer */
+};
+
+typedef int (*kho_radix_tree_walk_callback_t)(phys_addr_t phys,
+ unsigned int order);
+
+#ifdef CONFIG_KEXEC_HANDOVER
+
+int kho_radix_add_page(struct kho_radix_tree *tree, unsigned long pfn,
+ unsigned int order);
+
+void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn,
+ unsigned int order);
+
+int kho_radix_walk_tree(struct kho_radix_tree *tree,
+ kho_radix_tree_walk_callback_t cb);
+
+#else /* #ifdef CONFIG_KEXEC_HANDOVER */
+
+static inline int kho_radix_add_page(struct kho_radix_tree *tree,
+				     unsigned long pfn, unsigned int order)
+{
+	return -EOPNOTSUPP; /* stub for !CONFIG_KEXEC_HANDOVER builds */
+}
+
+static inline void kho_radix_del_pa