From 9ec1e972c3de3106140c18d2a1c7c74795d85a69 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Fri, 30 Jan 2026 15:59:16 -0500
Subject: maple_tree: introduce maple_copy node and use it in
 mas_spanning_rebalance()

Introduce an internal-memory only node type called maple_copy to
facilitate internal copy operations.  Use it in mas_spanning_rebalance()
for just the leaf nodes.  Initially, the maple_copy node is used to
configure the source nodes and copy the data into the big_node.

The maple_copy contains a list of source entries with start and end
offsets.  One of the maple_copy entries can be itself with an offset of 0
to 2, representing the data where the store partially overwrites entries,
or fully overwrites the entry.  The side effect is that the source nodes
no longer have to worry about partially copying the existing offset if it
is not fully overwritten.

This is in preparation of removal of the maple big_node, but for the time
being the data is copied to the big node to limit the change size.

Link: https://lkml.kernel.org/r/20260130205935.2559335-12-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andrew Ballance <andrewjballance@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Kujau <lists@nerdbynature.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 7b8aad47121e..9bc7fa89bc2e 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -139,6 +139,7 @@ enum maple_type {
 	maple_leaf_64,
 	maple_range_64,
 	maple_arange_64,
+	maple_copy,
 };
 
 enum store_type {
@@ -154,6 +155,30 @@ enum store_type {
 	wr_slot_store,
 };
 
+struct maple_copy {
+	struct {
+		struct maple_node *node;
+		unsigned long max;
+		unsigned char start;
+		unsigned char end;
+		enum maple_type mt;
+	} src[4];
+	/* Simulated node */
+	void __rcu *slot[3];
+	unsigned long min;
+	union {
+		unsigned long pivot[3];
+		struct {
+			void *_pad[2];
+			unsigned long max;
+		};
+	};
+	unsigned char end;
+
+	/*Avoid passing these around */
+	unsigned char s_count;
+};
+
 /**
  * DOC: Maple tree flags
  *
@@ -299,6 +324,7 @@ struct maple_node {
 		};
 		struct maple_range_64 mr64;
 		struct maple_arange_64 ma64;
+		struct maple_copy cp;
 	};
 };
 
-- 
cgit v1.2.3


From 6953038cab845f3720ec8d83915f4f083861e195 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Fri, 30 Jan 2026 15:59:19 -0500
Subject: maple_tree: change initial big node setup in
 mas_wr_spanning_rebalance()

Instead of copying the data into the big node and finding out that the
data may need to be moved or appended to, calculate the data space up
front (in the maple copy node) and set up another source for the copy.

The additional copy source is tracked in the maple state sib (short for
sibling), and is put into the maple write states for future operations
after the data is in the big node.

To facilitate the newly moved node, some initial setup of the maple
subtree state are relocated after the potential shift caused by the new
way of rebalancing against a sibling.

Link: https://lkml.kernel.org/r/20260130205935.2559335-15-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andrew Ballance <andrewjballance@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Kujau <lists@nerdbynature.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 9bc7fa89bc2e..e99e16ac1c6d 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -177,6 +177,7 @@ struct maple_copy {
 
 	/*Avoid passing these around */
 	unsigned char s_count;
+	unsigned char data;
 };
 
 /**
-- 
cgit v1.2.3


From 20b20162e1f3b7e60cf0e79116fb2f3bdef3dc5e Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Fri, 30 Jan 2026 15:59:21 -0500
Subject: maple_tree: add gap support, slot and pivot sizes for maple copy

Add plumbing work for using maple copy as a normal node for a source of
copy operations.  This is needed later.

Link: https://lkml.kernel.org/r/20260130205935.2559335-17-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andrew Ballance <andrewjballance@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Kujau <lists@nerdbynature.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index e99e16ac1c6d..db6a02788902 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -165,6 +165,7 @@ struct maple_copy {
 	} src[4];
 	/* Simulated node */
 	void __rcu *slot[3];
+	unsigned long gap[3];
 	unsigned long min;
 	union {
 		unsigned long pivot[3];
-- 
cgit v1.2.3


From a9c6716e088a1d4badd4fa6797469506bb99ec8b Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Fri, 30 Jan 2026 15:59:22 -0500
Subject: maple_tree: start using maple copy node for destination

Stop using the maple subtree state and big node in favour of using three
destinations in the maple copy node.  That is, expand the way leaves were
handled to all levels of the tree and use the maple copy node to track the
new nodes.

Extract out the sibling init into the data calculation since this is where
the insufficient data can be detected.  The remainder of the sibling code
to shift the next iteration is moved to the spanning_ascend() function,
since it is not always needed.

Next introduce the dst_setup() function which will decide how many nodes
are needed to contain the data at this level.  Using the destination
count, populate the copy node's dst array with the new nodes and set
d_count to the correct value.  Note that this can be tricky in the case of
a leaf node with exactly enough room because of the rule against NULLs at
the end of leaves.

Once the destinations are ready, copy the data by altering the
cp_data_write() function to copy from the sources to the destinations
directly.  This eliminates the use of the big node in this code path.  On
node completion, node_finalise() will zero out the remaining area and set
the metadata, if necessary.

spanning_ascend() is used to decide if the operation is complete.  It may
create a new root, converge into one destination, or continue upwards by
ascending the left and right write maple states.

One test case setup needed to be tweaked so that the targeted node was
surrounded by full nodes.

[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/20260130205935.2559335-18-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andrew Ballance <andrewjballance@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Kujau <lists@nerdbynature.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index db6a02788902..0c464eade1d6 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -156,6 +156,17 @@ enum store_type {
 };
 
 struct maple_copy {
+	/*
+	 * min, max, and pivots are values
+	 * start, end, split are indexes into arrays
+	 * data is a size
+	 */
+
+	struct {
+		struct maple_node *node;
+		unsigned long max;
+		enum maple_type mt;
+	} dst[3];
 	struct {
 		struct maple_node *node;
 		unsigned long max;
@@ -178,7 +189,10 @@ struct maple_copy {
 
 	/*Avoid passing these around */
 	unsigned char s_count;
+	unsigned char d_count;
+	unsigned char split;
 	unsigned char data;
+	unsigned char height;
 };
 
 /**
-- 
cgit v1.2.3


From e4f4fc7aa8b720d934a0bfcea7f8aae4271d308f Mon Sep 17 00:00:00 2001
From: "JP Kobryn (Meta)" <jp.kobryn@linux.dev>
Date: Thu, 19 Feb 2026 15:58:46 -0800
Subject: mm: move pgscan, pgsteal, pgrefill to node stats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are situations where reclaim kicks in on a system with free memory.
One possible cause is a NUMA imbalance scenario where one or more nodes
are under pressure.  It would help if we could easily identify such nodes.

Move the pgscan, pgsteal, and pgrefill counters from vm_event_item to
node_stat_item to provide per-node reclaim visibility.  With these
counters as node stats, the values are now displayed in the per-node
section of /proc/zoneinfo, which allows for quick identification of the
affected nodes.

/proc/vmstat continues to report the same counters, aggregated across all
nodes.  But the ordering of these items within the readout changes as they
move from the vm events section to the node stats section.

Memcg accounting of these counters is preserved.  The relocated counters
remain visible in memory.stat alongside the existing aggregate pgscan and
pgsteal counters.

However, this change affects how the global counters are accumulated.
Previously, the global event count update was gated on !cgroup_reclaim(),
excluding memcg-based reclaim from /proc/vmstat.  Now that
mod_lruvec_state() is being used to update the counters, the global
counters will include all reclaim.  This is consistent with how pgdemote
counters are already tracked.

Finally, the virtio_balloon driver is updated to use
global_node_page_state() to fetch the counters, as they are no longer
accessible through the vm_events array.

Link: https://lkml.kernel.org/r/20260219235846.161910-1-jp.kobryn@linux.dev
Signed-off-by: JP Kobryn <jp.kobryn@linux.dev>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h        | 13 +++++++++++++
 include/linux/vm_event_item.h | 13 -------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e51190a55e4..546bca95ca40 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -255,6 +255,19 @@ enum node_stat_item {
 	PGDEMOTE_DIRECT,
 	PGDEMOTE_KHUGEPAGED,
 	PGDEMOTE_PROACTIVE,
+	PGSTEAL_KSWAPD,
+	PGSTEAL_DIRECT,
+	PGSTEAL_KHUGEPAGED,
+	PGSTEAL_PROACTIVE,
+	PGSTEAL_ANON,
+	PGSTEAL_FILE,
+	PGSCAN_KSWAPD,
+	PGSCAN_DIRECT,
+	PGSCAN_KHUGEPAGED,
+	PGSCAN_PROACTIVE,
+	PGSCAN_ANON,
+	PGSCAN_FILE,
+	PGREFILL,
 #ifdef CONFIG_HUGETLB_PAGE
 	NR_HUGETLB,
 #endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 22a139f82d75..03fe95f5a020 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,21 +38,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
-		PGREFILL,
 		PGREUSE,
-		PGSTEAL_KSWAPD,
-		PGSTEAL_DIRECT,
-		PGSTEAL_KHUGEPAGED,
-		PGSTEAL_PROACTIVE,
-		PGSCAN_KSWAPD,
-		PGSCAN_DIRECT,
-		PGSCAN_KHUGEPAGED,
-		PGSCAN_PROACTIVE,
 		PGSCAN_DIRECT_THROTTLE,
-		PGSCAN_ANON,
-		PGSCAN_FILE,
-		PGSTEAL_ANON,
-		PGSTEAL_FILE,
 #ifdef CONFIG_NUMA
 		PGSCAN_ZONE_RECLAIM_SUCCESS,
 		PGSCAN_ZONE_RECLAIM_FAILED,
-- 
cgit v1.2.3


From 0d6af9bcf383bcdf601e670bb605861b01e318e7 Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Wed, 18 Feb 2026 04:06:34 +0800
Subject: mm, swap: use the swap table to track the swap count

Now all the infrastructures are ready, switch to using the swap table
only.  This is unfortunately a large patch because the whole old counting
mechanism, especially SWP_CONTINUED, has to be gone and switch to the new
mechanism together, with no intermediate steps available.

The swap table is capable of holding up to SWP_TB_COUNT_MAX - 1 counts in
the higher bits of each table entry, so using that, the swap_map can be
completely dropped.

swap_map also had a limit of SWAP_CONT_MAX.  Any value beyond that limit
will require a COUNT_CONTINUED page.  COUNT_CONTINUED is a bit complex to
maintain, so for the swap table, a simpler approach is used: when the
count goes beyond SWP_TB_COUNT_MAX - 1, the cluster will have an
extend_table allocated, which is a swap cluster-sized array of unsigned
int.  The counting is basically offloaded there until the count drops
below SWP_TB_COUNT_MAX again.

Both the swap table and the extend table are cluster-based, so they
exhibit good performance and sparsity.

To make the switch from swap_map to swap table clean, this commit cleans
up and introduces a new set of functions based on the swap table design,
for manipulating swap counts:

- __swap_cluster_dup_entry, __swap_cluster_put_entry,
  __swap_cluster_alloc_entry, __swap_cluster_free_entry:

  Increase/decrease the count of a swap slot, or alloc / free a swap
  slot. This is the internal routine that does the counting work based
  on the swap table and handles all the complexities. The caller will
  need to lock the cluster before calling them.

  All swap count-related update operations are wrapped by these four
  helpers.

- swap_dup_entries_cluster, swap_put_entries_cluster:

  Increase/decrease the swap count of one or a set of swap slots in the
  same cluster range. These two helpers serve as the common routines for
  folio_dup_swap & swap_dup_entry_direct, or
  folio_put_swap & swap_put_entries_direct.

And use these helpers to replace all existing callers. This helps to
simplify the count tracking by a lot, and the swap_map is gone.

[ryncsn@gmail.com: fix build]
  Link: https://lkml.kernel.org/r/aZWuLZi-vYi3vAWe@KASONG-MC4
Link: https://lkml.kernel.org/r/20260218-swap-table-p3-v3-9-f4e34be021a7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: kernel test robot <lkp@intel.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 62fc7499b408..0effe3cc50f5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -208,7 +208,6 @@ enum {
 	SWP_DISCARDABLE = (1 << 2),	/* blkdev support discard */
 	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
 	SWP_SOLIDSTATE	= (1 << 4),	/* blkdev seeks are cheap */
-	SWP_CONTINUED	= (1 << 5),	/* swap_map has count continuation */
 	SWP_BLKDEV	= (1 << 6),	/* its a block device */
 	SWP_ACTIVATED	= (1 << 7),	/* set after swap_activate success */
 	SWP_FS_OPS	= (1 << 8),	/* swapfile operations go through fs */
@@ -223,16 +222,6 @@ enum {
 #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10)
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
-/* Bit flag in swap_map */
-#define COUNT_CONTINUED	0x80	/* Flag swap_map continuation for full count */
-
-/* Special value in first swap_map */
-#define SWAP_MAP_MAX	0x3e	/* Max count */
-#define SWAP_MAP_BAD	0x3f	/* Note page is bad */
-
-/* Special value in each swap_map continuation */
-#define SWAP_CONT_MAX	0x7f	/* Max count */
-
 /*
  * The first page in the swap file is the swap header, which is always marked
  * bad to prevent it from being allocated as an entry. This also prevents the
@@ -264,8 +253,7 @@ struct swap_info_struct {
 	signed short	prio;		/* swap priority of this type */
 	struct plist_node list;		/* entry in swap_active_head */
 	signed char	type;		/* strange name for an index */
-	unsigned int	max;		/* extent of the swap_map */
-	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
+	unsigned int	max;		/* size of this swap device */
 	unsigned long *zeromap;		/* kvmalloc'ed bitmap to track zero pages */
 	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
 	struct list_head free_clusters; /* free clusters list */
@@ -284,18 +272,14 @@ struct swap_info_struct {
 	struct completion comp;		/* seldom referenced */
 	spinlock_t lock;		/*
 					 * protect map scan related fields like
-					 * swap_map, inuse_pages and all cluster
-					 * lists. other fields are only changed
+					 * inuse_pages and all cluster lists.
+					 * Other fields are only changed
 					 * at swapon/swapoff, so are protected
 					 * by swap_lock. changing flags need
 					 * hold this lock and swap_lock. If
 					 * both locks need hold, hold swap_lock
 					 * first.
 					 */
-	spinlock_t cont_lock;		/*
-					 * protect swap count continuation page
-					 * list.
-					 */
 	struct work_struct discard_work; /* discard worker */
 	struct work_struct reclaim_work; /* reclaim worker */
 	struct list_head discard_clusters; /* discard clusters list */
@@ -451,7 +435,6 @@ static inline long get_nr_swap_pages(void)
 }
 
 extern void si_swapinfo(struct sysinfo *);
-extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 int swap_type_of(dev_t device, sector_t offset);
 int find_first_swap(dev_t *device);
 extern unsigned int count_swap_pages(int, int);
@@ -517,11 +500,6 @@ static inline void free_swap_cache(struct folio *folio)
 {
 }
 
-static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
-{
-	return 0;
-}
-
 static inline int swap_dup_entry_direct(swp_entry_t ent)
 {
 	return 0;
-- 
cgit v1.2.3


From 1beb9b7223d2a1f1872f76a3d29b0a4a3cee4171 Mon Sep 17 00:00:00 2001
From: "Pratyush Yadav (Google)" <pratyush@kernel.org>
Date: Mon, 16 Feb 2026 19:59:32 +0100
Subject: memfd: export memfd_{add,get}_seals()

Patch series "mm: memfd_luo: preserve file seals", v2.

This series adds support for preserving file seals when preserving a memfd
using LUO.  Patch 1 exports some memfd seal manipulation functions and
patch 2 adds support for preserving them.  Since it makes changes to the
serialized data structure for memfd, it also bumps the version number.


This patch (of 2):

Support for preserving file seals will be added to memfd preservation
using the Live Update Orchestrator (LUO).  Export memfd_{add,get}_seals)()
so memfd_luo can use them to manipulate the seals.

Link: https://lkml.kernel.org/r/20260216185946.1215770-1-pratyush@kernel.org
Link: https://lkml.kernel.org/r/20260216185946.1215770-2-pratyush@kernel.org
Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Samiullah Khawaja <skhawaja@google.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memfd.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index c328a7b356d0..b4fda09dab9f 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -18,6 +18,8 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
  */
 int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr);
 struct file *memfd_alloc_file(const char *name, unsigned int flags);
+int memfd_get_seals(struct file *file);
+int memfd_add_seals(struct file *file, unsigned int seals);
 #else
 static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
 {
@@ -37,6 +39,16 @@ static inline struct file *memfd_alloc_file(const char *name, unsigned int flags
 {
 	return ERR_PTR(-EINVAL);
 }
+
+static inline int memfd_get_seals(struct file *file)
+{
+	return -EINVAL;
+}
+
+static inline int memfd_add_seals(struct file *file, unsigned int seals)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif /* __LINUX_MEMFD_H */
-- 
cgit v1.2.3


From 8a552d68a86ef0e6fb2ff4af13031a5e82c0f1d0 Mon Sep 17 00:00:00 2001
From: "Pratyush Yadav (Google)" <pratyush@kernel.org>
Date: Mon, 16 Feb 2026 19:59:33 +0100
Subject: mm: memfd_luo: preserve file seals

File seals are used on memfd for making shared memory communication with
untrusted peers safer and simpler.  Seals provide a guarantee that certain
operations won't be allowed on the file such as writes or truncations.
Maintaining these guarantees across a live update will help keeping such
use cases secure.

These guarantees will also be needed for IOMMUFD preservation with LUO.
Normally when IOMMUFD maps a memfd, it pins all its pages to make sure any
truncation operations on the memfd don't lead to IOMMUFD using freed
memory.  This doesn't work with LUO since the preserved memfd might have
completely different pages after a live update, and mapping them back to
the IOMMUFD will cause all sorts of problems.  Using and preserving the
seals allows IOMMUFD preservation logic to trust the memfd.

Since the uABI defines seals as an int, preserve them by introducing a new
u32 field.  There are currently only 6 possible seals, so the extra bits
are unused and provide room for future expansion.  Since the seals are
uABI, it is safe to use them directly in the ABI.  While at it, also add a
u32 flags field.  It makes sure the struct is nicely aligned, and can be
used later to support things like MFD_CLOEXEC.

Since the serialization structure is changed, bump the version number to
"memfd-v2".

It is important to note that the memfd-v2 version only supports seals that
existed when this version was defined.  This set is defined by
MEMFD_LUO_ALL_SEALS.  Any new seal might bring a completely different
semantic with it and the parser for memfd-v2 cannot be expected to deal
with that.  If there are any future seals added, they will need another
version bump.

Link: https://lkml.kernel.org/r/20260216185946.1215770-3-pratyush@kernel.org
Signed-off-by: Pratyush Yadav (Google) <pratyush@kernel.org>
Tested-by: Samiullah Khawaja <skhawaja@google.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kho/abi/memfd.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h
index 68cb6303b846..08b10fea2afc 100644
--- a/include/linux/kho/abi/memfd.h
+++ b/include/linux/kho/abi/memfd.h
@@ -56,10 +56,24 @@ struct memfd_luo_folio_ser {
 	u64 index;
 } __packed;
 
+/*
+ * The set of seals this version supports preserving. If support for any new
+ * seals is needed, add it here and bump version.
+ */
+#define MEMFD_LUO_ALL_SEALS (F_SEAL_SEAL | \
+			     F_SEAL_SHRINK | \
+			     F_SEAL_GROW | \
+			     F_SEAL_WRITE | \
+			     F_SEAL_FUTURE_WRITE | \
+			     F_SEAL_EXEC)
+
 /**
  * struct memfd_luo_ser - Main serialization structure for a memfd.
  * @pos:       The file's current position (f_pos).
  * @size:      The total size of the file in bytes (i_size).
+ * @seals:     The seals present on the memfd. The seals are uABI so it is safe
+ *             to directly use them in the ABI.
+ * @flags:     Flags for the file. Unused flag bits must be set to 0.
  * @nr_folios: Number of folios in the folios array.
  * @folios:    KHO vmalloc descriptor pointing to the array of
  *             struct memfd_luo_folio_ser.
@@ -67,11 +81,13 @@ struct memfd_luo_folio_ser {
 struct memfd_luo_ser {
 	u64 pos;
 	u64 size;
+	u32 seals;
+	u32 flags;
 	u64 nr_folios;
 	struct kho_vmalloc folios;
 } __packed;
 
 /* The compatibility string for memfd file handler */
-#define MEMFD_LUO_FH_COMPATIBLE	"memfd-v1"
+#define MEMFD_LUO_FH_COMPATIBLE	"memfd-v2"
 
 #endif /* _LINUX_KHO_ABI_MEMFD_H */
-- 
cgit v1.2.3


From c9cb94c6b85a2854ae03c874331b0880ee735441 Mon Sep 17 00:00:00 2001
From: Asier Gutierrez <gutierrez.asier@huawei-partners.com>
Date: Fri, 13 Feb 2026 14:50:32 +0000
Subject: mm/damon: remove unused target param of get_scheme_score()

damon_target is not used by get_scheme_score operations, nor with virtual
neither with physical addresses.

Link: https://lkml.kernel.org/r/20260213145032.1740407-1-gutierrez.asier@huawei-partners.com
Signed-off-by: Asier Gutierrez <gutierrez.asier@huawei-partners.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Quanmin Yan <yanquanmin1@huawei.com>
Cc: ze zuo <zuoze1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index be3d198043ff..60e6da3012fa 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -647,8 +647,7 @@ struct damon_operations {
 	void (*prepare_access_checks)(struct damon_ctx *context);
 	unsigned int (*check_accesses)(struct damon_ctx *context);
 	int (*get_scheme_score)(struct damon_ctx *context,
-			struct damon_target *t, struct damon_region *r,
-			struct damos *scheme);
+			struct damon_region *r, struct damos *scheme);
 	unsigned long (*apply_scheme)(struct damon_ctx *context,
 			struct damon_target *t, struct damon_region *r,
 			struct damos *scheme, unsigned long *sz_filter_passed);
-- 
cgit v1.2.3


From 5ad41a38c36474ff59545cb514801d90719555de Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <jiayuan.chen@shopee.com>
Date: Fri, 13 Feb 2026 15:18:22 +0800
Subject: mm: zswap: add per-memcg stat for incompressible pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "mm: zswap: add per-memcg stat for incompressible pages", v3.

In containerized environments, knowing which cgroup is contributing
incompressible pages to zswap is essential for effective resource
management.  This series adds a new per-memcg stat 'zswap_incomp' to track
incompressible pages, along with a selftest.


This patch (of 2):

The global zswap_stored_incompressible_pages counter was added in commit
dca4437a5861 ("mm/zswap: store <PAGE_SIZE compression failed page as-is")
to track how many pages are stored in raw (uncompressed) form in zswap.
However, in containerized environments, knowing which cgroup is
contributing incompressible pages is essential for effective resource
management [1].

Add a new memcg stat 'zswap_incomp' to track incompressible pages per
cgroup.  This helps administrators and orchestrators to:

1. Identify workloads that produce incompressible data (e.g., encrypted
   data, already-compressed media, random data) and may not benefit from
   zswap.

2. Make informed decisions about workload placement - moving
   incompressible workloads to nodes with larger swap backing devices
   rather than relying on zswap.

3. Debug zswap efficiency issues at the cgroup level without needing to
   correlate global stats with individual cgroups.

While the compression ratio can be estimated from existing stats (zswap /
zswapped * PAGE_SIZE), this doesn't distinguish between "uniformly poor
compression" and "a few completely incompressible pages mixed with highly
compressible ones".  The zswap_incomp stat provides direct visibility into
the latter case.

Link: https://lkml.kernel.org/r/20260213071827.5688-1-jiayuan.chen@linux.dev
Link: https://lkml.kernel.org/r/20260213071827.5688-2-jiayuan.chen@linux.dev
Link: https://lore.kernel.org/linux-mm/CAF8kJuONDFj4NAksaR4j_WyDbNwNGYLmTe-o76rqU17La=nkOw@mail.gmail.com/ [1]
Signed-off-by: Jiayuan Chen <jiayuan.chen@shopee.com>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 70b685a85bf4..5695776f32c8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -39,6 +39,7 @@ enum memcg_stat_item {
 	MEMCG_KMEM,
 	MEMCG_ZSWAP_B,
 	MEMCG_ZSWAPPED,
+	MEMCG_ZSWAP_INCOMP,
 	MEMCG_NR_STAT,
 };
 
-- 
cgit v1.2.3


From c5c48345135ff04e039377020df23294d59aa59a Mon Sep 17 00:00:00 2001
From: Gregory Price <gourry@gourry.net>
Date: Wed, 11 Feb 2026 16:54:47 -0500
Subject: mm: name the anonymous MMOP enum as enum mmop

Give the MMOP enum (MMOP_OFFLINE, MMOP_ONLINE, etc) a proper type name so
the compiler can help catch invalid values being assigned to variables of
this type.

Leave the existing functions returning int alone to allow for
value-or-error pattern to remain unchanged without churn.

mmop_default_online_type is left as int because it uses the -1 sentinal
value to signal it hasn't been initialized yet.

Keep the uint8_t buffer in offline_and_remove_memory() as-is for space
efficiency, with an explicit cast when we consume the value.

Move the enum definition before the CONFIG_MEMORY_HOTPLUG guard so it is
unconditionally available for struct memory_block in memory.h.

No functional change.

Link: https://lore.kernel.org/linux-mm/3424eba7-523b-4351-abd0-3a888a3e5e61@kernel.org/
Link: https://lkml.kernel.org/r/20260211215447.2194189-1-gourry@gourry.net
Signed-off-by: Gregory Price <gourry@gourry.net>
Suggested-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Suggested-by: "David Hildenbrand (arm)" <david@kernel.org>
Reviewed-by: Ben Cheatham <benjamin.cheatham@amd.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory.h         |  3 ++-
 include/linux/memory_hotplug.h | 16 ++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory.h b/include/linux/memory.h
index faeaa921e55b..5bb5599c6b2b 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -19,6 +19,7 @@
 #include <linux/node.h>
 #include <linux/compiler.h>
 #include <linux/mutex.h>
+#include <linux/memory_hotplug.h>
 
 #define MIN_MEMORY_BLOCK_SIZE     (1UL << SECTION_SIZE_BITS)
 
@@ -77,7 +78,7 @@ enum memory_block_state {
 struct memory_block {
 	unsigned long start_section_nr;
 	enum memory_block_state state;	/* serialized by the dev->lock */
-	int online_type;		/* for passing data to online routine */
+	enum mmop online_type;	/* for passing data to online routine */
 	int nid;			/* NID for this memory block */
 	/*
 	 * The single zone of this memory block if all PFNs of this memory block
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f2f16cdd73ee..e77ef3d7ff73 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,11 +16,8 @@ struct resource;
 struct vmem_altmap;
 struct dev_pagemap;
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-struct page *pfn_to_online_page(unsigned long pfn);
-
 /* Types for control the zone type of onlined and offlined memory */
-enum {
+enum mmop {
 	/* Offline the memory. */
 	MMOP_OFFLINE = 0,
 	/* Online the memory. Zone depends, see default_zone_for_pfn(). */
@@ -31,6 +28,9 @@ enum {
 	MMOP_ONLINE_MOVABLE,
 };
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+struct page *pfn_to_online_page(unsigned long pfn);
+
 /* Flags for add_memory() and friends to specify memory hotplug details. */
 typedef int __bitwise mhp_t;
 
@@ -286,8 +286,8 @@ static inline void __remove_memory(u64 start, u64 size) {}
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /* Default online_type (MMOP_*) when new memory blocks are added. */
-extern int mhp_get_default_online_type(void);
-extern void mhp_set_default_online_type(int online_type);
+extern enum mmop mhp_get_default_online_type(void);
+extern void mhp_set_default_online_type(enum mmop online_type);
 extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat);
 extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
 extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
@@ -310,8 +310,8 @@ extern void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
 				  struct vmem_altmap *altmap);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
-extern struct zone *zone_for_pfn_range(int online_type, int nid,
-		struct memory_group *group, unsigned long start_pfn,
+extern struct zone *zone_for_pfn_range(enum mmop online_type,
+		int nid, struct memory_group *group, unsigned long start_pfn,
 		unsigned long nr_pages);
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
 				      struct mhp_params *params);
-- 
cgit v1.2.3


From 652d12bc74a075f345f228f8945e05517a38874d Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 11 Feb 2026 12:31:38 +0200
Subject: mm: don't special case !MMU for is_zero_pfn() and my_zero_pfn()

Patch series "arch, mm: consolidate empty_zero_page", v3.

These patches cleanup handling of ZERO_PAGE() and zero_pfn.


This patch (of 4):

nommu architectures have empty_zero_page and define ZERO_PAGE() and
although they don't really use it to populate page tables, there is no
reason to hardwire !MMU implementation of is_zero_pfn() and my_zero_pfn()
to 0.

Drop #ifdef CONFIG_MMU around implementations of is_zero_pfn() and
my_zero_pfn() and remove !MMU version.

While on it, make zero_pfn __ro_after_init.

Link: https://lkml.kernel.org/r/20260211103141.3215197-1-rppt@kernel.org
Link: https://lkml.kernel.org/r/20260211103141.3215197-2-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Magnus Lindholm <linmag7@gmail.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a50df42a893f..5e772599d9a5 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1917,7 +1917,6 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
 	pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot);
 }
 
-#ifdef CONFIG_MMU
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
 {
@@ -1940,18 +1939,7 @@ static inline unsigned long my_zero_pfn(unsigned long addr)
 	extern unsigned long zero_pfn;
 	return zero_pfn;
 }
-#endif
-#else
-static inline int is_zero_pfn(unsigned long pfn)
-{
-	return 0;
-}
-
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
-	return 0;
-}
-#endif /* CONFIG_MMU */
+#endif /* __HAVE_COLOR_ZERO_PAGE */
 
 #ifdef CONFIG_MMU
 
-- 
cgit v1.2.3


From 9a1d0c738b45ea8da4e6897099c708e89f43daad Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 11 Feb 2026 12:31:39 +0200
Subject: mm: rename my_zero_pfn() to zero_pfn()

my_zero_pfn() is a silly name.

Rename zero_pfn variable to zero_page_pfn and my_zero_pfn() function to
zero_pfn().

While on it, move extern declarations of zero_page_pfn outside the
functions that use it and add a comment about what ZERO_PAGE is.

Link: https://lkml.kernel.org/r/20260211103141.3215197-3-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Magnus Lindholm <linmag7@gmail.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5e772599d9a5..c3a56f6b1ea5 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1917,27 +1917,39 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
 	pfnmap_setup_cachemode(pfn, PAGE_SIZE, prot);
 }
 
+/*
+ * ZERO_PAGE() is global shared page(s) that is always zero. It is used for
+ * zero-mapped memory areas, CoW etc.
+ *
+ * On architectures that __HAVE_COLOR_ZERO_PAGE there are several such pages
+ * for different ranges in the virtual address space.
+ *
+ * zero_page_pfn identifies the first (or the only) pfn for these pages.
+ */
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
 {
-	extern unsigned long zero_pfn;
-	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
+	extern unsigned long zero_page_pfn;
+	unsigned long offset_from_zero_pfn = pfn - zero_page_pfn;
+
 	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
 }
 
-#define my_zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))
+#define zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))
 
 #else
 static inline int is_zero_pfn(unsigned long pfn)
 {
-	extern unsigned long zero_pfn;
-	return pfn == zero_pfn;
+	extern unsigned long zero_page_pfn;
+
+	return pfn == zero_page_pfn;
 }
 
-static inline unsigned long my_zero_pfn(unsigned long addr)
+static inline unsigned long zero_pfn(unsigned long addr)
 {
-	extern unsigned long zero_pfn;
-	return zero_pfn;
+	extern unsigned long zero_page_pfn;
+
+	return zero_page_pfn;
 }
 #endif /* __HAVE_COLOR_ZERO_PAGE */
 
-- 
cgit v1.2.3


From 6215d9f4470fbb48245ffdfade821685e2728c65 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 11 Feb 2026 12:31:40 +0200
Subject: arch, mm: consolidate empty_zero_page

Reduce 22 declarations of empty_zero_page to 3 and 23 declarations of
ZERO_PAGE() to 4.

Every architecture defines empty_zero_page that way or another, but for the
most of them it is always a page aligned page in BSS and most definitions
of ZERO_PAGE do virt_to_page(empty_zero_page).

Move Linus vetted x86 definition of empty_zero_page and ZERO_PAGE() to the
core MM and drop these definitions in architectures that do not implement
colored zero page (MIPS and s390).

ZERO_PAGE() remains a macro because turning it to a wrapper for a static
inline causes severe pain in header dependencies.

For the most part the change is mechanical, with these being noteworthy:

* alpha: aliased empty_zero_page with ZERO_PGE that was also used for boot
  parameters. Switching to a generic empty_zero_page removes the aliasing
  and keeps ZERO_PGE for boot parameters only
* arm64: uses __pa_symbol() in ZERO_PAGE() so that definition of
  ZERO_PAGE() is kept intact.
* m68k/parisc/um: allocated empty_zero_page from memblock,
  although they do not support zero page coloring and having it in BSS
  will work fine.
* sparc64 can have empty_zero_page in BSS rather allocate it, but it
  can't use virt_to_page() for BSS. Keep it's definition of ZERO_PAGE()
  but instead of allocating it, make mem_map_zero point to
  empty_zero_page.
* sh: used empty_zero_page for boot parameters at the very early boot.
  Rename the parameters page to boot_params_page and let sh use the generic
  empty_zero_page.
* hexagon: had an amusing comment about empty_zero_page

	/* A handy thing to have if one has the RAM. Declared in head.S */

  that unfortunately had to go :)

Link: https://lkml.kernel.org/r/20260211103141.3215197-4-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Helge Deller <deller@gmx.de>		[parisc]
Tested-by: Helge Deller <deller@gmx.de>		[parisc]
Reviewed-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Magnus Lindholm <linmag7@gmail.com>	[alpha]
Acked-by: Dinh Nguyen <dinguyen@kernel.org>	[nios2]
Acked-by: Andreas Larsson <andreas@gaisler.com>	[sparc]
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c3a56f6b1ea5..2a05c3885f85 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1925,6 +1925,9 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
  * for different ranges in the virtual address space.
  *
  * zero_page_pfn identifies the first (or the only) pfn for these pages.
+ *
+ * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in
+ * empty_zero_page in BSS.
  */
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
@@ -1951,6 +1954,13 @@ static inline unsigned long zero_pfn(unsigned long addr)
 
 	return zero_page_pfn;
 }
+
+extern uint8_t empty_zero_page[PAGE_SIZE];
+
+#ifndef ZERO_PAGE
+#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))
+#endif
+
 #endif /* __HAVE_COLOR_ZERO_PAGE */
 
 #ifdef CONFIG_MMU
-- 
cgit v1.2.3


From 26513781d1b3a1e8b4b576ed62751d604a69b374 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Wed, 11 Feb 2026 12:31:41 +0200
Subject: mm: cache struct page for empty_zero_page and return it from
 ZERO_PAGE()

For most architectures every invocation of ZERO_PAGE() does
virt_to_page(empty_zero_page).  But empty_zero_page is in BSS and it is
enough to get its struct page once at initialization time and then use it
whenever a zero page should be accessed.

Add yet another __zero_page variable that will be initialized as
virt_to_page(empty_zero_page) for most architectures in a weak
arch_setup_zero_pages() function.

For architectures that use colored zero pages (MIPS and s390) rename their
setup_zero_pages() to arch_setup_zero_pages() and make it global rather
than static.

For architectures that cannot use virt_to_page() for BSS (arm64 and
sparc64) add override of arch_setup_zero_pages().

Link: https://lkml.kernel.org/r/20260211103141.3215197-5-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dinh Nguyen <dinguyen@kernel.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Magnus Lindholm <linmag7@gmail.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2a05c3885f85..776993d4567b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1929,6 +1929,8 @@ static inline void pfnmap_setup_cachemode_pfn(unsigned long pfn, pgprot_t *prot)
  * For architectures that don't __HAVE_COLOR_ZERO_PAGE the zero page lives in
  * empty_zero_page in BSS.
  */
+void arch_setup_zero_pages(void);
+
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
 {
@@ -1956,10 +1958,13 @@ static inline unsigned long zero_pfn(unsigned long addr)
 }
 
 extern uint8_t empty_zero_page[PAGE_SIZE];
+extern struct page *__zero_page;
 
-#ifndef ZERO_PAGE
-#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page))
-#endif
+static inline struct page *_zero_page(unsigned long addr)
+{
+	return __zero_page;
+}
+#define ZERO_PAGE(vaddr) _zero_page(vaddr)
 
 #endif /* __HAVE_COLOR_ZERO_PAGE */
 
-- 
cgit v1.2.3


From 6cc153f90b7cf07db2b49469dfd79141b145036a Mon Sep 17 00:00:00 2001
From: Vernon Yang <yanglincheng@kylinos.cn>
Date: Sat, 21 Feb 2026 17:39:17 +0800
Subject: mm: add folio_test_lazyfree helper

Add folio_test_lazyfree() function to identify lazy-free folios to improve
code readability.

Link: https://lkml.kernel.org/r/20260221093918.1456187-4-vernon2gm@gmail.com
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f7a0e4af0c73..415e9f2ef616 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -724,6 +724,11 @@ static __always_inline bool folio_test_anon(const struct folio *folio)
 	return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0;
 }
 
+static __always_inline bool folio_test_lazyfree(const struct folio *folio)
+{
+	return folio_test_anon(folio) && !folio_test_swapbacked(folio);
+}
+
 static __always_inline bool PageAnonNotKsm(const struct page *page)
 {
 	unsigned long flags = (unsigned long)page_folio(page)->mapping;
-- 
cgit v1.2.3


From 3f2ad90060f65d6f66414b8a67c569154bafec7b Mon Sep 17 00:00:00 2001
From: Jason Miu <jasonmiu@google.com>
Date: Thu, 5 Feb 2026 18:14:27 -0800
Subject: kho: adopt radix tree for preserved memory tracking

Patch series "Make KHO Stateless", v9.

This series transitions KHO from an xarray-based metadata tracking system
with serialization to a radix tree data structure that can be passed
directly to the next kernel.

The key motivations for this change are to:
- Eliminate the need for data serialization before kexec.
- Remove the KHO finalize state.
- Pass preservation metadata more directly to the next kernel via the FDT.

The new approach uses a radix tree to mark preserved pages.  A page's
physical address and its order are encoded into a single value.  The tree
is composed of multiple levels of page-sized tables, with leaf nodes being
bitmaps where each set bit represents a preserved page.  The physical
address of the radix tree's root is passed in the FDT, allowing the next
kernel to reconstruct the preserved memory map.

This series is broken down into the following patches:

1.  kho: Adopt radix tree for preserved memory tracking:
    Replaces the xarray-based tracker with the new radix tree
    implementation and increments the ABI version.

2.  kho: Remove finalize state and clients:
    Removes the now-obsolete kho_finalize() function and its usage
    from client code and debugfs.


This patch (of 2):

Introduce a radix tree implementation for tracking preserved memory pages
and switch the KHO memory tracking mechanism to use it.  This lays the
groundwork for a stateless KHO implementation that eliminates the need for
serialization and the associated "finalize" state.

This patch introduces the core radix tree data structures and constants to
the KHO ABI.  It adds the radix tree node and leaf structures, along with
documentation for the radix tree key encoding scheme that combines a
page's physical address and order.

To support broader use by other kernel subsystems, such as hugetlb
preservation, the core radix tree manipulation functions are exported as a
public API.

The xarray-based memory tracking is replaced with this new radix tree
implementation.  The core KHO preservation and unpreservation functions
are wired up to use the radix tree helpers.  On boot, the second kernel
restores the preserved memory map by walking the radix tree whose root
physical address is passed via the FDT.

The ABI `compatible` version is bumped to "kho-v2" to reflect the
structural changes in the preserved memory map and sub-FDT property names.
This includes renaming "fdt" to "preserved-data" to better reflect that
preserved state may use formats other than FDT.

[ran.xiaokai@zte.com.cn: fix child node parsing for debugfs in/sub_fdts]
  Link: https://lkml.kernel.org/r/20260309033530.244508-1-ranxiaokai627@163.com
Link: https://lkml.kernel.org/r/20260206021428.3386442-1-jasonmiu@google.com
Link: https://lkml.kernel.org/r/20260206021428.3386442-2-jasonmiu@google.com
Signed-off-by: Jason Miu <jasonmiu@google.com>
Signed-off-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: David Matlack <dmatlack@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Ran Xiaokai <ran.xiaokai@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kho/abi/kexec_handover.h | 144 +++++++++++++++++++++++++++++----
 include/linux/kho_radix_tree.h         |  70 ++++++++++++++++
 2 files changed, 199 insertions(+), 15 deletions(-)
 create mode 100644 include/linux/kho_radix_tree.h

(limited to 'include/linux')

diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h
index 2201a0d2c159..6b7d8ef550f9 100644
--- a/include/linux/kho/abi/kexec_handover.h
+++ b/include/linux/kho/abi/kexec_handover.h
@@ -10,8 +10,13 @@
 #ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H
 #define _LINUX_KHO_ABI_KEXEC_HANDOVER_H
 
+#include <linux/bits.h>
+#include <linux/log2.h>
+#include <linux/math.h>
 #include <linux/types.h>
 
+#include <asm/page.h>
+
 /**
  * DOC: Kexec Handover ABI
  *
@@ -29,32 +34,32 @@
  * compatibility is only guaranteed for kernels supporting the same ABI version.
  *
  * FDT Structure Overview:
- *   The FDT serves as a central registry for physical
- *   addresses of preserved data structures and sub-FDTs. The first kernel
- *   populates this FDT with references to memory regions and other FDTs that
- *   need to persist across the kexec transition. The subsequent kernel then
- *   parses this FDT to locate and restore the preserved data.::
+ *   The FDT serves as a central registry for physical addresses of preserved
+ *   data structures. The first kernel populates this FDT with references to
+ *   memory regions and other metadata that need to persist across the kexec
+ *   transition. The subsequent kernel then parses this FDT to locate and
+ *   restore the preserved data.::
  *
  *     / {
- *         compatible = "kho-v1";
+ *         compatible = "kho-v2";
  *
  *         preserved-memory-map = <0x...>;
  *
  *         <subnode-name-1> {
- *             fdt = <0x...>;
+ *             preserved-data = <0x...>;
  *         };
  *
  *         <subnode-name-2> {
- *             fdt = <0x...>;
+ *             preserved-data = <0x...>;
  *         };
  *               ... ...
  *         <subnode-name-N> {
- *             fdt = <0x...>;
+ *             preserved-data = <0x...>;
  *         };
  *     };
  *
  *   Root KHO Node (/):
- *     - compatible: "kho-v1"
+ *     - compatible: "kho-v2"
  *
  *       Indentifies the overall KHO ABI version.
  *
@@ -69,20 +74,20 @@
  *     is provided by the subsystem that uses KHO for preserving its
  *     data.
  *
- *     - fdt: u64
+ *     - preserved-data: u64
  *
- *       Physical address pointing to a subnode FDT blob that is also
+ *       Physical address pointing to a subnode data blob that is also
  *       being preserved.
  */
 
 /* The compatible string for the KHO FDT root node. */
-#define KHO_FDT_COMPATIBLE "kho-v1"
+#define KHO_FDT_COMPATIBLE "kho-v2"
 
 /* The FDT property for the preserved memory map. */
 #define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map"
 
-/* The FDT property for sub-FDTs. */
-#define KHO_FDT_SUB_TREE_PROP_NAME "fdt"
+/* The FDT property for preserved data blobs. */
+#define KHO_FDT_SUB_TREE_PROP_NAME "preserved-data"
 
 /**
  * DOC: Kexec Handover ABI for vmalloc Preservation
@@ -160,4 +165,113 @@ struct kho_vmalloc {
 	unsigned short order;
 };
 
+/**
+ * DOC: KHO persistent memory tracker
+ *
+ * KHO tracks preserved memory using a radix tree data structure. Each node of
+ * the tree is exactly a single page. The leaf nodes are bitmaps where each set
+ * bit is a preserved page of any order. The intermediate nodes are tables of
+ * physical addresses that point to a lower level node.
+ *
+ * The tree hierarchy is shown below::
+ *
+ *   root
+ *   +-------------------+
+ *   |     Level 5       | (struct kho_radix_node)
+ *   +-------------------+
+ *     |
+ *     v
+ *   +-------------------+
+ *   |     Level 4       | (struct kho_radix_node)
+ *   +-------------------+
+ *     |
+ *     | ... (intermediate levels)
+ *     |
+ *     v
+ *   +-------------------+
+ *   |      Level 0      | (struct kho_radix_leaf)
+ *   +-------------------+
+ *
+ * The tree is traversed using a key that encodes the page's physical address
+ * (pa) and its order into a single unsigned long value. The encoded key value
+ * is composed of two parts: the 'order bit' in the upper part and the
+ * 'shifted physical address' in the lower part.::
+ *
+ *   +------------+-----------------------------+--------------------------+
+ *   | Page Order | Order Bit                   | Shifted Physical Address |
+ *   +------------+-----------------------------+--------------------------+
+ *   | 0          | ...000100 ... (at bit 52)   | pa >> (PAGE_SHIFT + 0)   |
+ *   | 1          | ...000010 ... (at bit 51)   | pa >> (PAGE_SHIFT + 1)   |
+ *   | 2          | ...000001 ... (at bit 50)   | pa >> (PAGE_SHIFT + 2)   |
+ *   | ...        | ...                         | ...                      |
+ *   +------------+-----------------------------+--------------------------+
+ *
+ * Shifted Physical Address:
+ * The 'shifted physical address' is the physical address normalized for its
+ * order. It effectively represents the PFN shifted right by the order.
+ *
+ * Order Bit:
+ * The 'order bit' encodes the page order by setting a single bit at a
+ * specific position. The position of this bit itself represents the order.
+ *
+ * For instance, on a 64-bit system with 4KB pages (PAGE_SHIFT = 12), the
+ * maximum range for the shifted physical address (for order 0) is 52 bits
+ * (64 - 12). This address occupies bits [0-51]. For order 0, the order bit is
+ * set at position 52.
+ *
+ * The following diagram illustrates how the encoded key value is split into
+ * indices for the tree levels, with PAGE_SIZE of 4KB::
+ *
+ *        63:60   59:51    50:42    41:33    32:24    23:15         14:0
+ *   +---------+--------+--------+--------+--------+--------+-----------------+
+ *   |    0    |  Lv 5  |  Lv 4  |  Lv 3  |  Lv 2  |  Lv 1  |  Lv 0 (bitmap)  |
+ *   +---------+--------+--------+--------+--------+--------+-----------------+
+ *
+ * The radix tree stores pages of all orders in a single 6-level hierarchy. It
+ * efficiently shares higher tree levels, especially due to common zero top
+ * address bits, allowing a single, efficient algorithm to manage all
+ * pages. This bitmap approach also offers memory efficiency; for example, a
+ * 512KB bitmap can cover a 16GB memory range for 0-order pages with PAGE_SIZE =
+ * 4KB.
+ *
+ * The data structures defined here are part of the KHO ABI. Any modification
+ * to these structures that breaks backward compatibility must be accompanied by
+ * an update to the "compatible" string. This ensures that a newer kernel can
+ * correctly interpret the data passed by an older kernel.
+ */
+
+/*
+ * Defines constants for the KHO radix tree structure, used to track preserved
+ * memory. These constants govern the indexing, sizing, and depth of the tree.
+ */
+enum kho_radix_consts {
+	/*
+	 * The bit position of the order bit (and also the length of the
+	 * shifted physical address) for an order-0 page.
+	 */
+	KHO_ORDER_0_LOG2 = 64 - PAGE_SHIFT,
+
+	/* Size of the table in kho_radix_node, in log2 */
+	KHO_TABLE_SIZE_LOG2 = const_ilog2(PAGE_SIZE / sizeof(phys_addr_t)),
+
+	/* Number of bits in the kho_radix_leaf bitmap, in log2 */
+	KHO_BITMAP_SIZE_LOG2 = PAGE_SHIFT + const_ilog2(BITS_PER_BYTE),
+
+	/*
+	 * The total tree depth is the number of intermediate levels
+	 * and 1 bitmap level.
+	 */
+	KHO_TREE_MAX_DEPTH =
+		DIV_ROUND_UP(KHO_ORDER_0_LOG2 - KHO_BITMAP_SIZE_LOG2,
+			     KHO_TABLE_SIZE_LOG2) + 1,
+};
+
+struct kho_radix_node {
+	u64 table[1 << KHO_TABLE_SIZE_LOG2];
+};
+
+struct kho_radix_leaf {
+	DECLARE_BITMAP(bitmap, 1 << KHO_BITMAP_SIZE_LOG2);
+};
+
 #endif	/* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */
diff --git a/include/linux/kho_radix_tree.h b/include/linux/kho_radix_tree.h
new file mode 100644
index 000000000000..84e918b96e53
--- /dev/null
+++ b/include/linux/kho_radix_tree.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+