Diffstat (limited to 'kernel'): 26 files changed, 3134 insertions, 571 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 54e581072617..15632358bcf7 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -94,30 +94,6 @@ config KEXEC_JUMP
 	  Jump between original kernel and kexeced kernel and invoke
 	  code in physical address mode via KEXEC
 
-config KEXEC_HANDOVER
-	bool "kexec handover"
-	depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
-	depends on !DEFERRED_STRUCT_PAGE_INIT
-	select MEMBLOCK_KHO_SCRATCH
-	select KEXEC_FILE
-	select DEBUG_FS
-	select LIBFDT
-	select CMA
-	help
-	  Allow kexec to hand over state across kernels by generating and
-	  passing additional metadata to the target kernel. This is useful
-	  to keep data or state alive across the kexec. For this to work,
-	  both source and target kernels need to have this option enabled.
-
-config KEXEC_HANDOVER_DEBUG
-	bool "Enable Kexec Handover debug checks"
-	depends on KEXEC_HANDOVER
-	help
-	  This option enables extra sanity checks for the Kexec Handover
-	  subsystem. Since, KHO performance is crucial in live update
-	  scenarios and the extra code might be adding overhead it is
-	  only optionally enabled.
-
 config CRASH_DUMP
 	bool "kernel crash dumps"
 	default ARCH_DEFAULT_CRASH_DUMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 9fe722305c9b..e83669841b8c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -52,6 +52,7 @@ obj-y += printk/
 obj-y += irq/
 obj-y += rcu/
 obj-y += livepatch/
+obj-y += liveupdate/
 obj-y += dma/
 obj-y += entry/
 obj-y += unwind/
@@ -82,8 +83,6 @@ obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
 obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
-obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
-obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup/
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e81327d2cd63..9f6ab7dabf67 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -83,7 +83,7 @@ CONFIG_SLUB_DEBUG_ON=y
 #
 # Debug Oops, Lockups and Hangs
 #
-# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_DEBUG_ATOMIC_SLEEP=y
 CONFIG_DETECT_HUNG_TASK=y
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 87bf4d41eabb..62e60e0223cf 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsigned long long cma_size)
 #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
 static __init int insert_crashkernel_resources(void)
 {
+	if (!arch_add_crash_res_to_iomem())
+		return 0;
+
 	if (crashk_res.start < crashk_res.end)
 		insert_resource(&iomem_resource, &crashk_res);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 4dc1918db67b..8a87021211ae 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -251,10 +251,8 @@ repeat:
 	memset(&post, 0, sizeof(post));
 
 	/* don't need to get the RCU readlock here - the process is dead and
-	 * can't be modifying its own credentials. But shut RCU-lockdep up */
-	rcu_read_lock();
+	 * can't be modifying its own credentials.
*/ dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - rcu_read_unlock(); pidfs_exit(p); cgroup_task_release(p); diff --git a/kernel/fork.c b/kernel/fork.c index 198e02e21e6e..b1f3915d5f8e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -208,15 +208,62 @@ struct vm_stack { struct vm_struct *stack_vm_area; }; +static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node) +{ + struct vm_struct *vm_area; + unsigned int i; + + /* + * If the node has memory, we are guaranteed the stacks are backed by local pages. + * Otherwise the pages are arbitrary. + * + * Note that depending on cpuset it is possible we will get migrated to a different + * node immediately after allocating here, so this does *not* guarantee locality for + * arbitrary callers. + */ + scoped_guard(preempt) { + if (node != NUMA_NO_NODE && numa_node_id() != node) + return NULL; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (vm_area) + return vm_area; + } + } + + return NULL; +} + static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; + int nid; - for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *tmp = NULL; + /* + * Don't cache stacks if any of the pages don't match the local domain, unless + * there is no local memory to begin with. + * + * Note that lack of local memory does not automatically mean it makes no difference + * performance-wise which other domain backs the stack. In this case we are merely + * trying to avoid constantly going to vmalloc. + */ + scoped_guard(preempt) { + nid = numa_node_id(); + if (node_state(nid, N_MEMORY)) { + for (i = 0; i < vm_area->nr_pages; i++) { + struct page *page = vm_area->pages[i]; + if (page_to_nid(page) != nid) + return false; + } + } + + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) - return true; + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) + return true; + } } return false; } @@ -283,13 +330,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct vm_struct *vm_area; void *stack; - int i; - - for (i = 0; i < NR_CACHED_STACKS; i++) { - vm_area = this_cpu_xchg(cached_stacks[i], NULL); - if (!vm_area) - continue; + vm_area = alloc_thread_stack_node_from_cache(tsk, node); + if (vm_area) { if (memcg_charge_kernel_stack(vm_area)) { vfree(vm_area->addr); return -ENOMEM; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b2c1f14b8129..d2254c91450b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -24,6 +24,7 @@ #include <linux/sched/sysctl.h> #include <linux/hung_task.h> #include <linux/rwsem.h> +#include <linux/sys_info.h> #include <trace/events/sched.h> @@ -50,7 +51,6 @@ static unsigned long __read_mostly sysctl_hung_task_detect_count; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; -EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: @@ -60,12 +60,17 @@ static unsigned long __read_mostly sysctl_hung_task_check_interval_secs; static int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; -static bool hung_task_show_lock; static bool hung_task_call_panic; -static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +/* + * A bitmask to control what kinds of system info to be printed when + 
* a hung task is detected, it could be task, memory, lock etc. Refer + * include/linux/sys_info.h for detailed bit definition. + */ +static unsigned long hung_task_si_mask; + #ifdef CONFIG_SMP /* * Should we dump all CPUs backtraces in a hung task event? @@ -81,7 +86,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ static unsigned int __read_mostly sysctl_hung_task_panic = - IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); + CONFIG_BOOTPARAM_HUNG_TASK_PANIC; static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) @@ -218,8 +223,11 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti } #endif -static void check_hung_task(struct task_struct *t, unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout, + unsigned long prev_detect_count) { + unsigned long total_hung_task; + if (!task_is_hung(t, timeout)) return; @@ -229,11 +237,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) */ sysctl_hung_task_detect_count++; + total_hung_task = sysctl_hung_task_detect_count - prev_detect_count; trace_sched_process_hang(t); - if (sysctl_hung_task_panic) { + if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) { console_verbose(); - hung_task_show_lock = true; hung_task_call_panic = true; } @@ -256,10 +264,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); debug_show_blocker(t, timeout); - hung_task_show_lock = true; - if (sysctl_hung_task_all_cpu_backtrace) - hung_task_show_all_bt = true; if (!sysctl_hung_task_warnings) pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n"); } @@ -300,6 +305,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; + unsigned long prev_detect_count = sysctl_hung_task_detect_count; + int need_warning = sysctl_hung_task_warnings; + unsigned long si_mask = hung_task_si_mask; /* * If the system crashed already then all bets are off, @@ -308,7 +316,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - hung_task_show_lock = false; + rcu_read_lock(); for_each_process_thread(g, t) { @@ -320,18 +328,23 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) last_break = jiffies; } - check_hung_task(t, timeout); + check_hung_task(t, timeout, prev_detect_count); } unlock: rcu_read_unlock(); - if (hung_task_show_lock) - debug_show_all_locks(); - if (hung_task_show_all_bt) { - hung_task_show_all_bt = false; - trigger_all_cpu_backtrace(); + if (!(sysctl_hung_task_detect_count - prev_detect_count)) + return; + + if (need_warning || hung_task_call_panic) { + si_mask |= SYS_INFO_LOCKS; + + if (sysctl_hung_task_all_cpu_backtrace) + si_mask |= SYS_INFO_ALL_BT; } + sys_info(si_mask); + if (hung_task_call_panic) panic("hung_task: blocked tasks"); } @@ -389,7 +402,7 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "hung_task_check_count", @@ -430,6 +443,13 @@ static const struct ctl_table hung_task_sysctls[] = { .mode = 0444, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "hung_task_sys_info", + .data = 
&hung_task_si_mask, + .maxlen = sizeof(hung_task_si_mask), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static void __init hung_task_sysctl_init(void) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fa00b239c5d9..0f92acdd354d 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -15,6 +15,7 @@ #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/liveupdate.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> @@ -41,6 +42,7 @@ #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <linux/dma-map-ops.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/sections.h> @@ -742,7 +744,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) struct kexec_segment *segment = &image->segment[idx]; struct page *cma = image->segment_cma[idx]; char *ptr = page_address(cma); - unsigned long maddr; size_t ubytes, mbytes; int result = 0; unsigned char __user *buf = NULL; @@ -754,15 +755,12 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; - maddr = segment->mem; /* Then copy from source buffer to the CMA one */ while (mbytes) { size_t uchunk, mchunk; - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -784,7 +782,6 @@ static int kimage_load_cma_segment(struct kimage *image, int idx) } ptr += mchunk; - maddr += mchunk; mbytes -= mchunk; cond_resched(); @@ -839,9 +836,7 @@ static int kimage_load_normal_segment(struct kimage *image, int idx) ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (uchunk) { @@ -904,9 +899,7 @@ static int kimage_load_crash_segment(struct kimage *image, int idx) } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap_local_page(page); - ptr += maddr & ~PAGE_MASK; - mchunk = min_t(size_t, mbytes, - PAGE_SIZE - (maddr & ~PAGE_MASK)); + mchunk = min_t(size_t, mbytes, PAGE_SIZE); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ @@ -1146,6 +1139,10 @@ int kernel_kexec(void) goto Unlock; } + error = liveupdate_reboot(); + if (error) + goto Unlock; + #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { /* @@ -1229,3 +1226,143 @@ int kernel_kexec(void) kexec_unlock(); return error; } + +static ssize_t loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!kexec_image); +} +static struct kobj_attribute loaded_attr = __ATTR_RO(loaded); + +#ifdef CONFIG_CRASH_DUMP +static ssize_t crash_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); +} +static struct kobj_attribute crash_loaded_attr = __ATTR_RO(crash_loaded); + +#ifdef CONFIG_CRASH_RESERVE +static ssize_t crash_cma_ranges_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + + ssize_t len = 0; + int i; + + for (i = 0; i < crashk_cma_cnt; ++i) { + len += sysfs_emit_at(buf, len, "%08llx-%08llx\n", + crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + } + return len; +} +static struct kobj_attribute crash_cma_ranges_attr = 
__ATTR_RO(crash_cma_ranges); +#endif + +static ssize_t crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sysfs_emit(buf, "%zd\n", size); +} +static ssize_t crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (kstrtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 ? ret : count; +} +static struct kobj_attribute crash_size_attr = __ATTR_RW(crash_size); + +#ifdef CONFIG_CRASH_HOTPLUG +static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + unsigned int sz = crash_get_elfcorehdr_size(); + + return sysfs_emit(buf, "%u\n", sz); +} +static struct kobj_attribute crash_elfcorehdr_size_attr = __ATTR_RO(crash_elfcorehdr_size); + +#endif /* CONFIG_CRASH_HOTPLUG */ +#endif /* CONFIG_CRASH_DUMP */ + +static struct attribute *kexec_attrs[] = { + &loaded_attr.attr, +#ifdef CONFIG_CRASH_DUMP + &crash_loaded_attr.attr, + &crash_size_attr.attr, +#ifdef CONFIG_CRASH_RESERVE + &crash_cma_ranges_attr.attr, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + &crash_elfcorehdr_size_attr.attr, +#endif +#endif + NULL +}; + +struct kexec_link_entry { + const char *target; + const char *name; +}; + +static struct kexec_link_entry kexec_links[] = { + { "loaded", "kexec_loaded" }, +#ifdef CONFIG_CRASH_DUMP + { "crash_loaded", "kexec_crash_loaded" }, + { "crash_size", "kexec_crash_size" }, +#ifdef CONFIG_CRASH_RESERVE + {"crash_cma_ranges", "kexec_crash_cma_ranges"}, +#endif +#ifdef CONFIG_CRASH_HOTPLUG + { "crash_elfcorehdr_size", "crash_elfcorehdr_size" }, +#endif +#endif +}; + +static struct kobject *kexec_kobj; +ATTRIBUTE_GROUPS(kexec); + +static int __init init_kexec_sysctl(void) +{ + int error; + int i; + + kexec_kobj = kobject_create_and_add("kexec", kernel_kobj); + if (!kexec_kobj) { + pr_err("failed to create kexec kobject\n"); + return -ENOMEM; + } + + error = sysfs_create_groups(kexec_kobj, kexec_groups); + if (error) + goto kset_exit; + + for (i = 0; i < ARRAY_SIZE(kexec_links); i++) { + error = compat_only_sysfs_link_entry_to_kobj(kernel_kobj, kexec_kobj, + kexec_links[i].target, + kexec_links[i].name); + if (error) + pr_err("Unable to create %s symlink (%d)", kexec_links[i].name, error); + } + + return 0; + +kset_exit: + kobject_put(kexec_kobj); + return error; +} + +subsys_initcall(init_kexec_sysctl); diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h deleted file mode 100644 index 3c3c7148ceed..000000000000 --- a/kernel/kexec_handover_internal.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H -#define LINUX_KEXEC_HANDOVER_INTERNAL_H - -#include <linux/kexec_handover.h> -#include <linux/types.h> - -extern struct kho_scratch *kho_scratch; -extern unsigned int kho_scratch_cnt; - -#ifdef CONFIG_KEXEC_HANDOVER_DEBUG -bool kho_scratch_overlap(phys_addr_t phys, size_t size); -#else -static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) -{ - return false; -} -#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ - -#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index eefb67d9883c..a9e6354d9e25 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -12,7 +12,7 @@ #include <linux/sysfs.h> #include <linux/export.h> #include <linux/init.h> -#include <linux/kexec.h> +#include 
<linux/vmcore_info.h> #include <linux/profile.h> #include <linux/stat.h> #include <linux/sched.h> @@ -119,50 +119,6 @@ static ssize_t profiling_store(struct kobject *kobj, KERNEL_ATTR_RW(profiling); #endif -#ifdef CONFIG_KEXEC_CORE -static ssize_t kexec_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", !!kexec_image); -} -KERNEL_ATTR_RO(kexec_loaded); - -#ifdef CONFIG_CRASH_DUMP -static ssize_t kexec_crash_loaded_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sysfs_emit(buf, "%d\n", kexec_crash_loaded()); -} -KERNEL_ATTR_RO(kexec_crash_loaded); - -static ssize_t kexec_crash_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - ssize_t size = crash_get_memory_size(); - - if (size < 0) - return size; - - return sysfs_emit(buf, "%zd\n", size); -} -static ssize_t kexec_crash_size_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long cnt; - int ret; - - if (kstrtoul(buf, 0, &cnt)) - return -EINVAL; - - ret = crash_shrink_memory(cnt); - return ret < 0 ? ret : count; -} -KERNEL_ATTR_RW(kexec_crash_size); - -#endif /* CONFIG_CRASH_DUMP*/ -#endif /* CONFIG_KEXEC_CORE */ - #ifdef CONFIG_VMCORE_INFO static ssize_t vmcoreinfo_show(struct kobject *kobj, @@ -174,18 +130,6 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#ifdef CONFIG_CRASH_HOTPLUG -static ssize_t crash_elfcorehdr_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - unsigned int sz = crash_get_elfcorehdr_size(); - - return sysfs_emit(buf, "%u\n", sz); -} -KERNEL_ATTR_RO(crash_elfcorehdr_size); - -#endif - #endif /* CONFIG_VMCORE_INFO */ /* whether file capabilities are enabled */ @@ -255,18 +199,8 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_PROFILING &profiling_attr.attr, #endif -#ifdef CONFIG_KEXEC_CORE - &kexec_loaded_attr.attr, -#ifdef CONFIG_CRASH_DUMP - &kexec_crash_loaded_attr.attr, - &kexec_crash_size_attr.attr, -#endif -#endif #ifdef CONFIG_VMCORE_INFO &vmcoreinfo_attr.attr, -#ifdef CONFIG_CRASH_HOTPLUG - &crash_elfcorehdr_size_attr.attr, -#endif #endif #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig new file mode 100644 index 000000000000..9b2515f31afb --- /dev/null +++ b/kernel/liveupdate/Kconfig @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Live Update and Kexec HandOver" + depends on !DEFERRED_STRUCT_PAGE_INIT + +config KEXEC_HANDOVER + bool "kexec handover" + depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT + select MEMBLOCK_KHO_SCRATCH + select KEXEC_FILE + select LIBFDT + select CMA + help + Allow kexec to hand over state across kernels by generating and + passing additional metadata to the target kernel. This is useful + to keep data or state alive across the kexec. For this to work, + both source and target kernels need to have this option enabled. + +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. 
+ +config KEXEC_HANDOVER_DEBUGFS + bool "kexec handover debugfs interface" + default KEXEC_HANDOVER + depends on KEXEC_HANDOVER + select DEBUG_FS + help + Allow to control kexec handover device tree via debugfs + interface, i.e. finalize the state or aborting the finalization. + Also, enables inspecting the KHO fdt trees with the debugfs binary + blobs. + +config KEXEC_HANDOVER_ENABLE_DEFAULT + bool "Enable kexec handover by default" + depends on KEXEC_HANDOVER + help + Enable Kexec Handover by default. This avoids the need to + explicitly pass 'kho=on' on the kernel command line. + + This is useful for systems where KHO is a prerequisite for other + features, such as Live Update, ensuring the mechanism is always + active. + + The default behavior can still be overridden at boot time by + passing 'kho=off'. + +config LIVEUPDATE + bool "Live Update Orchestrator" + depends on KEXEC_HANDOVER + help + Enable the Live Update Orchestrator. Live Update is a mechanism, + typically based on kexec, that allows the kernel to be updated + while keeping selected devices operational across the transition. + These devices are intended to be reclaimed by the new kernel and + re-attached to their original workload without requiring a device + reset. + + Ability to handover a device from current to the next kernel depends + on specific support within device drivers and related kernel + subsystems. + + This feature primarily targets virtual machine hosts to quickly update + the kernel hypervisor with minimal disruption to the running virtual + machines. + + If unsure, say N. + +endmenu diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile new file mode 100644 index 000000000000..7cad2eece32d --- /dev/null +++ b/kernel/liveupdate/Makefile @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 + +luo-y := \ + luo_core.o \ + luo_file.o \ + luo_session.o + +obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUGFS) += kexec_handover_debugfs.o + +obj-$(CONFIG_LIVEUPDATE) += luo.o diff --git a/kernel/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 03d12e27189f..9dc51fab604f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -4,21 +4,22 @@ * Copyright (C) 2023 Alexander Graf <graf@amazon.com> * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> + * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com> */ #define pr_fmt(fmt) "KHO: " fmt #include <linux/cleanup.h> #include <linux/cma.h> +#include <linux/kmemleak.h> #include <linux/count_zeros.h> -#include <linux/debugfs.h> #include <linux/kexec.h> #include <linux/kexec_handover.h> #include <linux/libfdt.h> #include <linux/list.h> #include <linux/memblock.h> -#include <linux/notifier.h> #include <linux/page-isolation.h> +#include <linux/unaligned.h> #include <linux/vmalloc.h> #include <asm/early_ioremap.h> @@ -28,8 +29,9 @@ * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. 
*/ -#include "../mm/internal.h" -#include "kexec_internal.h" +#include "../../mm/internal.h" +#include "../kexec_internal.h" +#include "kexec_handover_internal.h" #define KHO_FDT_COMPATIBLE "kho-v1" #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" @@ -51,7 +53,7 @@ union kho_page_info { static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private)); -static bool kho_enable __ro_after_init; +static bool kho_enable __ro_after_init = IS_ENABLED(CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT); bool kho_is_enabled(void) { @@ -103,34 +105,19 @@ struct kho_mem_track { struct khoser_mem_chunk; -struct kho_serialization { - struct page *fdt; - struct list_head fdt_list; - struct dentry *sub_fdt_dir; - struct kho_mem_track track; - /* First chunk of serialized preserved memory map */ - struct khoser_mem_chunk *preserved_mem_map; -}; - struct kho_out { - struct blocking_notifier_head chain_head; - - struct dentry *dir; - + void *fdt; + bool finalized; struct mutex lock; /* protects KHO FDT finalization */ - struct kho_serialization ser; - bool finalized; + struct kho_mem_track track; + struct kho_debugfs dbg; }; static struct kho_out kho_out = { - .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head), .lock = __MUTEX_INITIALIZER(kho_out.lock), - .ser = { - .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list), - .track = { - .orders = XARRAY_INIT(kho_out.ser.track.orders, 0), - }, + .track = { + .orders = XARRAY_INIT(kho_out.track.orders, 0), }, .finalized = false, }; @@ -159,26 +146,33 @@ static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) return no_free_ptr(elm); } -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, + unsigned int order) { struct kho_mem_phys_bits *bits; struct kho_mem_phys *physxa; + const unsigned long pfn_high = pfn >> order; - while (pfn < end_pfn) { - const unsigned int order = - min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - const unsigned long pfn_high = pfn >> order; + physxa = xa_load(&track->orders, order); + if (WARN_ON_ONCE(!physxa)) + re |
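
The fork.c hunks above split the cached-stack lookup into alloc_thread_stack_node_from_cache() and teach try_release_thread_stack_to_cache() to refuse stacks whose pages are not local to the current NUMA node. The underlying claim/release scheme is a small lock-free slot array. Below is a minimal userspace sketch of that pattern, assuming C11 atomics and illustrative names; the kernel itself uses this_cpu_xchg()/this_cpu_try_cmpxchg() inside scoped_guard(preempt) plus the NUMA checks shown in the diff.

/*
 * Userspace sketch of a per-slot claim/release cache, loosely modelled on
 * the cached_stacks[] handling in kernel/fork.c. Illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

#define NR_CACHED_STACKS 5		/* mirrors the kernel constant */

static _Atomic(void *) cached_stacks[NR_CACHED_STACKS];

/* Claim a cached buffer, or return NULL if every slot is empty. */
static void *cache_get(void)
{
	for (size_t i = 0; i < NR_CACHED_STACKS; i++) {
		void *buf = atomic_exchange(&cached_stacks[i], NULL);

		if (buf)
			return buf;
	}
	return NULL;
}

/* Return a buffer to the first empty slot; false if the cache is full. */
static bool cache_put(void *buf)
{
	for (size_t i = 0; i < NR_CACHED_STACKS; i++) {
		void *expected = NULL;

		if (atomic_compare_exchange_strong(&cached_stacks[i],
						   &expected, buf))
			return true;
	}
	return false;
}

int main(void)
{
	void *stack = cache_get();

	if (!stack)
		stack = malloc(16384);	/* cache miss: allocate for real */
	if (!stack)
		return 1;

	/* ... use the buffer ... */

	if (!cache_put(stack))
		free(stack);		/* cache full: release for real */
	return 0;
}

Claiming with an unconditional exchange keeps the hit path to one atomic per slot, while releasing with a compare-exchange against NULL avoids clobbering a slot another context just refilled.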
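
In the hung_task.c changes, kernel.hung_task_panic stops being a boolean: its upper bound moves from SYSCTL_ONE to SYSCTL_INT_MAX, and check_hung_task() only arms the panic once the number of hung tasks found in the current scan reaches that value. A hedged sketch of how an administrator might set a threshold of three (assumes root and the usual /proc/sys mount):

/* Set kernel.hung_task_panic to a count rather than 0/1; sketch only. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/hung_task_panic", "w");

	if (!f) {
		perror("hung_task_panic");
		return 1;
	}
	/* Panic only once a single hung-task scan finds >= 3 hung tasks. */
	fprintf(f, "3\n");
	fclose(f);
	return 0;
}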
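
init_kexec_sysctl() in the kexec_core.c hunk (despite the name, it registers sysfs entries) replaces the attributes that used to live in kernel/ksysfs.c with a dedicated kexec kobject under /sys/kernel and creates compatibility symlinks for the old flat names via kexec_links[]. A hedged sketch that reads the loaded flag from the new location and falls back to the legacy symlink name:

/*
 * Read /sys/kernel/kexec/loaded, falling back to the legacy
 * /sys/kernel/kexec_loaded path created as a compatibility symlink.
 * Paths follow the attribute and symlink names in the hunk above.
 */
#include <stdio.h>

static int read_flag(const char *path, int *val)
{
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%d", val) != 1)
		*val = -1;
	fclose(f);
	return 0;
}

int main(void)
{
	int loaded = -1;

	if (read_flag("/sys/kernel/kexec/loaded", &loaded) &&
	    read_flag("/sys/kernel/kexec_loaded", &loaded))
		return 1;

	printf("kexec image loaded: %d\n", loaded);
	return 0;
}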
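
crash_cma_ranges_show() emits one start-end pair per line in "%08llx-%08llx" format. A sketch that parses the new /sys/kernel/kexec/crash_cma_ranges file (legacy symlink: kexec_crash_cma_ranges); treating the end address as inclusive follows the usual kernel range convention and is an assumption, not something spelled out in the hunk:

/* Parse the hex "start-end" lines produced by crash_cma_ranges_show(). */
#include <stdio.h>

int main(void)
{
	unsigned long long start, end;
	FILE *f = fopen("/sys/kernel/kexec/crash_cma_ranges", "r");

	if (!f) {
		perror("crash_cma_ranges");
		return 1;
	}
	while (fscanf(f, "%llx-%llx", &start, &end) == 2)
		printf("crash CMA range: %#llx-%#llx (%llu bytes)\n",
		       start, end, end - start + 1);	/* end assumed inclusive */
	fclose(f);
	return 0;
}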
