aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2026-04-03 08:12:58 -0700
committerAlexei Starovoitov <ast@kernel.org>2026-04-03 08:14:13 -0700
commit891a05ccba927050cee17eb90c74692fe083ddaf (patch)
tree0ca1e1ee18f52cfe550898a7b78ae3839d73e8bd /kernel
parent6f6c794d0ff05dab1fa4677f39043de8a6a80da3 (diff)
parentd8a9a4b11a137909e306e50346148fc5c3b63f9d (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf 7.0-rc6+
Cross-merge BPF and other fixes after downstream PR. Minor conflict in kernel/bpf/verifier.c Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/syscall.c25
-rw-r--r--kernel/bpf/verifier.c37
-rw-r--r--kernel/cgroup/cgroup.c88
-rw-r--r--kernel/cgroup/cpuset.c29
-rw-r--r--kernel/dma/debug.c9
-rw-r--r--kernel/dma/direct.h7
-rw-r--r--kernel/dma/mapping.c6
-rw-r--r--kernel/dma/swiotlb.c21
-rw-r--r--kernel/futex/core.c2
-rw-r--r--kernel/futex/pi.c3
-rw-r--r--kernel/futex/syscalls.c8
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/snapshot.c11
-rw-r--r--kernel/rcu/rcu.h9
-rw-r--r--kernel/rcu/srcutiny.c19
-rw-r--r--kernel/rcu/srcutree.c211
-rw-r--r--kernel/sched/ext.c95
-rw-r--r--kernel/sched/ext_idle.c2
-rw-r--r--kernel/sched/sched.h3
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/trace/bpf_trace.c4
-rw-r--r--kernel/trace/trace_events_trigger.c79
-rw-r--r--kernel/trace/trace_osnoise.c10
-rw-r--r--kernel/workqueue.c25
25 files changed, 519 insertions, 190 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e1505c9cd09e..810991709da7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3261,6 +3261,18 @@ static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
bpf_link_dealloc(link);
}
+static bool bpf_link_is_tracepoint(struct bpf_link *link)
+{
+ /*
+ * Only these combinations support a tracepoint bpf_link.
+ * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use
+ * bpf_raw_tp_link_lops and thus dealloc_deferred(), see
+ * bpf_raw_tp_link_attach().
+ */
+ return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT ||
+ (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP);
+}
+
static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
{
if (rcu_trace_implies_rcu_gp())
@@ -3279,16 +3291,25 @@ static void bpf_link_free(struct bpf_link *link)
if (link->prog)
ops->release(link);
if (ops->dealloc_deferred) {
- /* Schedule BPF link deallocation, which will only then
+ /*
+ * Schedule BPF link deallocation, which will only then
* trigger putting BPF program refcount.
* If underlying BPF program is sleepable or BPF link's target
* attach hookpoint is sleepable or otherwise requires RCU GPs
* to ensure link and its underlying BPF program is not
* reachable anymore, we need to first wait for RCU tasks
- * trace sync, and then go through "classic" RCU grace period
+ * trace sync, and then go through "classic" RCU grace period.
+ *
+ * For tracepoint BPF links, we need to go through SRCU grace
+ * period wait instead when non-faultable tracepoint is used. We
+ * don't need to chain SRCU grace period waits, however, for the
+ * faultable case, since it exclusively uses RCU Tasks Trace.
*/
if (link->sleepable || (link->prog && link->prog->sleepable))
call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ /* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */
+ else if (bpf_link_is_tracepoint(link))
+ call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
else
call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
} else if (ops->dealloc) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a431b7d50e1b..5434b162c930 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -642,6 +642,13 @@ static bool is_atomic_load_insn(const struct bpf_insn *insn)
insn->imm == BPF_LOAD_ACQ;
}
+static bool is_atomic_fetch_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm & BPF_FETCH);
+}
+
static int __get_spi(s32 off)
{
return (-off - 1) / BPF_REG_SIZE;
@@ -4499,10 +4506,24 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* dreg still needs precision before this insn
*/
}
- } else if (class == BPF_LDX || is_atomic_load_insn(insn)) {
- if (!bt_is_reg_set(bt, dreg))
+ } else if (class == BPF_LDX ||
+ is_atomic_load_insn(insn) ||
+ is_atomic_fetch_insn(insn)) {
+ u32 load_reg = dreg;
+
+ /*
+ * Atomic fetch operation writes the old value into
+ * a register (sreg or r0) and if it was tracked for
+ * precision, propagate to the stack slot like we do
+ * in regular ldx.
+ */
+ if (is_atomic_fetch_insn(insn))
+ load_reg = insn->imm == BPF_CMPXCHG ?
+ BPF_REG_0 : sreg;
+
+ if (!bt_is_reg_set(bt, load_reg))
return 0;
- bt_clear_reg(bt, dreg);
+ bt_clear_reg(bt, load_reg);
/* scalars can only be spilled into stack w/o losing precision.
* Load from any other memory can be zero extended.
@@ -7996,7 +8017,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (reg->type == CONST_PTR_TO_MAP) {
err = check_ptr_to_map_access(env, regs, regno, off, size, t,
value_regno);
- } else if (base_type(reg->type) == PTR_TO_BUF) {
+ } else if (base_type(reg->type) == PTR_TO_BUF &&
+ !type_may_be_null(reg->type)) {
bool rdonly_mem = type_is_rdonly_mem(reg->type);
u32 *max_access;
@@ -20094,8 +20116,13 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
* since someone could have accessed through (ptr - k), or
* even done ptr -= k in a register, to get a safe access.
*/
- if (rold->range > rcur->range)
+ if (rold->range < 0 || rcur->range < 0) {
+ /* special case for [BEYOND|AT]_PKT_END */
+ if (rold->range != rcur->range)
+ return false;
+ } else if (rold->range > rcur->range) {
return false;
+ }
/* id relations must be preserved */
if (!check_ids(rold->id, rcur->id, idmap))
return false;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 01fc2a93f3ef..4ca3cb993da2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
#endif
init_waitqueue_head(&cgrp->offline_waitq);
+ init_waitqueue_head(&cgrp->dying_populated_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -6224,6 +6225,78 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return 0;
};
+/**
+ * cgroup_drain_dying - wait for dying tasks to leave before rmdir
+ * @cgrp: the cgroup being removed
+ *
+ * cgroup.procs and cgroup.threads use css_task_iter which filters out
+ * PF_EXITING tasks so that userspace doesn't see tasks that have already been
+ * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
+ * cgroup has non-empty css_sets - is only updated when dying tasks pass through
+ * cgroup_task_dead() in finish_task_switch(). This creates a window where
+ * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
+ * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
+ * tasks.
+ *
+ * This function aligns cgroup_has_tasks() with what userspace can observe. If
+ * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
+ * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
+ * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
+ *
+ * This function only concerns itself with this cgroup's own dying tasks.
+ * Whether the cgroup has children is cgroup_destroy_locked()'s problem.
+ *
+ * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
+ * retry the full check from scratch.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+static int cgroup_drain_dying(struct cgroup *cgrp)
+ __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+ DEFINE_WAIT(wait);
+
+ lockdep_assert_held(&cgroup_mutex);
+retry:
+ if (!cgroup_has_tasks(cgrp))
+ return 0;
+
+ /* Same iterator as cgroup.threads - if any task is visible, it's busy */
+ css_task_iter_start(&cgrp->self, 0, &it);
+ task = css_task_iter_next(&it);
+ css_task_iter_end(&it);
+
+ if (task)
+ return -EBUSY;
+
+ /*
+ * All remaining tasks are PF_EXITING and will pass through
+ * cgroup_task_dead() shortly. Wait for a kick and retry.
+ *
+ * cgroup_has_tasks() can't transition from false to true while we're
+ * holding cgroup_mutex, but the true to false transition happens
+ * under css_set_lock (via cgroup_task_dead()). We must retest and
+ * prepare_to_wait() under css_set_lock. Otherwise, the transition
+ * can happen between our first test and prepare_to_wait(), and we
+ * sleep with no one to wake us.
+ */
+ spin_lock_irq(&css_set_lock);
+ if (!cgroup_has_tasks(cgrp)) {
+ spin_unlock_irq(&css_set_lock);
+ return 0;
+ }
+ prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&css_set_lock);
+ mutex_unlock(&cgroup_mutex);
+ schedule();
+ finish_wait(&cgrp->dying_populated_waitq, &wait);
+ mutex_lock(&cgroup_mutex);
+ goto retry;
+}
+
int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
@@ -6233,9 +6306,12 @@ int cgroup_rmdir(struct kernfs_node *kn)
if (!cgrp)
return 0;
- ret = cgroup_destroy_locked(cgrp);
- if (!ret)
- TRACE_CGROUP_PATH(rmdir, cgrp);
+ ret = cgroup_drain_dying(cgrp);
+ if (!ret) {
+ ret = cgroup_destroy_locked(cgrp);
+ if (!ret)
+ TRACE_CGROUP_PATH(rmdir, cgrp);
+ }
cgroup_kn_unlock(kn);
return ret;
@@ -6995,6 +7071,7 @@ void cgroup_task_exit(struct task_struct *tsk)
static void do_cgroup_task_dead(struct task_struct *tsk)
{
+ struct cgrp_cset_link *link;
struct css_set *cset;
unsigned long flags;
@@ -7008,6 +7085,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
+ /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+ if (waitqueue_active(&link->cgrp->dying_populated_waitq))
+ wake_up(&link->cgrp->dying_populated_waitq);
+
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index d21868455341..1335e437098e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2988,7 +2988,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs, *oldcs;
struct task_struct *task;
- bool cpus_updated, mems_updated;
+ bool setsched_check;
int ret;
/* used later by cpuset_attach() */
@@ -3003,20 +3003,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
if (ret)
goto out_unlock;
- cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
- mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+ /*
+ * Skip rights over task setsched check in v2 when nothing changes,
+ * migration permission derives from hierarchy ownership in
+ * cgroup_procs_write_permission()).
+ */
+ setsched_check = !cpuset_v2() ||
+ !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) ||
+ !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+ * A v1 cpuset with tasks will have no CPU left only when CPU hotplug
+ * brings the last online CPU offline as users are not allowed to empty
+ * cpuset.cpus when there are active tasks inside. When that happens,
+ * we should allow tasks to migrate out without security check to make
+ * sure they will be able to run after migration.
+ */
+ if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus))
+ setsched_check = false;
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task);
if (ret)
goto out_unlock;
- /*
- * Skip rights over task check in v2 when nothing changes,
- * migration permission derives from hierarchy ownership in
- * cgroup_procs_write_permission()).
- */
- if (!cpuset_v2() || (cpus_updated || mems_updated)) {
+ if (setsched_check) {
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 86f87e43438c..0677918f06a8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -453,7 +453,7 @@ static int active_cacheline_set_overlap(phys_addr_t cln, int overlap)
return overlap;
}
-static void active_cacheline_inc_overlap(phys_addr_t cln)
+static void active_cacheline_inc_overlap(phys_addr_t cln, bool is_cache_clean)
{
int overlap = active_cacheline_read_overlap(cln);
@@ -462,7 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
/* If we overflowed the overlap counter then we're potentially
* leaking dma-mappings.
*/
- WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
+ WARN_ONCE(!is_cache_clean && overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
}
@@ -495,7 +495,7 @@ static int active_cacheline_insert(struct dma_debug_entry *entry,
if (rc == -EEXIST) {
struct dma_debug_entry *existing;
- active_cacheline_inc_overlap(cln);
+ active_cacheline_inc_overlap(cln, entry->is_cache_clean);
existing = radix_tree_lookup(&dma_active_cacheline, cln);
/* A lookup failure here after we got -EEXIST is unexpected. */
WARN_ON(!existing);
@@ -601,7 +601,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
unsigned long flags;
int rc;
- entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN);
+ entry->is_cache_clean = attrs & (DMA_ATTR_DEBUGGING_IGNORE_CACHELINES |
+ DMA_ATTR_REQUIRE_COHERENT);
bucket = get_hash_bucket(entry, &flags);
hash_bucket_add(bucket, entry);
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index e89f175e9c2d..6184ff303f08 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -84,7 +84,7 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
dma_addr_t dma_addr;
if (is_swiotlb_force_bounce(dev)) {
- if (attrs & DMA_ATTR_MMIO)
+ if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs);
@@ -98,7 +98,8 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
dma_addr = phys_to_dma(dev, phys);
if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
dma_kmalloc_needs_bounce(dev, size, dir)) {
- if (is_swiotlb_active(dev))
+ if (is_swiotlb_active(dev) &&
+ !(attrs & DMA_ATTR_REQUIRE_COHERENT))
return swiotlb_map(dev, phys, size, dir, attrs);
goto err_overflow;
@@ -123,7 +124,7 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
{
phys_addr_t phys;
- if (attrs & DMA_ATTR_MMIO)
+ if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
/* nothing to do: uncached and no swiotlb */
return;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 3928a509c44c..6d3dd0bd3a88 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -164,6 +164,9 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
+ if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+ return DMA_MAPPING_ERROR;
+
if (dma_map_direct(dev, ops) ||
(!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
@@ -235,6 +238,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
+ if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+ return -EOPNOTSUPP;
+
if (WARN_ON_ONCE(!dev->dma_mask))
return 0;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d8e6f1d889d5..9fd73700ddcf 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <linux/kmsan-checks.h>
#include <linux/iommu-helper.h>
#include <linux/init.h>
#include <linux/memblock.h>
@@ -901,10 +902,19 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
local_irq_save(flags);
page = pfn_to_page(pfn);
- if (dir == DMA_TO_DEVICE)
+ if (dir == DMA_TO_DEVICE) {
+ /*
+ * Ideally, kmsan_check_highmem_page()
+ * could be used here to detect infoleaks,
+ * but callers may map uninitialized buffers
+ * that will be written by the device,
+ * causing false positives.
+ */
memcpy_from_page(vaddr, page, offset, sz);
- else
+ } else {
+ kmsan_unpoison_memory(vaddr, sz);
memcpy_to_page(page, offset, vaddr, sz);
+ }
local_irq_restore(flags);
size -= sz;
@@ -913,8 +923,15 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
offset = 0;
}
} else if (dir == DMA_TO_DEVICE) {
+ /*
+ * Ideally, kmsan_check_memory() could be used here to detect
+ * infoleaks (uninitialized data being sent to device), but
+ * callers may map uninitialized buffers that will be written
+ * by the device, causing false positives.
+ */
memcpy(vaddr, phys_to_virt(orig_addr), size);
} else {
+ kmsan_unpoison_memory(vaddr, size);
memcpy(phys_to_virt(orig_addr), vaddr, size);
}
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index cf7e610eac42..31e83a09789e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -342,7 +342,7 @@ static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
if (!vma)
return FUTEX_NO_NODE;
- mpol = vma_policy(vma);
+ mpol = READ_ONCE(vma->vm_policy);
if (!mpol)
return FUTEX_NO_NODE;
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index bc1f7e83a37e..7808068fa59e 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -918,7 +918,7 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to;
- struct task_struct *exiting = NULL;
+ struct task_struct *exiting;
struct rt_mutex_waiter rt_waiter;
struct futex_q q = futex_q_init;
DEFINE_WAKE_Q(wake_q);
@@ -933,6 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
to = futex_setup_timer(time, &timeout, flags, 0);
retry:
+ exiting = NULL;
ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 743c7a728237..77ad9691f6a6 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -459,6 +459,14 @@ SYSCALL_DEFINE4(futex_requeue,
if (ret)
return ret;
+ /*
+ * For now mandate both flags are identical, like the sys_futex()
+ * interface has. If/when we merge the variable sized futex support,
+ * that patch can modify this test to allow a difference in size.
+ */
+ if (futexes[0].w.flags != futexes[1].w.flags)
+ return -EINVAL;
+
cmpval = futexes[0].w.val;
return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 5f8c9e12eaec..5429e9f19b65 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -40,7 +40,7 @@ void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (WARN_ON(!saved_gfp_count) || --saved_gfp_count)
+ if (!saved_gfp_count || --saved_gfp_count)
return;
gfp_allowed_mask = saved_gfp_mask;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 6e1321837c66..a564650734dc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2855,6 +2855,17 @@ int snapshot_write_finalize(struct snapshot_handle *handle)
{
int error;
+ /*
+ * Call snapshot_write_next() to drain any trailing zero pages,
+ * but make sure we're in the data page region first.
+ * This function can return PAGE_SIZE if the kernel was expecting
+ * another copy page. Return -ENODATA in that situation.
+ */
+ if (handle->cur > nr_meta_pages + 1) {
+ error = snapshot_write_next(handle);
+ if (error)
+ return error > 0 ? -ENODATA : error;
+ }
copy_last_highmem_page();
error = hibernate_restore_protect_page(handle->buffer);
/* Do that only if we have loaded the image entirely */
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index dc5d614b372c..9b10b57b79ad 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -502,6 +502,15 @@ do { \
___locked; \
})
+#define raw_spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = raw_spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 3450c3751ef7..a2e2d516e51b 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -9,6 +9,7 @@
*/
#include <linux/export.h>
+#include <linux/irq_work.h>
#include <linux/mutex.h>
#include <linux/preempt.h>
#include <linux/rcupdate_wait.h>
@@ -41,6 +42,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
ssp->srcu_idx_max = 0;
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&ssp->srcu_work.entry);
+ init_irq_work(&ssp->srcu_irq_work, srcu_tiny_irq_work);
return 0;
}
@@ -84,6 +86,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
+ irq_work_sync(&ssp->srcu_irq_work);
flush_work(&ssp->srcu_work);
WARN_ON(ssp->srcu_gp_running);
WARN_ON(ssp->srcu_gp_waiting);
@@ -177,6 +180,20 @@ void srcu_drive_gp(struct work_struct *wp)
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
+/*
+ * Use an irq_work to defer schedule_work() to avoid acquiring the workqueue
+ * pool->lock while the caller might hold scheduler locks, causing lockdep
+ * splats due to workqueue_init() doing a wakeup.
+ */
+void srcu_tiny_irq_work(struct irq_work *irq_work)
+{
+ struct srcu_struct *ssp;
+
+ ssp = container_of(irq_work, struct srcu_struct, srcu_irq_work);
+ schedule_work(&ssp->srcu_work);
+}
+EXPORT_SYMBOL_GPL(srcu_tiny_irq_work);
+
static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
unsigned long cookie;
@@ -189,7 +206,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
if (likely(srcu_init_done))
- schedule_work(&ssp->srcu_work);
+ irq_work_queue(&ssp->srcu_irq_work);
else if (list_empty(&ssp->srcu_work.entry))
list_add(&ssp->srcu_work.entry, &srcu_boot_list);
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index aef8e91ad33e..0d01cd8c4b4a 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
+#include <linux/irq_work.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched.h>
#include <linux/smp.h>
@@ -75,44 +76,9 @@ static bool __read_mostly srcu_init_done;
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
static void process_srcu(struct work_struct *work);
+static void srcu_irq_work(struct irq_work *work);
static void srcu_delay_timer(struct timer_list *t);
-/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p) \
-do { \
- spin_lock(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
-
-#define spin_lock_irq_rcu_node(p) \
-do { \
- spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_unlock_irq_rcu_node(p) \
- spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-
-#define spin_lock_irqsave_rcu_node(p, flags) \
-do { \
- spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- smp_mb__after_unlock_lock(); \
-} while (0)
-
-#define spin_trylock_irqsave_rcu_node(p, flags) \
-({ \
- bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- \
- if (___locked) \
- smp_mb__after_unlock_lock(); \
- ___locked; \
-})
-
-#define spin_unlock_irqrestore_rcu_node(p, flags) \
- spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
-
/*
* Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
@@ -131,7 +97,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
*/
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
@@ -186,7 +152,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Each pass through this loop initializes one srcu_node structure. */
srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_init(&ACCESS_PRIVATE(snp, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -242,7 +208,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
if (!ssp->srcu_sup)
return -ENOMEM;
if (!is_static)
- spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ raw_spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
ssp->srcu_sup->node = NULL;
mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
@@ -252,6 +218,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ init_irq_work(&ssp->srcu_sup->irq_work, srcu_irq_work);
ssp->srcu_sup->sda_is_static = is_static;
if (!is_static) {
ssp->sda = alloc_percpu(struct srcu_data);
@@ -263,9 +230,12 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
- if (!init_srcu_struct_nodes(ssp, is_static ? GFP_ATOMIC : GFP_KERNEL))
+ if (!preemptible())
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
+ else if (init_srcu_struct_nodes(ssp, GFP_KERNEL))
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
+ else
goto err_free_sda;
- WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
ssp->srcu_sup->srcu_ssp = ssp;
smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed,
@@ -394,20 +364,20 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
/* Double-checked locking on ->srcu_size-state. */
if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
return;
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
__srcu_transition_to_big(ssp);
- spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
* Check to see if the just-encountered contention event justifies
* a transition to SRCU_SIZE_BIG.
*/
-static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+static void raw_spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
{
unsigned long j;
@@ -429,16 +399,16 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
* to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
* parameter permits this.
*/
-static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+static void raw_spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
{
struct srcu_struct *ssp = sdp->ssp;
- if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ if (raw_spin_trylock_irqsave_rcu_node(sdp, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
- spin_lock_irqsave_check_contention(ssp);
- spin_unlock_irqrestore_rcu