aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@kernel.org>2026-04-04 20:59:34 +0200
committerThomas Gleixner <tglx@kernel.org>2026-04-04 20:59:34 +0200
commitbad28e01f206c533a8ff7fec5efe8ab9607a8364 (patch)
tree3d6ed488c49711f13984e4ce1fb08dd186330ff8 /kernel
parentfd7400cfcbaaa1f3d1b904711d9daf029e996364 (diff)
parent7aaa8047eafd0bd628065b15757d9b48c5f9c07d (diff)
Merge tag 'v7.0-rc6' into irq/core
to be able to merge the hyper-v patch related to randomness.
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/btf.c24
-rw-r--r--kernel/bpf/core.c43
-rw-r--r--kernel/bpf/verifier.c33
-rw-r--r--kernel/cgroup/cgroup.c6
-rw-r--r--kernel/cgroup/cpuset.c59
-rw-r--r--kernel/crash_dump_dm_crypt.c4
-rw-r--r--kernel/dma/debug.c9
-rw-r--r--kernel/dma/direct.h7
-rw-r--r--kernel/dma/mapping.c6
-rw-r--r--kernel/dma/swiotlb.c21
-rw-r--r--kernel/events/core.c19
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/futex/core.c2
-rw-r--r--kernel/futex/pi.c3
-rw-r--r--kernel/futex/syscalls.c8
-rw-r--r--kernel/kprobes.c8
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/snapshot.c11
-rw-r--r--kernel/rcu/rcu.h9
-rw-r--r--kernel/rcu/srcutiny.c19
-rw-r--r--kernel/rcu/srcutree.c211
-rw-r--r--kernel/sched/core.c79
-rw-r--r--kernel/sched/ext.c22
-rw-r--r--kernel/sched/ext_internal.h114
-rw-r--r--kernel/sched/idle.c39
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c36
-rw-r--r--kernel/trace/trace_events_trigger.c79
-rw-r--r--kernel/trace/trace_osnoise.c10
-rw-r--r--kernel/workqueue.c55
-rw-r--r--kernel/workqueue_internal.h1
35 files changed, 620 insertions, 334 deletions
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4872d2a6c42d..71f9143fe90f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1787,7 +1787,16 @@ static void btf_free_id(struct btf *btf)
* of the _bh() version.
*/
spin_lock_irqsave(&btf_idr_lock, flags);
- idr_remove(&btf_idr, btf->id);
+ if (btf->id) {
+ idr_remove(&btf_idr, btf->id);
+ /*
+ * Clear the id here to make this function idempotent, since it will get
+ * called a couple of times for module BTFs: on module unload, and then
+ * the final btf_put(). btf_alloc_id() starts IDs with 1, so we can use
+ * 0 as sentinel value.
+ */
+ WRITE_ONCE(btf->id, 0);
+ }
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
@@ -8115,7 +8124,7 @@ static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct btf *btf = filp->private_data;
- seq_printf(m, "btf_id:\t%u\n", btf->id);
+ seq_printf(m, "btf_id:\t%u\n", READ_ONCE(btf->id));
}
#endif
@@ -8197,7 +8206,7 @@ int btf_get_info_by_fd(const struct btf *btf,
if (copy_from_user(&info, uinfo, info_copy))
return -EFAULT;
- info.id = btf->id;
+ info.id = READ_ONCE(btf->id);
ubtf = u64_to_user_ptr(info.btf);
btf_copy = min_t(u32, btf->data_size, info.btf_size);
if (copy_to_user(ubtf, btf->data, btf_copy))
@@ -8260,7 +8269,7 @@ int btf_get_fd_by_id(u32 id)
u32 btf_obj_id(const struct btf *btf)
{
- return btf->id;
+ return READ_ONCE(btf->id);
}
bool btf_is_kernel(const struct btf *btf)
@@ -8382,6 +8391,13 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
if (btf_mod->module != module)
continue;
+ /*
+ * For modules, we do the freeing of BTF IDR as soon as
+ * module goes away to disable BTF discovery, since the
+ * btf_try_get_module() on such BTFs will fail. This may
+ * be called again on btf_put(), but it's ok to do so.
+ */
+ btf_free_id(btf_mod->btf);
list_del(&btf_mod->list);
if (btf_mod->sysfs_attr)
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3ece2da55625..7b675a451ec8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1422,6 +1422,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
break;
+
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+ case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^
+ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ /*
+ * Cannot use BPF_STX_MEM() macro here as it
+ * hardcodes BPF_MEM mode, losing PROBE_MEM32
+ * and breaking arena addressing in the JIT.
+ */
+ *to++ = (struct bpf_insn) {
+ .code = BPF_STX | BPF_PROBE_MEM32 |
+ BPF_SIZE(from->code),
+ .dst_reg = from->dst_reg,
+ .src_reg = BPF_REG_AX,
+ .off = from->off,
+ };
+ break;
}
out:
return to - to_buff;
@@ -1736,6 +1757,12 @@ bool bpf_opcode_in_insntable(u8 code)
}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+/* Absolute value of s32 without undefined behavior for S32_MIN */
+static u32 abs_s32(s32 x)
+{
+ return x >= 0 ? (u32)x : -(u32)x;
+}
+
/**
* ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
@@ -1900,8 +1927,8 @@ select_insn:
DST = do_div(AX, (u32) SRC);
break;
case 1:
- AX = abs((s32)DST);
- AX = do_div(AX, abs((s32)SRC));
+ AX = abs_s32((s32)DST);
+ AX = do_div(AX, abs_s32((s32)SRC));
if ((s32)DST < 0)
DST = (u32)-AX;
else
@@ -1928,8 +1955,8 @@ select_insn:
DST = do_div(AX, (u32) IMM);
break;
case 1:
- AX = abs((s32)DST);
- AX = do_div(AX, abs((s32)IMM));
+ AX = abs_s32((s32)DST);
+ AX = do_div(AX, abs_s32((s32)IMM));
if ((s32)DST < 0)
DST = (u32)-AX;
else
@@ -1955,8 +1982,8 @@ select_insn:
DST = (u32) AX;
break;
case 1:
- AX = abs((s32)DST);
- do_div(AX, abs((s32)SRC));
+ AX = abs_s32((s32)DST);
+ do_div(AX, abs_s32((s32)SRC));
if (((s32)DST < 0) == ((s32)SRC < 0))
DST = (u32)AX;
else
@@ -1982,8 +2009,8 @@ select_insn:
DST = (u32) AX;
break;
case 1:
- AX = abs((s32)DST);
- do_div(AX, abs((s32)IMM));
+ AX = abs_s32((s32)DST);
+ do_div(AX, abs_s32((s32)IMM));
if (((s32)DST < 0) == ((s32)IMM < 0))
DST = (u32)AX;
else
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 159b25f8269d..f108c01ff6d0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15910,6 +15910,13 @@ static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *ins
/* Apply bswap if alu64 or switch between big-endian and little-endian machines */
bool need_bswap = alu64 || (to_le == is_big_endian);
+ /*
+ * If the register is mutated, manually reset its scalar ID to break
+ * any existing ties and avoid incorrect bounds propagation.
+ */
+ if (need_bswap || insn->imm == 16 || insn->imm == 32)
+ dst_reg->id = 0;
+
if (need_bswap) {
if (insn->imm == 16)
dst_reg->var_off = tnum_bswap16(dst_reg->var_off);
@@ -15992,7 +15999,7 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins
else
return 0;
- branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ branch = push_stack(env, env->insn_idx, env->insn_idx, false);
if (IS_ERR(branch))
return PTR_ERR(branch);
@@ -17408,6 +17415,12 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
continue;
if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
continue;
+ /*
+ * Skip mixed 32/64-bit links: the delta relationship doesn't
+ * hold across different ALU widths.
+ */
+ if (((reg->id ^ known_reg->id) & BPF_ADD_CONST) == BPF_ADD_CONST)
+ continue;
if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
reg->off == known_reg->off) {
s32 saved_subreg_def = reg->subreg_def;
@@ -17435,7 +17448,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s
scalar32_min_max_add(reg, &fake_reg);
scalar_min_max_add(reg, &fake_reg);
reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
- if (known_reg->id & BPF_ADD_CONST32)
+ if ((reg->id | known_reg->id) & BPF_ADD_CONST32)
zext_32_to_64(reg);
reg_bounds_sync(reg);
}
@@ -19863,11 +19876,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
* Also verify that new value satisfies old value range knowledge.
*/
- /* ADD_CONST mismatch: different linking semantics */
- if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST))
- return false;
-
- if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST))
+ /*
+ * ADD_CONST flags must match exactly: BPF_ADD_CONST32 and
+ * BPF_ADD_CONST64 have different linking semantics in
+ * sync_linked_regs() (alu32 zero-extends, alu64 does not),
+ * so pruning across different flag types is unsafe.
+ */
+ if (rold->id &&
+ (rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
return false;
/* Both have offset linkage: offsets must match */
@@ -20904,7 +20920,8 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env,
* state when it exits.
*/
int err = check_resource_leak(env, exception_exit,
- !env->cur_state->curframe,
+ exception_exit || !env->cur_state->curframe,
+ exception_exit ? "bpf_throw" :
"BPF_EXIT instruction in main prog");
if (err)
return err;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index be1d71dda317..01fc2a93f3ef 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5109,6 +5109,12 @@ repeat:
return;
task = list_entry(it->task_pos, struct task_struct, cg_list);
+ /*
+ * Hide tasks that are exiting but not yet removed. Keep zombie
+ * leaders with live threads visible.
+ */
+ if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live))
+ goto repeat;
if (it->flags & CSS_TASK_ITER_PROCS) {
/* if PROCS, skip over tasks which aren't group leaders */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e200de7c60b6..d21868455341 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -879,7 +879,7 @@ generate_doms:
/*
* Cgroup v2 doesn't support domain attributes, just set all of them
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
- * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ * subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs.
*/
for (i = 0; i < ndoms; i++) {
/*
@@ -888,7 +888,7 @@ generate_doms:
*/
if (!csa || csa[i] == &top_cpuset)
cpumask_and(doms[i], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
else
cpumask_copy(doms[i], csa[i]->effective_cpus);
if (dattr)
@@ -1329,17 +1329,22 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
}
/*
- * update_hk_sched_domains - Update HK cpumasks & rebuild sched domains
+ * cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock
*
- * Update housekeeping cpumasks and rebuild sched domains if necessary.
- * This should be called at the end of cpuset or hotplug actions.
+ * Update housekeeping cpumasks and rebuild sched domains if necessary and
+ * then do a cpuset_full_unlock().
+ * This should be called at the end of cpuset operation.
*/
-static void update_hk_sched_domains(void)
+static void cpuset_update_sd_hk_unlock(void)
+ __releases(&cpuset_mutex)
+ __releases(&cpuset_top_mutex)
{
+ /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
+
if (update_housekeeping) {
- /* Updating HK cpumasks implies rebuild sched domains */
update_housekeeping = false;
- force_sd_rebuild = true;
cpumask_copy(isolated_hk_cpus, isolated_cpus);
/*
@@ -1350,22 +1355,19 @@ static void update_hk_sched_domains(void)
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus));
- cpus_read_lock();
- mutex_lock(&cpuset_mutex);
+ mutex_unlock(&cpuset_top_mutex);
+ } else {
+ cpuset_full_unlock();
}
- /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */
- if (force_sd_rebuild)
- rebuild_sched_domains_locked();
}
/*
- * Work function to invoke update_hk_sched_domains()
+ * Work function to invoke cpuset_update_sd_hk_unlock()
*/
static void hk_sd_workfn(struct work_struct *work)
{
cpuset_full_lock();
- update_hk_sched_domains();
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
}
/**
@@ -3230,8 +3232,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
- update_hk_sched_domains();
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
if (of_cft(of)->private == FILE_MEMLIST)
schedule_flush_migrate_mm();
return retval ?: nbytes;
@@ -3338,8 +3339,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
- update_hk_sched_domains();
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
return retval ?: nbytes;
}
@@ -3513,8 +3513,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css)
/* Reset valid partition back to member */
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
- update_hk_sched_domains();
- cpuset_full_unlock();
+ cpuset_update_sd_hk_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3923,11 +3922,13 @@ static void cpuset_handle_hotplug(void)
rcu_read_unlock();
}
-
/*
- * Queue a work to call housekeeping_update() & rebuild_sched_domains()
- * There will be a slight delay before the HK_TYPE_DOMAIN housekeeping
- * cpumask can correctly reflect what is in isolated_cpus.
+ * rebuild_sched_domains() will always be called directly if needed
+ * to make sure that newly added or removed CPU will be reflected in
+ * the sched domains. However, if isolated partition invalidation
+ * or recreation is being done (update_housekeeping set), a work item
+ * will be queued to call housekeeping_update() to update the
+ * corresponding housekeeping cpumasks after some slight delay.
*
* We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that
* is still pending. Before the pending bit is cleared, the work data
@@ -3936,8 +3937,10 @@ static void cpuset_handle_hotplug(void)
* previously queued work. Since hk_sd_workfn() doesn't use the work
* item at all, this is not a problem.
*/
- if (update_housekeeping || force_sd_rebuild)
- queue_work(system_unbound_wq, &hk_sd_work);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_cpuslocked();
+ if (update_housekeeping)
+ queue_work(system_dfl_wq, &hk_sd_work);
free_tmpmasks(ptmp);
}
diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c
index 1f4067fbdb94..a20d4097744a 100644
--- a/kernel/crash_dump_dm_crypt.c
+++ b/kernel/crash_dump_dm_crypt.c
@@ -168,8 +168,8 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
memcpy(dm_key->data, ukp->data, ukp->datalen);
dm_key->key_size = ukp->datalen;
- kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size,
- dm_key->key_desc, dm_key->data);
+ kexec_dprintk("Get dm crypt key (size=%u) %s\n", dm_key->key_size,
+ dm_key->key_desc);
out:
up_read(&key->sem);
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 86f87e43438c..0677918f06a8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -453,7 +453,7 @@ static int active_cacheline_set_overlap(phys_addr_t cln, int overlap)
return overlap;
}
-static void active_cacheline_inc_overlap(phys_addr_t cln)
+static void active_cacheline_inc_overlap(phys_addr_t cln, bool is_cache_clean)
{
int overlap = active_cacheline_read_overlap(cln);
@@ -462,7 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln)
/* If we overflowed the overlap counter then we're potentially
* leaking dma-mappings.
*/
- WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
+ WARN_ONCE(!is_cache_clean && overlap > ACTIVE_CACHELINE_MAX_OVERLAP,
pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"),
ACTIVE_CACHELINE_MAX_OVERLAP, &cln);
}
@@ -495,7 +495,7 @@ static int active_cacheline_insert(struct dma_debug_entry *entry,
if (rc == -EEXIST) {
struct dma_debug_entry *existing;
- active_cacheline_inc_overlap(cln);
+ active_cacheline_inc_overlap(cln, entry->is_cache_clean);
existing = radix_tree_lookup(&dma_active_cacheline, cln);
/* A lookup failure here after we got -EEXIST is unexpected. */
WARN_ON(!existing);
@@ -601,7 +601,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
unsigned long flags;
int rc;
- entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN);
+ entry->is_cache_clean = attrs & (DMA_ATTR_DEBUGGING_IGNORE_CACHELINES |
+ DMA_ATTR_REQUIRE_COHERENT);
bucket = get_hash_bucket(entry, &flags);
hash_bucket_add(bucket, entry);
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index e89f175e9c2d..6184ff303f08 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -84,7 +84,7 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
dma_addr_t dma_addr;
if (is_swiotlb_force_bounce(dev)) {
- if (attrs & DMA_ATTR_MMIO)
+ if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
return DMA_MAPPING_ERROR;
return swiotlb_map(dev, phys, size, dir, attrs);
@@ -98,7 +98,8 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
dma_addr = phys_to_dma(dev, phys);
if (unlikely(!dma_capable(dev, dma_addr, size, true)) ||
dma_kmalloc_needs_bounce(dev, size, dir)) {
- if (is_swiotlb_active(dev))
+ if (is_swiotlb_active(dev) &&
+ !(attrs & DMA_ATTR_REQUIRE_COHERENT))
return swiotlb_map(dev, phys, size, dir, attrs);
goto err_overflow;
@@ -123,7 +124,7 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
{
phys_addr_t phys;
- if (attrs & DMA_ATTR_MMIO)
+ if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))
/* nothing to do: uncached and no swiotlb */
return;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 3928a509c44c..6d3dd0bd3a88 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -164,6 +164,9 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
+ if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+ return DMA_MAPPING_ERROR;
+
if (dma_map_direct(dev, ops) ||
(!is_mmio && arch_dma_map_phys_direct(dev, phys + size)))
addr = dma_direct_map_phys(dev, phys, size, dir, attrs);
@@ -235,6 +238,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
+ if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT))
+ return -EOPNOTSUPP;
+
if (WARN_ON_ONCE(!dev->dma_mask))
return 0;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d8e6f1d889d5..9fd73700ddcf 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <linux/kmsan-checks.h>
#include <linux/iommu-helper.h>
#include <linux/init.h>
#include <linux/memblock.h>
@@ -901,10 +902,19 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
local_irq_save(flags);
page = pfn_to_page(pfn);
- if (dir == DMA_TO_DEVICE)
+ if (dir == DMA_TO_DEVICE) {
+ /*
+ * Ideally, kmsan_check_highmem_page()
+ * could be used here to detect infoleaks,
+ * but callers may map uninitialized buffers
+ * that will be written by the device,
+ * causing false positives.
+ */
memcpy_from_page(vaddr, page, offset, sz);
- else
+ } else {
+ kmsan_unpoison_memory(vaddr, sz);
memcpy_to_page(page, offset, vaddr, sz);
+ }
local_irq_restore(flags);
size -= sz;
@@ -913,8 +923,15 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
offset = 0;
}
} else if (dir == DMA_TO_DEVICE) {
+ /*
+ * Ideally, kmsan_check_memory() could be used here to detect
+ * infoleaks (uninitialized data being sent to device), but
+ * callers may map uninitialized buffers that will be written
+ * by the device, causing false positives.
+ */
memcpy(vaddr, phys_to_virt(orig_addr), size);
} else {
+ kmsan_unpoison_memory(vaddr, size);
memcpy(phys_to_virt(orig_addr), vaddr, size);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1f5699b339ec..89b40e439717 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4813,7 +4813,7 @@ static void __perf_event_read(void *info)
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
- struct pmu *pmu = event->pmu;
+ struct pmu *pmu;
/*
* If this is a task context, we need to check whether it is
@@ -4825,7 +4825,7 @@ static void __perf_event_read(void *info)
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- raw_spin_lock(&ctx->lock);
+ guard(raw_spinlock)(&ctx->lock);
ctx_time_update_event(ctx, event);
perf_event_update_time(event);
@@ -4833,25 +4833,22 @@ static void __perf_event_read(void *info)
perf_event_update_sibling_time(event);
if (event->state != PERF_EVENT_STATE_ACTIVE)
- goto unlock;
+ return;
if (!data->group) {
- pmu->read(event);
+ perf_pmu_read(event);
data->ret = 0;
- goto unlock;
+ return;
}
+ pmu = event->pmu_ctx->pmu;
pmu->start_txn(pmu, PERF_PMU_TXN_READ);
- pmu->read(event);
-
+ perf_pmu_read(event);
for_each_sibling_event(sub, event)
perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
-
-unlock:
- raw_spin_unlock(&ctx->lock);
}
static inline u64 perf_event_count(struct perf_event *event, bool self)
@@ -14744,7 +14741,7 @@ inherit_event(struct perf_event *parent_event,
get_ctx(child_ctx);
child_event->ctx = child_ctx;
- pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ pmu_ctx = find_get_pmu_context(parent_event->pmu_ctx->pmu, child_ctx, child_event);
if (IS_ERR(pmu_ctx)) {
free_event(child_event);
return ERR_CAST(pmu_ctx);
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..bc2bf58b93b6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1000,6 +1000,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid.cid = MM_CID_UNSET;
tsk->mm_cid.active = 0;
+ INIT_HLIST_NODE(&tsk->mm_cid.node);
#endif
return tsk;
@@ -1586,7 +1587,6 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
tsk->mm = mm;
tsk->active_mm = mm;
- sched_mm_cid_fork(tsk);
return 0;
}
@@ -2498,7 +2498,6 @@ bad_fork_cleanup_namespaces:
exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
- sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index cf7e610eac42..31e83a09789e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -342,7 +342,7 @@ static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
if (!vma)
return FUTEX_NO_NODE;
- mpol = vma_policy(vma);
+ mpol = READ_ONCE(vma->vm_policy);
if (!mpol)
return FUTEX_NO_NODE;
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index bc1f7e83a37e..7808068fa59e 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -918,7 +918,7 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to;
- struct task_struct *exiting = NULL;
+ struct task_struct *exiting;
struct rt_mutex_waiter rt_waiter;
struct futex_q q = futex_q_init;
DEFINE_WAKE_Q(wake_q);
@@ -933,6 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
to = futex_setup_timer(time, &timeout, flags, 0);
retry:
+ exiting = NULL;
ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
if (unlikely(ret != 0))
goto out;
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 743c7a728237..77ad9691f6a6 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -459,6 +459,14 @@ SYSCALL_DEFINE4(futex_requeue,
if (ret)
return ret;
+ /*
+ * For now mandate both flags are identical, like the sys_futex()
+ * interface has. If/when we merge the variable sized futex support,
+ * that patch can modify this test to allow a difference in size.
+ */
+ if (futexes[0].w.flags != futexes[1].w.flags)
+ return -EINVAL;
+
cmpval = futexes[0].w.val;
return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ab25b4aa9095..bfc89083daa9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1144,12 +1144,12 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
lockdep_assert_held(&kprobe_mutex);
ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
- if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
+ if (ret < 0)
return ret;
if (*cnt == 0) {
ret = register_ftrace_function(ops);
- if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) {
+ if (ret < 0) {
/*
* At this point, sinec ops is not registered, we should be sefe from
* registering empty filter.
@@ -1178,6 +1178,10 @@ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
int ret;
lockdep_assert_held(&kprobe_mutex);
+ if (unlikely(kprobe_ftrace_disabled)) {
+ /* Now ftrace is disabled forever, disarm is already done. */
+ return 0;
+ }
if (*cnt == 1) {
ret = unregister_ftrace_function(ops);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 5f8c9e12eaec..5429e9f19b65 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -40,7 +40,7 @@ void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
- if (WARN_ON(!saved_gfp_count) || --saved_gfp_count)
+ if (!saved_gfp_count || --saved_gf