aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2025-03-04 11:19:21 +0100
committerIngo Molnar <mingo@kernel.org>2025-03-04 11:19:21 +0100
commitcfdaa618defc5ebe1ee6aa5bd40a7ccedffca6de (patch)
treefba004535821850f0d10cc4deac3885545083f0c /kernel
parentad546940b5991d3e141238cd80a6d1894b767184 (diff)
parent4f2a0b765c9731d2fa94e209ee9ae0e96b280f17 (diff)
Merge branch 'x86/cpu' into x86/asm, to pick up dependent commits
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c134
-rw-r--r--kernel/bpf/arena.c2
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c2
-rw-r--r--kernel/bpf/btf.c2
-rw-r--r--kernel/bpf/ringbuf.c4
-rw-r--r--kernel/bpf/syscall.c43
-rw-r--r--kernel/bpf/verifier.c31
-rw-r--r--kernel/cgroup/dmem.c50
-rw-r--r--kernel/events/core.c31
-rw-r--r--kernel/events/uprobes.c17
-rw-r--r--kernel/iomem.c5
-rw-r--r--kernel/module/main.c81
-rw-r--r--kernel/module/strict_rwx.c9
-rw-r--r--kernel/rseq.c11
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/ext.c11
-rw-r--r--kernel/sched/sched.h25
-rw-r--r--kernel/trace/fprobe.c12
-rw-r--r--kernel/trace/ftrace.c63
-rw-r--r--kernel/trace/trace_events.c11
-rw-r--r--kernel/trace/trace_events_hist.c30
-rw-r--r--kernel/trace/trace_functions.c6
-rw-r--r--kernel/vhost_task.c4
-rw-r--r--kernel/workqueue.c4
24 files changed, 319 insertions, 271 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 31222e8cd534..6520baa13669 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -103,48 +103,50 @@ struct bsd_acct_struct {
atomic_long_t count;
struct rcu_head rcu;
struct mutex lock;
- int active;
+ bool active;
+ bool check_space;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
+ acct_t ac;
};
-static void do_acct_process(struct bsd_acct_struct *acct);
+static void fill_ac(struct bsd_acct_struct *acct);
+static void acct_write_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct)
+static bool check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_after_jiffies(acct->needcheck))
- goto out;
+ if (!acct->check_space)
+ return acct->active;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
- goto out;
+ return acct->active;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
- acct->active = 0;
+ acct->active = false;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
- acct->active = 1;
+ acct->active = true;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-out:
return acct->active;
}
@@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct = to_acct(pin);
mutex_lock(&acct->lock);
- do_acct_process(acct);
+ /*
+ * Fill the accounting struct with the exiting task's info
+ * before punting to the workqueue.
+ */
+ fill_ac(acct);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
cmpxchg(&acct->ns->bacct, pin, NULL);
@@ -202,6 +208,9 @@ static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
+
+ /* We were fired by acct_pin_kill() which holds acct->lock. */
+ acct_write_process(acct);
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
@@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname)
return -EACCES;
}
+ /* Exclude kernel kernel internal filesystems. */
+ if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return -EINVAL;
+ }
+
+ /* Exclude procfs and sysfs. */
+ if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return -EINVAL;
+ }
+
if (!(file->f_mode & FMODE_CAN_WRITE)) {
kfree(acct);
filp_close(file, NULL);
@@ -430,13 +453,27 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-static void fill_ac(acct_t *ac)
+static void fill_ac(struct bsd_acct_struct *acct)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ struct file *file = acct->file;
+ acct_t *ac = &acct->ac;
u64 elapsed, run_time;
time64_t btime;
struct tty_struct *tty;
+ lockdep_assert_held(&acct->lock);
+
+ if (time_is_after_jiffies(acct->needcheck)) {
+ acct->check_space = false;
+
+ /* Don't fill in @ac if nothing will be written. */
+ if (!acct->active)
+ return;
+ } else {
+ acct->check_space = true;
+ }
+
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
@@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac)
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
-}
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct)
-{
- acct_t ac;
- unsigned long flim;
- const struct cred *orig_cred;
- struct file *file = acct->file;
- /*
- * Accounting records are not subject to resource limits.
- */
- flim = rlimit(RLIMIT_FSIZE);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct))
- goto out;
-
- fill_ac(&ac);
/* we really need to bite the bullet and change layout */
- ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
- ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
+ ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
+ ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
- ac.ac_uid16 = ac.ac_uid;
- ac.ac_gid16 = ac.ac_gid;
+ ac->ac_uid16 = ac->ac_uid;
+ ac->ac_gid16 = ac->ac_gid;
#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
- ac.ac_pid = task_tgid_nr_ns(current, ns);
+ ac->ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
- ns);
+ ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
}
#endif
+}
+
+static void acct_write_process(struct bsd_acct_struct *acct)
+{
+ struct file *file = acct->file;
+ const struct cred *cred;
+ acct_t *ac = &acct->ac;
+
+ /* Perform file operations on behalf of whoever enabled accounting */
+ cred = override_creds(file->f_cred);
+
/*
- * Get freeze protection. If the fs is frozen, just skip the write
- * as we could deadlock the system otherwise.
+ * First check to see if there is enough free_space to continue
+ * the process accounting system. Then get freeze protection. If
+ * the fs is frozen, just skip the write as we could deadlock
+ * the system otherwise.
*/
- if (file_start_write_trylock(file)) {
+ if (check_free_space(acct) && file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
- __kernel_write(file, &ac, sizeof(acct_t), &pos);
+ __kernel_write(file, ac, sizeof(acct_t), &pos);
file_end_write(file);
}
-out:
+
+ revert_creds(cred);
+}
+
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ unsigned long flim;
+
+ /* Accounting records are not subject to resource limits. */
+ flim = rlimit(RLIMIT_FSIZE);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ fill_ac(acct);
+ acct_write_process(acct);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- revert_creds(orig_cred);
}
/**
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 870aeb51d70a..095a9554e1de 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -39,7 +39,7 @@
*/
/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
-#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8)
+#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
struct bpf_arena {
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index d5dc65bb1755..54ff2a85d4c0 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -153,7 +153,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
static void cgroup_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+ bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9de6acddd479..c3223e0db2f5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6507,6 +6507,8 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
/* rxrpc */
{ "rxrpc_recvdata", 0x1 },
{ "rxrpc_resend", 0x10 },
+ /* skb */
+ {"kfree_skb", 0x1000},
/* sunrpc */
{ "xs_stream_read_data", 0x1 },
/* ... from xprt_cong_event event class */
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index e1cfe890e0be..1499d8caa9a3 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -268,8 +268,6 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma
/* allow writable mapping for the consumer_pos only */
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
- } else {
- vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
@@ -289,8 +287,6 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
* position, and the ring buffer data itself.
*/
return -EPERM;
- } else {
- vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c420edbfb7c8..e1e42e918ba7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1035,7 +1035,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = {
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
- int err;
+ int err = 0;
if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
@@ -1059,24 +1059,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
err = -EACCES;
goto out;
}
+ bpf_map_write_active_inc(map);
}
+out:
+ mutex_unlock(&map->freeze_mutex);
+ if (err)
+ return err;
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
vm_flags_clear(vma, VM_MAYEXEC);
+ /* If mapping is read-only, then disallow potentially re-mapping with
+ * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing
+ * means that as far as BPF map's memory-mapped VMAs are concerned,
+ * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set,
+ * both should be set, so we can forget about VM_MAYWRITE and always
+ * check just VM_WRITE
+ */
if (!(vma->vm_flags & VM_WRITE))
- /* disallow re-mapping with PROT_WRITE */
vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
- if (err)
- goto out;
+ if (err) {
+ if (vma->vm_flags & VM_WRITE)
+ bpf_map_write_active_dec(map);
+ }
- if (vma->vm_flags & VM_MAYWRITE)
- bpf_map_write_active_inc(map);
-out:
- mutex_unlock(&map->freeze_mutex);
return err;
}
@@ -1968,8 +1977,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
return err;
}
-#define MAP_LOOKUP_RETRIES 3
-
int generic_map_lookup_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1979,8 +1986,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
void *buf, *buf_prevkey, *prev_key, *key, *value;
- int err, retry = MAP_LOOKUP_RETRIES;
u32 value_size, cp, max_count;
+ int err;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
@@ -2026,14 +2033,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
err = bpf_map_copy_value(map, key, value,
attr->batch.elem_flags);
- if (err == -ENOENT) {
- if (retry) {
- retry--;
- continue;
- }
- err = -EINTR;
- break;
- }
+ if (err == -ENOENT)
+ goto next_key;
if (err)
goto free_buf;
@@ -2048,12 +2049,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
goto free_buf;
}
+ cp++;
+next_key:
if (!prev_key)
prev_key = buf_prevkey;
swap(prev_key, key);
- retry = MAP_LOOKUP_RETRIES;
- cp++;
cond_resched();
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f74263b206e4..f4859516b190 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1501,6 +1501,8 @@ static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r
struct bpf_reference_state *s;
s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
s->type = type;
s->id = id;
s->ptr = ptr;
@@ -9149,10 +9151,11 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
return 0;
}
-/* Returns constant key value if possible, else negative error */
-static s64 get_constant_map_key(struct bpf_verifier_env *env,
+/* Returns constant key value in `value` if possible, else negative error */
+static int get_constant_map_key(struct bpf_verifier_env *env,
struct bpf_reg_state *key,
- u32 key_size)
+ u32 key_size,
+ s64 *value)
{
struct bpf_func_state *state = func(env, key);
struct bpf_reg_state *reg;
@@ -9179,8 +9182,10 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env,
/* First handle precisely tracked STACK_ZERO */
for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--)
zero_size++;
- if (zero_size >= key_size)
+ if (zero_size >= key_size) {
+ *value = 0;
return 0;
+ }
/* Check that stack contains a scalar spill of expected size */
if (!is_spilled_scalar_reg(&state->stack[spi]))
@@ -9203,9 +9208,12 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env,
if (err < 0)
return err;
- return reg->var_off.value;
+ *value = reg->var_off.value;
+ return 0;
}
+static bool can_elide_value_nullness(enum bpf_map_type type);
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn,
@@ -9354,9 +9362,16 @@ skip_type_check:
err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
if (err)
return err;
- meta->const_map_key = get_constant_map_key(env, reg, key_size);
- if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP)
- return meta->const_map_key;
+ if (can_elide_value_nullness(meta->map_ptr->map_type)) {
+ err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
+ if (err < 0) {
+ meta->const_map_key = -1;
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ else
+ return err;
+ }
+ }
break;
case ARG_PTR_TO_MAP_VALUE:
if (type_may_be_null(arg_type) && register_is_null(reg))
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index fbe34299673d..10b63433f057 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -220,60 +220,32 @@ dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
struct dmem_cgroup_pool_state *test_pool)
{
struct page_counter *climit;
- struct cgroup_subsys_state *css, *next_css;
+ struct cgroup_subsys_state *css;
struct dmemcg_state *dmemcg_iter;
- struct dmem_cgroup_pool_state *pool, *parent_pool;
- bool found_descendant;
+ struct dmem_cgroup_pool_state *pool, *found_pool;
climit = &limit_pool->cnt;
rcu_read_lock();
- parent_pool = pool = limit_pool;
- css = &limit_pool->cs->css;
- /*
- * This logic is roughly equivalent to css_foreach_descendant_pre,
- * except we also track the parent pool to find out which pool we need
- * to calculate protection values for.
- *
- * We can stop the traversal once we find test_pool among the
- * descendants since we don't really care about any others.
- */
- while (pool != test_pool) {
- next_css = css_next_child(NULL, css);
- if (next_css) {
- parent_pool = pool;
- } else {
- while (css != &limit_pool->cs->css) {
- next_css = css_next_child(css, css->parent);
- if (next_css)
- break;
- css = css->parent;
- parent_pool = pool_parent(parent_pool);
- }
- /*
- * We can only hit this when test_pool is not a
- * descendant of limit_pool.
- */
- if (WARN_ON_ONCE(css == &limit_pool->cs->css))
- break;
- }
- css = next_css;
-
- found_descendant = false;
+ css_for_each_descendant_pre(css, &limit_pool->cs->css) {
dmemcg_iter = container_of(css, struct dmemcg_state, css);
+ found_pool = NULL;
list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
- if (pool_parent(pool) == parent_pool) {
- found_descendant = true;
+ if (pool->region == limit_pool->region) {
+ found_pool = pool;
break;
}
}
- if (!found_descendant)
+ if (!found_pool)
continue;
page_counter_calculate_protection(
- climit, &pool->cnt, true);
+ climit, &found_pool->cnt, true);
+
+ if (found_pool == test_pool)
+ break;
}
rcu_read_unlock();
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bcb09e011e9e..6364319e2f88 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4950,7 +4950,7 @@ static struct perf_event_pmu_context *
find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
struct perf_event *event)
{
- struct perf_event_pmu_context *new = NULL, *epc;
+ struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
void *task_ctx_data = NULL;
if (!ctx->task) {
@@ -5007,12 +5007,19 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
atomic_inc(&epc->refcount);
goto found_epc;
}
+ /* Make sure the pmu_ctx_list is sorted by PMU type: */
+ if (!pos && epc->pmu->type > pmu->type)
+ pos = epc;
}
epc = new;
new = NULL;
- list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ if (!pos)
+ list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ else
+ list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev);
+
epc->ctx = ctx;
found_epc:
@@ -5962,14 +5969,15 @@ static int _perf_event_period(struct perf_event *event, u64 value)
if (!value)
return -EINVAL;
- if (event->attr.freq && value > sysctl_perf_event_sample_rate)
- return -EINVAL;
-
- if (perf_event_check_period(event, value))
- return -EINVAL;
-
- if (!event->attr.freq && (value & (1ULL << 63)))
- return -EINVAL;
+ if (event->attr.freq) {
+ if (value > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+ } else {
+ if (perf_event_check_period(event, value))
+ return -EINVAL;
+ if (value & (1ULL << 63))
+ return -EINVAL;
+ }
event_function_call(event, __perf_event_period, &value);
@@ -8321,7 +8329,8 @@ void perf_event_exec(void)
perf_event_enable_on_exec(ctx);
perf_event_remove_on_exec(ctx);
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+ scoped_guard(rcu)
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
perf_unpin_context(ctx);
put_ctx(ctx);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2ca797cbe465..b4ca8898fe17 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -417,7 +417,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe,
struct mm_struct *mm, short d)
{
pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
- "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
+ "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
(unsigned long long) uprobe->offset,
(unsigned long long) uprobe->ref_ctr_offset, mm);
@@ -495,6 +495,11 @@ retry:
if (ret <= 0)
goto put_old;
+ if (is_zero_page(old_page)) {
+ ret = -EINVAL;
+ goto put_old;
+ }
+
if (WARN(!is_register && PageCompound(old_page),
"uprobe unregister should never work on compound page\n")) {
ret = -EINVAL;
@@ -762,10 +767,14 @@ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
enum hprobe_state hstate;
/*
- * return_instance's hprobe is protected by RCU.
- * Underlying uprobe is itself protected from reuse by SRCU.
+ * Caller should guarantee that return_instance is not going to be
+ * freed from under us. This can be achieved either through holding
+ * rcu_read_lock() or by owning return_instance in the first place.
+ *
+ * Underlying uprobe is itself protected from reuse by SRCU, so ensure
+ * SRCU lock is held properly.
*/
- lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu));
+ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));
hstate = READ_ONCE(hprobe->state);
switch (hstate) {
diff --git a/kernel/iomem.c b/kernel/iomem.c
index dc2120776e1c..75e61c1c6bc0 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -6,7 +6,8 @@
#include <linux/ioremap.h>
#ifndef arch_memremap_wb
-static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size,
+ unsigned long flags)
{
#ifdef ioremap_cache
return (__force void *)ioremap_cache(offset, size);
@@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size, flags);
if (!addr)
- addr = arch_memremap_wb(offset, size);
+ addr = arch_memremap_wb(offset, size, flags);
}
/*
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 1fb9ad289a6f..a256cc919ad7 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1221,18 +1221,6 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
-void *__module_writable_address(struct module *mod, void *loc)
-{
- for_class_mod_mem_type(type, text) {
- struct module_memory *mem = &mod->mem[type];
-
- if (loc >= mem->base && loc < mem->base + mem->size)
- return loc + (mem->rw_copy - mem->base);
- }
-
- return loc;
-}
-
static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
{
unsigned int size = PAGE_ALIGN(mod->mem[type].size);
@@ -1250,21 +1238,15 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
if (!ptr)
return -ENOMEM;
- mod->mem[type].base = ptr;
-
if (execmem_is_rox(execmem_type)) {
- ptr = vzalloc(size);
+ int err = execmem_make_temp_rw(ptr, size);
- if (!ptr) {
- execmem_free(mod->mem[type].base);
+ if (err) {
+ execmem_free(ptr);
return -ENOMEM;
}
- mod->mem[type].rw_copy = ptr;
mod->mem[type].is_rox = true;
- } else {
- mod->mem[type].rw_copy = mod->mem[type].base;
- memset(mod->mem[type].base, 0, size);
}
/*
@@ -1278,18 +1260,29 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
* *do* eventually get freed, but let's just keep things simple
* and avoid *any* false positives.
*/
- kmemleak_not_leak(ptr);
+ if (!mod->mem[type].is_rox)
+ kmemleak_not_leak(ptr);
+
+ memset(ptr, 0, size);
+ mod->mem[type].base = ptr;
return 0;
}
+static void module_memory_restore_rox(struct module *mod)
+{
+ for_class_mod_mem_type(type, text) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (mem->is_rox)
+ execmem_restore_rox(mem->base, mem->size);
+ }
+}
+
static void module_memory_free(struct module *mod, enum mod_mem_type type)
{
struct module_memory *mem = &mod->mem[type];
- if (mem->is_rox)
- vfree(mem->rw_copy);
-
execmem_free(mem->base);
}
@@ -2642,7 +2635,6 @@ static int move_module(struct module *mod, struct load_info *info)
for_each_mod_mem_type(type) {
if (!mod->mem[type].size) {
mod->mem[type].base = NULL;
- mod->mem[type].rw_copy = NULL;
continue;
}
@@ -2659,7 +2651,6 @@ static int move_module(struct module *mod, struct load_info *info)
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
const char *sname;
- unsigned long addr;
if (!(shdr->sh_flags & SHF_ALLOC))
continue;
@@ -2680,14 +2671,12 @@ static int move_module(struct module *mod, struct load_info *info)
ret = PTR_ERR(dest);
goto out_err;
}
- addr = (unsigned long)dest;
codetag_section_found = true;
} else {
enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
- addr = (unsigned long)mod->mem[type].base + offset;
- dest = mod->mem[type].rw_copy + offset;
+ dest = mod->mem[type].base + offset;
}
if (shdr->sh_type != SHT_NOBITS) {
@@ -2710,13 +2699,14 @@ static int move_module(struct module *mod, struct load_info *info)
* users of info can keep taking advantage and using the newly
* minted official memory area.
*/
- shdr->sh_addr = addr;
+ shdr->sh_addr = (unsigned long)dest;
pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
(long)shdr->sh_size, info->secstrings + shdr->sh_name);
}
return 0;
out_err:
+ module_memory_restore_rox(mod);
for (t--; t >= 0; t--)
module_memory_free(mod, t);
if (codetag_section_found)
@@ -2863,17 +2853,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr,
return 0;
}
-int __weak module_post_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- return 0;
-}
-
static int post_relocation(struct module *mod, const struct load_info *info)
{
- int ret;
-
/* Sort exception table now relocations are done. */
sort_extable(mod->extable, mod->extable + mod->num_exentries);
@@ -2885,24 +2866,7 @@ static int post_relocation(struct module *mod, const struct load_info *info)
add_kallsyms(mod, info);
/* Arch-specific module finalizing. */
- ret = module_finalize(info->hdr, info->sechdrs, mod);
- if (ret)
- return ret;
-
- for_each_mod_mem_type(type) {
- struct module_memory *mem = &mod->mem[type];
-
- if (mem->is_rox) {
- if (!execmem_update_copy(mem->base, mem->rw_copy,
- mem->size))
- return -ENOMEM;
-
- vfree(mem->rw_copy);
- mem->rw_copy = NULL;
- }
- }
-
- return module_post_finalize(info->hdr, info->sechdrs, mod);
+ return module_finalize(info->hdr, info->sechdrs, mod);
}
/* Call module constructors. */
@@ -3499,6 +3463,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod->mem[type].size);
}
+ module_memory_restore_rox(mod);
module_deallocate(mod, info);
free_copy:
/*
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 74834ba15615..03f4142cfbf4 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
+#include <linux/execmem.h>