path: root/mm/vmalloc.c
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--  mm/vmalloc.c  106
1 file changed, 59 insertions(+), 47 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ecbac900c35f..c31a8615a832 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
@@ -134,7 +134,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
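For context, a minimal sketch of the batching pattern these two hunks convert to: kernel PTE updates over a range are bracketed by the lazy_mmu_mode_enable()/lazy_mmu_mode_disable() helpers this patch switches to (replacing arch_enter/leave_lazy_mmu_mode()), so architectures that support lazy MMU mode can coalesce the hardware page-table updates. The helper below is hypothetical and only mirrors the loop shape of vmap_pte_range():

/* Illustrative sketch only, not part of the patch. */
static void set_kernel_ptes_sketch(pte_t *pte, unsigned long addr,
				   unsigned long end, pgprot_t prot,
				   unsigned long pfn)
{
	lazy_mmu_mode_enable();		/* start batching PTE updates */
	do {
		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	lazy_mmu_mode_disable();	/* flush any batched updates */
}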
@@ -305,6 +305,11 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end,
int err;
pgtbl_mod_mask mask = 0;
+ /*
+ * Might allocate pagetables (for most archs a more precise annotation
+ * would be might_alloc(GFP_PGTABLE_KERNEL)). Also might shoot down the
+ * TLB (which requires IRQs to be enabled on x86).
+ */
might_sleep();
BUG_ON(addr >= end);
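As a sketch of the more precise annotation the new comment alludes to (hypothetical placement, not part of the patch), might_alloc() from <linux/sched/mm.h> documents the possible pagetable allocation and implies might_sleep_if() plus the lockdep reclaim annotations for that mask:

/* Illustrative sketch only. */
static int vmap_range_annotated_sketch(unsigned long addr, unsigned long end)
{
	/* hypothetical: annotate exactly what may be allocated here */
	might_alloc(GFP_PGTABLE_KERNEL);
	BUG_ON(addr >= end);
	return 0;
}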
@@ -366,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
#ifdef CONFIG_HUGETLB_PAGE
@@ -385,7 +390,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
}
@@ -533,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte)
return -ENOMEM;
- arch_enter_lazy_mmu_mode();
+ lazy_mmu_mode_enable();
do {
struct page *page = pages[*nr];
@@ -555,7 +560,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
- arch_leave_lazy_mmu_mode();
+ lazy_mmu_mode_disable();
*mask |= PGTBL_PTE_MODIFIED;
return err;
@@ -1063,14 +1068,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
-static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
-unsigned long vmalloc_nr_pages(void)
-{
- return atomic_long_read(&nr_vmalloc_pages);
-}
-
static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
struct rb_node *n = root->rb_node;
@@ -2268,11 +2267,14 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
reclaim_list_global(&decay_list);
}
+#define KASAN_RELEASE_BATCH_SIZE 32
+
static void
kasan_release_vmalloc_node(struct vmap_node *vn)
{
struct vmap_area *va;
unsigned long start, end;
+ unsigned int batch_count = 0;
start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
@@ -2282,6 +2284,11 @@ kasan_release_vmalloc_node(struct vmap_node *vn)
kasan_release_vmalloc(va->va_start, va->va_end,
va->va_start, va->va_end,
KASAN_VMALLOC_PAGE_RANGE);
+
+ if (need_resched() || (++batch_count >= KASAN_RELEASE_BATCH_SIZE)) {
+ cond_resched();
+ batch_count = 0;
+ }
}
kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
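The batching added here follows a common pattern for long loops that should not hog the CPU: yield either when the scheduler asks for it or after a fixed number of iterations. A generic sketch with illustrative names:

#define SKETCH_BATCH	32

static void process_purge_list_sketch(struct list_head *head)
{
	struct vmap_area *va;
	unsigned int batch = 0;

	list_for_each_entry(va, head, list) {
		/* ... expensive per-entry work, e.g. shadow release ... */
		if (need_resched() || ++batch >= SKETCH_BATCH) {
			cond_resched();	/* yield, then keep going */
			batch = 0;
		}
	}
}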
@@ -3176,7 +3183,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
-static void clear_vm_uninitialized_flag(struct vm_struct *vm)
+void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
* Before removing VM_UNINITIALIZED,
@@ -3452,9 +3459,6 @@ void vfree(const void *addr)
if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
vm_reset_perms(vm);
- /* All pages of vm should be charged to same memcg, so use first one. */
- if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
- mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
for (i = 0; i < vm->nr_pages; i++) {
struct page *page = vm->pages[i];
@@ -3463,11 +3467,11 @@ void vfree(const void *addr)
* High-order allocs for huge vmallocs are split, so
* can be freed as an array of order-0 allocations
*/
+ if (!(vm->flags & VM_MAP_PUT_PAGES))
+ mod_lruvec_page_state(page, NR_VMALLOC, -1);
__free_page(page);
cond_resched();
}
- if (!(vm->flags & VM_MAP_PUT_PAGES))
- atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
kvfree(vm->pages);
kfree(vm);
}
@@ -3655,6 +3659,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
continue;
}
+ mod_lruvec_page_state(page, NR_VMALLOC, 1 << large_order);
+
split_page(page, large_order);
for (i = 0; i < (1U << large_order); i++)
pages[nr_allocated + i] = page + i;
@@ -3675,6 +3681,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (!order) {
while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
+ int i;
/*
* A maximum allowed request is hard-coded and is 100
@@ -3698,6 +3705,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
nr_pages_request,
pages + nr_allocated);
+ for (i = nr_allocated; i < nr_allocated + nr; i++)
+ mod_lruvec_page_state(pages[i], NR_VMALLOC, 1);
+
nr_allocated += nr;
/*
@@ -3722,6 +3732,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (unlikely(!page))
break;
+ mod_lruvec_page_state(page, NR_VMALLOC, 1 << order);
+
/*
* High-order allocations must be able to be treated as
* independent small pages by callers (as they can with
@@ -3785,6 +3797,8 @@ static void defer_vm_area_cleanup(struct vm_struct *area)
* non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save()
* GFP_NOFS - memalloc_nofs_save()
* GFP_NOIO - memalloc_noio_save()
+ * __GFP_RETRY_MAYFAIL, __GFP_NORETRY - memalloc_noreclaim_save()
+ * to prevent OOMs
*
* Returns a flag cookie to pair with restore.
*/
@@ -3793,7 +3807,8 @@ memalloc_apply_gfp_scope(gfp_t gfp_mask)
{
unsigned int flags = 0;
- if (!gfpflags_allow_blocking(gfp_mask))
+ if (!gfpflags_allow_blocking(gfp_mask) ||
+ (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_NORETRY)))
flags = memalloc_noreclaim_save();
else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
flags = memalloc_nofs_save();
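For reference, a minimal sketch of the scope API this helper relies on (from <linux/sched/mm.h>): inside a memalloc_noreclaim_save()/restore() pair the task runs with PF_MEMALLOC set, so nested allocations (here, the page-table allocations done on behalf of the caller) skip direct reclaim and cannot trigger the OOM killer for a __GFP_NORETRY/__GFP_RETRY_MAYFAIL caller. The wrapper below is hypothetical:

static void *noreclaim_scope_sketch(size_t size)
{
	unsigned int flags = memalloc_noreclaim_save();	/* enter PF_MEMALLOC scope */
	void *p = kmalloc(size, GFP_KERNEL);		/* nested alloc: no direct reclaim, no OOM */

	memalloc_noreclaim_restore(flags);		/* always pair with the saved cookie */
	return p;
}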
@@ -3864,12 +3879,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
vmalloc_gfp_adjust(gfp_mask, page_order), node,
page_order, nr_small_pages, area->pages);
- atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- /* All pages of vm should be charged to same memcg, so use first one. */
- if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
- mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
- area->nr_pages);
-
/*
* If not enough pages were obtained to accomplish an
* allocation request, free them via vfree() if any.
@@ -3888,7 +3897,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (!fatal_signal_pending(current) && page_order == 0)
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocate pages",
- area->nr_pages * PAGE_SIZE);
+ nr_small_pages * PAGE_SIZE);
goto fail;
}
@@ -3927,7 +3936,8 @@ fail:
* GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP.
*/
#define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\
- __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\
+ __GFP_NOFAIL | __GFP_ZERO |\
+ __GFP_NORETRY | __GFP_RETRY_MAYFAIL |\
GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
GFP_USER | __GFP_NOLOCKDEP)
@@ -3958,12 +3968,15 @@ static gfp_t vmalloc_fix_flags(gfp_t flags)
* virtual range with protection @prot.
*
* Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT,
- * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported.
+ * %__GFP_RETRY_MAYFAIL, %__GFP_NORETRY, %GFP_NOFS and %GFP_NOIO.
+ * Zone modifiers are not supported.
* Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only
* by __vmalloc().
*
- * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY
- * and %__GFP_RETRY_MAYFAIL are not supported.
+ * Retry modifiers: only %__GFP_NOFAIL is fully supported;
+ * %__GFP_NORETRY and %__GFP_RETRY_MAYFAIL are supported with a limitation:
+ * page tables are allocated with NOWAIT semantics, so they might fail
+ * under moderate memory pressure.
*
* %__GFP_NOWARN can be used to suppress failure messages.
*
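A caller-side usage sketch of the newly documented semantics (illustrative, not from the patch): an opportunistic allocation that is allowed to fail quietly under memory pressure.

static void *vmalloc_try_hard_sketch(unsigned long size)
{
	/* may fail when memory is low or fragmented; caller must handle NULL */
	return __vmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
}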
@@ -4248,7 +4261,7 @@ void *vzalloc_node_noprof(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node_noprof);
/**
- * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
+ * vrealloc_node_align - reallocate virtually contiguous memory; contents
* remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
@@ -4322,7 +4335,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
vm->requested_size = size;
- kasan_poison_vmalloc(p + size, old_size - size);
+ kasan_vrealloc(p, old_size, size);
return (void *)p;
}
@@ -4330,14 +4343,13 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
* We already have the bytes available in the allocation; use them.
*/
if (size <= alloced_size) {
- kasan_unpoison_vmalloc(p + old_size, size - old_size,
- KASAN_VMALLOC_PROT_NORMAL);
/*
* No need to zero memory here, as unused memory will have
* already been zeroed at initial allocation time or during
* realloc shrink time.
*/
vm->requested_size = size;
+ kasan_vrealloc(p, old_size, size);
return (void *)p;
}
@@ -4349,12 +4361,13 @@ need_realloc:
return NULL;
if (p) {
- memcpy(n, p, old_size);
+ memcpy(n, p, min(size, old_size));
vfree(p);
}
return n;
}
+EXPORT_SYMBOL(vrealloc_node_align_noprof);
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
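A caller-side sketch of the fixed copy behaviour (illustrative, assuming the plain vrealloc() wrapper): the reallocation either returns a buffer holding min(new, old) bytes of the previous contents or NULL, in which case the original allocation is left untouched.

static int grow_vbuffer_sketch(void **bufp, size_t new_size)
{
	void *n = vrealloc(*bufp, new_size, GFP_KERNEL);

	if (!n)
		return -ENOMEM;		/* *bufp is still valid and unchanged */

	*bufp = n;			/* old contents preserved up to min(old, new) */
	return 0;
}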
@@ -4562,20 +4575,20 @@ finished:
* @count: number of bytes to be read.
*
* This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
+ * copies data from that area to a given iterator. If the given memory range
* of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * proper area of @iter. If there are memory holes, they'll be zero-filled.
* IOREMAP area is treated as memory hole and no copy is done.
*
 * If [addr...addr+count) doesn't include any intersection with a live
- * vm_struct area, returns 0. @buf should be kernel's buffer.
+ * vm_struct area, returns 0.
*
- * Note: In usual ops, vread() is never necessary because the caller
+ * Note: In usual ops, vread_iter() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
 * any information, such as /proc/kcore.
*
- * Return: number of bytes for which addr and buf should be increased
+ * Return: number of bytes by which addr and iter should be advanced
* (same number as @count) or %0 if [addr...addr+count) doesn't
* include any intersection with valid vmalloc area
*/
@@ -4907,14 +4920,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
return NULL;
}
- vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
- vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+ vms = kzalloc_objs(vms[0], nr_vms);
+ vas = kzalloc_objs(vas[0], nr_vms);
if (!vas || !vms)
goto err_free2;
for (area = 0; area < nr_vms; area++) {
vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
- vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+ vms[area] = kzalloc_obj(struct vm_struct);
if (!vas[area] || !vms[area])
goto err_free;
}
@@ -5025,9 +5038,7 @@ retry:
* With hardware tag-based KASAN, marking is skipped for
* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
*/
- for (area = 0; area < nr_vms; area++)
- vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
- vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+ kasan_unpoison_vmap_areas(vms, nr_vms, KASAN_VMALLOC_PROT_NORMAL);
kfree(vas);
return vms;
@@ -5355,7 +5366,7 @@ static void vmap_init_nodes(void)
int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
if (n > 1) {
- vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
+ vn = kmalloc_objs(*vn, n, GFP_NOWAIT);
if (vn) {
/* Node partition is 16 pages. */
vmap_zone_size = (1 << 4) * PAGE_SIZE;
@@ -5405,6 +5416,7 @@ vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
struct vmap_node *vn;
+ guard(mutex)(&vmap_purge_lock);
for_each_vmap_node(vn)
decay_va_pool_node(vn, true);
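The guard() used above comes from <linux/cleanup.h>: the mutex is taken immediately and dropped automatically when the enclosing scope exits, so no explicit mutex_unlock() is needed on any return path. A standalone sketch with illustrative names:

static unsigned long purge_under_lock_sketch(struct mutex *lock)
{
	guard(mutex)(lock);	/* mutex_lock() now, mutex_unlock() at scope exit */

	/* ... work that must run under the lock ... */
	return 0;
}				/* lock is released here on every return path */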