Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--  mm/vmalloc.c  106
1 file changed, 59 insertions, 47 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ecbac900c35f..c31a8615a832 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -108,7 +108,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (!pte)
 		return -ENOMEM;
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_mode_enable();
 
 	do {
 		if (unlikely(!pte_none(ptep_get(pte)))) {
@@ -134,7 +134,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		pfn++;
 	} while (pte += PFN_DOWN(size), addr += size, addr != end);
 
-	arch_leave_lazy_mmu_mode();
+	lazy_mmu_mode_disable();
 	*mask |= PGTBL_PTE_MODIFIED;
 	return 0;
 }
@@ -305,6 +305,11 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end,
 	int err;
 	pgtbl_mod_mask mask = 0;
 
+	/*
+	 * Might allocate pagetables (for most archs a more precise annotation
+	 * would be might_alloc(GFP_PGTABLE_KERNEL)). Also might shootdown TLB
+	 * (requires IRQs enabled on x86).
+	 */
 	might_sleep();
 	BUG_ON(addr >= end);
 
@@ -366,7 +371,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	unsigned long size = PAGE_SIZE;
 
 	pte = pte_offset_kernel(pmd, addr);
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_mode_enable();
 
 	do {
 #ifdef CONFIG_HUGETLB_PAGE
@@ -385,7 +390,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
 	} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
 
-	arch_leave_lazy_mmu_mode();
+	lazy_mmu_mode_disable();
 	*mask |= PGTBL_PTE_MODIFIED;
 }
 
@@ -533,7 +538,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	if (!pte)
 		return -ENOMEM;
 
-	arch_enter_lazy_mmu_mode();
+	lazy_mmu_mode_enable();
 
 	do {
 		struct page *page = pages[*nr];
@@ -555,7 +560,7 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		(*nr)++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	arch_leave_lazy_mmu_mode();
+	lazy_mmu_mode_disable();
 	*mask |= PGTBL_PTE_MODIFIED;
 
 	return err;
@@ -1063,14 +1068,8 @@ static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
 static void drain_vmap_area_work(struct work_struct *work);
 static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
 
-static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
 static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;
 
-unsigned long vmalloc_nr_pages(void)
-{
-	return atomic_long_read(&nr_vmalloc_pages);
-}
-
 static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
 {
 	struct rb_node *n = root->rb_node;
@@ -2268,11 +2267,14 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
 	reclaim_list_global(&decay_list);
 }
 
+#define KASAN_RELEASE_BATCH_SIZE 32
+
 static void
 kasan_release_vmalloc_node(struct vmap_node *vn)
 {
 	struct vmap_area *va;
 	unsigned long start, end;
+	unsigned int batch_count = 0;
 
 	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
 	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
@@ -2282,6 +2284,11 @@ kasan_release_vmalloc_node(struct vmap_node *vn)
 		kasan_release_vmalloc(va->va_start, va->va_end,
 			va->va_start, va->va_end,
 			KASAN_VMALLOC_PAGE_RANGE);
+
+		if (need_resched() || (++batch_count >= KASAN_RELEASE_BATCH_SIZE)) {
+			cond_resched();
+			batch_count = 0;
+		}
 	}
 
 	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
@@ -3176,7 +3183,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
 	kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
 }
 
-static void clear_vm_uninitialized_flag(struct vm_struct *vm)
+void clear_vm_uninitialized_flag(struct vm_struct *vm)
 {
 	/*
 	 * Before removing VM_UNINITIALIZED,
@@ -3452,9 +3459,6 @@ void vfree(const void *addr)
 	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
 		vm_reset_perms(vm);
 
-	/* All pages of vm should be charged to same memcg, so use first one. */
-	if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES))
-		mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages);
 	for (i = 0; i < vm->nr_pages; i++) {
 		struct page *page = vm->pages[i];
 
@@ -3463,11 +3467,11 @@ void vfree(const void *addr)
 		 * High-order allocs for huge vmallocs are split, so
 		 * can be freed as an array of order-0 allocations
 		 */
+		if (!(vm->flags & VM_MAP_PUT_PAGES))
+			mod_lruvec_page_state(page, NR_VMALLOC, -1);
 		__free_page(page);
 		cond_resched();
 	}
-	if (!(vm->flags & VM_MAP_PUT_PAGES))
-		atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
 	kvfree(vm->pages);
 	kfree(vm);
 }
@@ -3655,6 +3659,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 			continue;
 		}
 
+		mod_lruvec_page_state(page, NR_VMALLOC, 1 << large_order);
+
 		split_page(page, large_order);
 		for (i = 0; i < (1U << large_order); i++)
 			pages[nr_allocated + i] = page + i;
@@ -3675,6 +3681,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 	if (!order) {
 		while (nr_allocated < nr_pages) {
 			unsigned int nr, nr_pages_request;
+			int i;
 
 			/*
 			 * A maximum allowed request is hard-coded and is 100
@@ -3698,6 +3705,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 						nr_pages_request,
 						pages + nr_allocated);
 
+			for (i = nr_allocated; i < nr_allocated + nr; i++)
+				mod_lruvec_page_state(pages[i], NR_VMALLOC, 1);
+
 			nr_allocated += nr;
 
 			/*
@@ -3722,6 +3732,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 		if (unlikely(!page))
 			break;
 
+		mod_lruvec_page_state(page, NR_VMALLOC, 1 << order);
+
 		/*
 		 * High-order allocations must be able to be treated as
 		 * independent small pages by callers (as they can with
@@ -3785,6 +3797,8 @@ static void defer_vm_area_cleanup(struct vm_struct *area)
  * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save()
  * GFP_NOFS - memalloc_nofs_save()
  * GFP_NOIO - memalloc_noio_save()
+ * __GFP_RETRY_MAYFAIL, __GFP_NORETRY - memalloc_noreclaim_save()
+ *                                      to prevent OOMs
  *
  * Returns a flag cookie to pair with restore.
  */
@@ -3793,7 +3807,8 @@ memalloc_apply_gfp_scope(gfp_t gfp_mask)
 {
 	unsigned int flags = 0;
 
-	if (!gfpflags_allow_blocking(gfp_mask))
+	if (!gfpflags_allow_blocking(gfp_mask) ||
+	    (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_NORETRY)))
 		flags = memalloc_noreclaim_save();
 	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
 		flags = memalloc_nofs_save();
@@ -3864,12 +3879,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		vmalloc_gfp_adjust(gfp_mask, page_order), node,
 		page_order, nr_small_pages, area->pages);
 
-	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
-	/* All pages of vm should be charged to same memcg, so use first one. */
-	if (gfp_mask & __GFP_ACCOUNT && area->nr_pages)
-		mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC,
-				     area->nr_pages);
-
 	/*
 	 * If not enough pages were obtained to accomplish an
 	 * allocation request, free them via vfree() if any.
@@ -3888,7 +3897,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		if (!fatal_signal_pending(current) && page_order == 0)
 			warn_alloc(gfp_mask, NULL,
 				"vmalloc error: size %lu, failed to allocate pages",
-				area->nr_pages * PAGE_SIZE);
+				nr_small_pages * PAGE_SIZE);
 
 		goto fail;
 	}
@@ -3927,7 +3936,8 @@ fail:
  * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP.
  */
 #define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\
-			__GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\
+			__GFP_NOFAIL | __GFP_ZERO |\
+			__GFP_NORETRY | __GFP_RETRY_MAYFAIL |\
 			GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\
 			GFP_USER | __GFP_NOLOCKDEP)
 
@@ -3958,12 +3968,15 @@ static gfp_t vmalloc_fix_flags(gfp_t flags)
  * virtual range with protection @prot.
  *
 * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT,
- * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported.
+ * %__GFP_RETRY_MAYFAIL, %__GFP_NORETRY, %GFP_NOFS and %GFP_NOIO.
+ * Zone modifiers are not supported.
 * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only
 * by __vmalloc().
 *
- * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY
- * and %__GFP_RETRY_MAYFAIL are not supported.
+ * Retry modifiers: only %__GFP_NOFAIL is fully supported;
+ * %__GFP_NORETRY and %__GFP_RETRY_MAYFAIL are supported with limitation,
+ * i.e. page tables are allocated with NOWAIT semantic so they might fail
+ * under moderate memory pressure.
 *
 * %__GFP_NOWARN can be used to suppress failure messages.
 *
@@ -4248,7 +4261,7 @@ void *vzalloc_node_noprof(unsigned long size, int node)
 EXPORT_SYMBOL(vzalloc_node_noprof);
 
 /**
- * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
+ * vrealloc_node_align - reallocate virtually contiguous memory; contents
 * remain unchanged
 * @p: object to reallocate memory for
 * @size: the size to reallocate
@@ -4322,7 +4335,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 		if (want_init_on_free() || want_init_on_alloc(flags))
 			memset((void *)p + size, 0, old_size - size);
 		vm->requested_size = size;
-		kasan_poison_vmalloc(p + size, old_size - size);
+		kasan_vrealloc(p, old_size, size);
 
 		return (void *)p;
 	}
@@ -4330,14 +4343,13 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
 	 * We already have the bytes available in the allocation; use them.
 	 */
 	if (size <= alloced_size) {
-		kasan_unpoison_vmalloc(p + old_size, size - old_size,
-				       KASAN_VMALLOC_PROT_NORMAL);
 		/*
 		 * No need to zero memory here, as unused memory will have
 		 * already been zeroed at initial allocation time or during
 		 * realloc shrink time.
 		 */
 		vm->requested_size = size;
+		kasan_vrealloc(p, old_size, size);
 
 		return (void *)p;
 	}
@@ -4349,12 +4361,13 @@ need_realloc:
 		return NULL;
 
 	if (p) {
-		memcpy(n, p, old_size);
+		memcpy(n, p, min(size, old_size));
 		vfree(p);
 	}
 
 	return n;
 }
+EXPORT_SYMBOL(vrealloc_node_align_noprof);
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
@@ -4562,20 +4575,20 @@ finished:
 * @count: number of bytes to be read.
 *
 * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
+ * copies data from that area to a given iterator. If the given memory range
 * of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * proper area of @iter. If there are memory holes, they'll be zero-filled.
 * IOREMAP area is treated as memory hole and no copy is done.
 *
 * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
+ * vm_struct area, returns 0.
 *
- * Note: In usual ops, vread() is never necessary because the caller
+ * Note: In usual ops, vread_iter() is never necessary because the caller
 * should know vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, as /proc/kcore.
 *
- * Return: number of bytes for which addr and buf should be increased
+ * Return: number of bytes for which addr and iter should be advanced
 * (same number as @count) or %0 if [addr...addr+count) doesn't
 * include any intersection with valid vmalloc area
 */
@@ -4907,14 +4920,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 			return NULL;
 	}
 
-	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
-	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+	vms = kzalloc_objs(vms[0], nr_vms);
+	vas = kzalloc_objs(vas[0], nr_vms);
 	if (!vas || !vms)
 		goto err_free2;
 
 	for (area = 0; area < nr_vms; area++) {
 		vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
-		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+		vms[area] = kzalloc_obj(struct vm_struct);
 		if (!vas[area] || !vms[area])
 			goto err_free;
 	}
@@ -5025,9 +5038,7 @@ retry:
 	 * With hardware tag-based KASAN, marking is skipped for
 	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
 	 */
-	for (area = 0; area < nr_vms; area++)
-		vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
-				vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+	kasan_unpoison_vmap_areas(vms, nr_vms, KASAN_VMALLOC_PROT_NORMAL);
 
 	kfree(vas);
 	return vms;
@@ -5355,7 +5366,7 @@ static void vmap_init_nodes(void)
 	int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128);
 
 	if (n > 1) {
-		vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT);
+		vn = kmalloc_objs(*vn, n, GFP_NOWAIT);
 		if (vn) {
 			/* Node partition is 16 pages. */
 			vmap_zone_size = (1 << 4) * PAGE_SIZE;
@@ -5405,6 +5416,7 @@ vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct vmap_node *vn;
 
+	guard(mutex)(&vmap_purge_lock);
 	for_each_vmap_node(vn)
 		decay_va_pool_node(vn, true);
 
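The most repeated change above is mechanical: the arch_enter_lazy_mmu_mode()/arch_leave_lazy_mmu_mode() pairs that bracket the PTE loops become lazy_mmu_mode_enable()/lazy_mmu_mode_disable(). The pattern itself is unchanged: open the lazy window, perform a run of page-table updates, then close it so the batched updates are flushed once. The following is a minimal userspace sketch of that bracketing pattern; the lazy_mmu_mode_* functions here are stand-in stubs for illustration only, not the kernel API, and the PTE array is a toy stand-in for real page tables.

#include <stdio.h>
#include <stdint.h>

#define NPTES 8

/* Stand-in stubs: the real hooks batch and later flush hardware PTE updates. */
static void lazy_mmu_mode_enable(void)
{
	puts("lazy mmu: start batching PTE updates");
}

static void lazy_mmu_mode_disable(void)
{
	puts("lazy mmu: flush batched PTE updates");
}

int main(void)
{
	uint64_t ptes[NPTES] = { 0 };
	uint64_t pfn = 0x1000;
	int i;

	lazy_mmu_mode_enable();		/* was arch_enter_lazy_mmu_mode() */
	for (i = 0; i < NPTES; i++)	/* mirrors the do { ... } while () loop over the PTE range */
		ptes[i] = ((pfn + i) << 12) | 0x1;	/* toy "present" PTE for pfn + i */
	lazy_mmu_mode_disable();	/* was arch_leave_lazy_mmu_mode() */

	printf("wrote %d ptes starting at pfn 0x%llx\n", NPTES,
	       (unsigned long long)pfn);
	return 0;
}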
