diff options
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 4 | ||||
| -rw-r--r-- | mm/execmem.c | 39 | ||||
| -rw-r--r-- | mm/filemap.c | 106 | ||||
| -rw-r--r-- | mm/folio-compat.c | 14 | ||||
| -rw-r--r-- | mm/gup.c | 6 | ||||
| -rw-r--r-- | mm/internal.h | 11 | ||||
| -rw-r--r-- | mm/memblock.c | 66 | ||||
| -rw-r--r-- | mm/memcontrol-v1.c | 6 | ||||
| -rw-r--r-- | mm/memcontrol.c | 57 | ||||
| -rw-r--r-- | mm/memory.c | 168 | ||||
| -rw-r--r-- | mm/mempolicy.c | 31 | ||||
| -rw-r--r-- | mm/migrate_device.c | 116 | ||||
| -rw-r--r-- | mm/mmap.c | 54 | ||||
| -rw-r--r-- | mm/nommu.c | 101 | ||||
| -rw-r--r-- | mm/page_alloc.c | 203 | ||||
| -rw-r--r-- | mm/page_owner.c | 8 | ||||
| -rw-r--r-- | mm/percpu.c | 4 | ||||
| -rw-r--r-- | mm/readahead.c | 14 | ||||
| -rw-r--r-- | mm/shmem.c | 8 | ||||
| -rw-r--r-- | mm/slab.h | 34 | ||||
| -rw-r--r-- | mm/slab_common.c | 48 | ||||
| -rw-r--r-- | mm/slub.c | 336 | ||||
| -rw-r--r-- | mm/swap.c | 16 | ||||
| -rw-r--r-- | mm/swap.h | 1 | ||||
| -rw-r--r-- | mm/usercopy.c | 18 | ||||
| -rw-r--r-- | mm/util.c | 232 | ||||
| -rw-r--r-- | mm/vmscan.c | 23 | ||||
| -rw-r--r-- | mm/vmstat.c | 46 |
28 files changed, 1231 insertions, 539 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 4a4e7b63d30a..d3fb3762887b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -195,6 +195,10 @@ menu "Slab allocator options" config SLUB def_bool y +config KVFREE_RCU_BATCHED + def_bool y + depends on !SLUB_TINY && !TINY_RCU + config SLUB_TINY bool "Configure for minimal memory footprint" depends on EXPERT diff --git a/mm/execmem.c b/mm/execmem.c index 317b6a8d35be..e6c4f5076ca8 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -257,7 +257,6 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; - unsigned long start, end; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - start = (unsigned long)p; - end = start + alloc_size; - - vunmap_range(start, end); - - err = execmem_set_direct_map_valid(vm, false); - if (err) - goto err_free_mem; - - err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, - PMD_SHIFT); + err = set_memory_rox((unsigned long)p, vm->nr_pages); if (err) goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) - goto err_free_mem; + goto err_reset_direct_map; return 0; +err_reset_direct_map: + execmem_set_direct_map_valid(vm, true); err_free_mem: vfree(p); return err; @@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr) return true; } + +int execmem_make_temp_rw(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + int ret; + + ret = set_memory_nx(addr, nr); + if (ret) + return ret; + + return set_memory_rw(addr, nr); +} + +int execmem_restore_rox(void *ptr, size_t size) +{ + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long addr = (unsigned long)ptr; + + return set_memory_rox(addr, nr); +} 
+ #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { diff --git a/mm/filemap.c b/mm/filemap.c index cc69f174f76b..b5e784f34d98 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -47,7 +47,7 @@ #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> -#include <linux/fsnotify.h> +#include <linux/sysctl.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" @@ -1066,6 +1066,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio) return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } +/* How many times do we accept lock stealing from under a waiter? */ +static int sysctl_page_lock_unfairness = 5; +static const struct ctl_table filemap_sysctl_table[] = { + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +}; + void __init pagecache_init(void) { int i; @@ -1074,6 +1087,7 @@ void __init pagecache_init(void) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); + register_sysctl_init("vm", filemap_sysctl_table); } /* @@ -1221,9 +1235,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, return true; } -/* How many times do we accept lock stealing from under a waiter? */ -int sysctl_page_lock_unfairness = 5; - static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { @@ -1367,7 +1378,7 @@ repeat: * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is - * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. 
Instead this * should be called while holding the ptl for the migration entry referencing * the page. @@ -3197,14 +3208,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; - /* - * If we have pre-content watches we need to disable readahead to make - * sure that we don't populate our mapping with 0 filled pages that we - * never emitted an event for. - */ - if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) - return fpin; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { @@ -3273,10 +3276,6 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct file *fpin = NULL; unsigned int mmap_miss; - /* See comment in do_sync_mmap_readahead. */ - if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) - return fpin; - /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; @@ -3336,48 +3335,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) } /** - * filemap_fsnotify_fault - maybe emit a pre-content event. - * @vmf: struct vm_fault containing details of the fault. - * - * If we have a pre-content watch on this file we will emit an event for this - * range. If we return anything the fault caller should return immediately, we - * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the - * fault again and then the fault handler will run the second time through. - * - * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened. - */ -vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf) -{ - struct file *fpin = NULL; - int mask = (vmf->flags & FAULT_FLAG_WRITE) ? 
MAY_WRITE : MAY_ACCESS; - loff_t pos = vmf->pgoff >> PAGE_SHIFT; - size_t count = PAGE_SIZE; - int err; - - /* - * We already did this and now we're retrying with everything locked, - * don't emit the event and continue. - */ - if (vmf->flags & FAULT_FLAG_TRIED) - return 0; - - /* No watches, we're done. */ - if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode))) - return 0; - - fpin = maybe_unlock_mmap_for_io(vmf, fpin); - if (!fpin) - return VM_FAULT_SIGBUS; - - err = fsnotify_file_area_perm(fpin, mask, &pos, count); - fput(fpin); - if (err) - return VM_FAULT_SIGBUS; - return VM_FAULT_RETRY; -} -EXPORT_SYMBOL_GPL(filemap_fsnotify_fault); - -/** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault * @@ -3481,37 +3438,6 @@ retry_find: */ if (unlikely(!folio_test_uptodate(folio))) { /* - * If this is a precontent file we have can now emit an event to - * try and populate the folio. - */ - if (!(vmf->flags & FAULT_FLAG_TRIED) && - unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { - loff_t pos = folio_pos(folio); - size_t count = folio_size(folio); - - /* We're NOWAIT, we have to retry. */ - if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) { - folio_unlock(folio); - goto out_retry; - } - - if (mapping_locked) - filemap_invalidate_unlock_shared(mapping); - mapping_locked = false; - - folio_unlock(folio); - fpin = maybe_unlock_mmap_for_io(vmf, fpin); - if (!fpin) - goto out_retry; - - error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos, - count); - if (error) - ret = VM_FAULT_SIGBUS; - goto out_retry; - } - - /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we * didn't hold the page lock all the time. 
Let's drop diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 1d1832e2a599..45540942d148 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -28,12 +28,6 @@ void wait_on_page_writeback(struct page *page) } EXPORT_SYMBOL_GPL(wait_on_page_writeback); -void wait_for_stable_page(struct page *page) -{ - return folio_wait_stable(page_folio(page)); -} -EXPORT_SYMBOL_GPL(wait_for_stable_page); - void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); @@ -90,11 +84,3 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, return folio_file_page(folio, index); } EXPORT_SYMBOL(pagecache_get_page); - -struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index) -{ - return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); -} -EXPORT_SYMBOL(grab_cache_page_write_begin); @@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address + * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). @@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable); * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) +struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; - int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? 
page : NULL; } diff --git a/mm/internal.h b/mm/internal.h index 21f2643f3d95..50c2f590b2d0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1165,9 +1165,13 @@ static inline void mminit_verify_zonelist(void) #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA +extern int node_reclaim_mode; + extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else +#define node_reclaim_mode 0 + static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { @@ -1179,6 +1183,12 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) } #endif +static inline bool node_reclaim_enabled(void) +{ + /* Is any node_reclaim_mode bit set? */ + return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); +} + /* * mm/memory-failure.c */ @@ -1256,6 +1266,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ +#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. 
*/ diff --git a/mm/memblock.c b/mm/memblock.c index 64ae678cd1d1..284154445409 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -16,6 +16,7 @@ #include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> +#include <linux/mutex.h> #include <asm/sections.h> #include <linux/io.h> @@ -2282,6 +2283,7 @@ struct reserve_mem_table { }; static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES]; static int reserved_mem_count; +static DEFINE_MUTEX(reserve_mem_lock); /* Add wildcard region with a lookup name */ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, @@ -2295,6 +2297,21 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, strscpy(map->name, name); } +static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name) +{ + struct reserve_mem_table *map; + int i; + + for (i = 0; i < reserved_mem_count; i++) { + map = &reserved_mem_table[i]; + if (!map->size) + continue; + if (strcmp(name, map->name) == 0) + return map; + } + return NULL; +} + /** * reserve_mem_find_by_name - Find reserved memory region with a given name * @name: The name that is attached to a reserved memory region @@ -2308,22 +2325,47 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size, int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size) { struct reserve_mem_table *map; - int i; - for (i = 0; i < reserved_mem_count; i++) { - map = &reserved_mem_table[i]; - if (!map->size) - continue; - if (strcmp(name, map->name) == 0) { - *start = map->start; - *size = map->size; - return 1; - } - } - return 0; + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + *start = map->start; + *size = map->size; + return 1; } EXPORT_SYMBOL_GPL(reserve_mem_find_by_name); +/** + * reserve_mem_release_by_name - Release reserved memory region with a given name + * @name: The name that is attatched to a reserved 
memory region + * + * Forcibly release the pages in the reserved memory region so that those memory + * can be used as free memory. After released the reserved region size becomes 0. + * + * Returns: 1 if released or 0 if not found. + */ +int reserve_mem_release_by_name(const char *name) +{ + char buf[RESERVE_MEM_NAME_SIZE + 12]; + struct reserve_mem_table *map; + void *start, *end; + + guard(mutex)(&reserve_mem_lock); + map = reserve_mem_find_by_name_nolock(name); + if (!map) + return 0; + + start = phys_to_virt(map->start); + end = start + map->size - 1; + snprintf(buf, sizeof(buf), "reserve_mem:%s", name); + free_reserved_area(start, end, 0, buf); + map->size = 0; + + return 1; +} + /* * Parse reserve_mem=nn:align:name */ diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index c1feb3945350..8660908850dc 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1955,9 +1955,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > MAX_SWAPPINESS) return -EINVAL; - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { + pr_info_once("Per memcg swappiness does not exist in cgroup v2. 
" + "See memory.reclaim or memory.swap.max there\n "); WRITE_ONCE(memcg->swappiness, val); - else + } else WRITE_ONCE(vm_swappiness, val); return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 40c07b8699ae..421740f1bcdc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1759,7 +1759,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } struct memcg_stock_pcp { - local_lock_t stock_lock; + localtry_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1774,7 +1774,7 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE 0 }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { - .stock_lock = INIT_LOCAL_LOCK(stock_lock), + .stock_lock = INIT_LOCALTRY_LOCK(stock_lock), }; static DEFINE_MUTEX(percpu_charge_mutex); @@ -1786,6 +1786,7 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. * @nr_pages: how many pages to charge. + * @gfp_mask: allocation mask. * * The charges will only happen if @memcg matches the current cpu's memcg * stock, and at least @nr_pages are available in that stock. Failure to @@ -1793,7 +1794,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, * * returns true if successful, false otherwise. 
*/ -static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, + gfp_t gfp_mask) { struct memcg_stock_pcp *stock; unsigned int stock_pages; @@ -1803,7 +1805,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + if (!gfpflags_allow_spinning(gfp_mask)) + return ret; + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + } stock = this_cpu_ptr(&memcg_stock); stock_pages = READ_ONCE(stock->nr_pages); @@ -1812,7 +1818,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -1851,14 +1857,14 @@ static void drain_local_stock(struct work_struct *dummy) * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); old = drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); } @@ -1888,9 +1894,20 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { unsigned long flags; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + /* + * In case of unlikely failure to lock percpu stock_lock + * uncharge memcg directly. 
+ */ + if (mem_cgroup_is_root(memcg)) + return; + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); + return; + } __refill_stock(memcg, nr_pages); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @@ -1947,9 +1964,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) stock = &per_cpu(memcg_stock, cpu); /* drain_obj_stock requires stock_lock */ - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); old = drain_obj_stock(stock); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); drain_stock(stock); obj_cgroup_put(old); @@ -2242,9 +2259,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long pflags; retry: - if (consume_stock(memcg, nr_pages)) + if (consume_stock(memcg, nr_pages, gfp_mask)) return 0; + if (!gfpflags_allow_spinning(gfp_mask)) + /* Avoid the refill and flush of the older stock */ + batch = nr_pages; + if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) @@ -2766,7 +2787,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, unsigned long flags; int *bytes; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); /* @@ -2815,7 +2836,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) __mod_objcg_mlstate(objcg, pgdat, idx, nr); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); } @@ -2825,7 +2846,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) unsigned long flags; bool 
ret = false; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { @@ -2833,7 +2854,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) ret = true; } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -2925,7 +2946,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, unsigned long flags; unsigned int nr_pages = 0; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ @@ -2939,7 +2960,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); if (nr_pages) diff --git a/mm/memory.c b/mm/memory.c index 3900225d99c5..6ea3551eb2df 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -76,7 +76,6 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> -#include <linux/fsnotify.h> #include <trace/events/kmem.h> @@ -1357,12 +1356,12 @@ int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; - unsigned long next; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; + unsigned long next, pfn; bool is_cow; int ret; @@ -1373,11 +1372,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); 
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - /* - * We do not free on error cases below as remove_vma - * gets called on error from higher level routine - */ - ret = track_pfn_copy(src_vma); + ret = track_pfn_copy(dst_vma, src_vma, &pfn); if (ret) return ret; } @@ -1414,7 +1409,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { - untrack_pfn_clear(dst_vma); ret = -ENOMEM; break; } @@ -1424,6 +1418,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } + if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) + untrack_pfn_copy(dst_vma, pfn); return ret; } @@ -4432,7 +4428,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->page = pfn_swap_entry_to_page(entry); ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { - struct dev_pagemap *pgmap; if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* * migrate_to_ram is not yet ready to operate @@ -4455,11 +4450,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * Get a page reference while we know the page can't be * freed. 
*/ - get_page(vmf->page); - pte_unmap_unlock(vmf->pte, vmf->ptl); - pgmap = page_pgmap(vmf->page); - ret = pgmap->ops->migrate_to_ram(vmf); - put_page(vmf->page); + if (trylock_page(vmf->page)) { + struct dev_pagemap *pgmap; + + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + pgmap = page_pgmap(vmf->page); + ret = pgmap->ops->migrate_to_ram(vmf); + unlock_page(vmf->page); + put_page(vmf->page); + } else { + pte_unmap_unlock(vmf->pte, vmf->ptl); + } } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_pte_marker_entry(entry)) { @@ -5856,17 +5858,8 @@ out_map: static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); - /* - * Currently we just emit PAGE_SIZE for our fault events, so don't allow - * a huge fault if we have a pre content watch on this file. This would - * be trivial to support, but there would need to be tests to ensure - * this works properly and those don't exist currently. - */ - if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) - return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; @@ -5890,9 +5883,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) } if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { - /* See comment in create_huge_pmd. */ - if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) - goto split; if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) @@ -5915,9 +5905,6 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; - /* See comment in create_huge_pmd. 
*/ - if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) - return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -5935,9 +5922,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) if (vma_is_anonymous(vma)) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { - /* See comment in create_huge_pmd. */ - if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode))) - goto split; if (vma->vm_ops->huge_fault) { ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) @@ -7012,6 +6996,124 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, } EXPORT_SYMBOL_GPL(access_process_vm); +#ifdef CONFIG_BPF_SYSCALL +/* + * Copy a string from another process's address space as given in mm. + * If there is any error return -EFAULT. + */ +static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + void *old_buf = buf; + int err = 0; + + *(char *)buf = '\0'; + + if (mmap_read_lock_killable(mm)) + return -EFAULT; + + addr = untagged_addr_remote(mm, addr); + + /* Avoid triggering the temporary warning in __get_user_pages */ + if (!vma_lookup(mm, addr)) { + err = -EFAULT; + goto out; + } + + while (len) { + int bytes, offset, retval; + void *maddr; + struct page *page; + struct vm_area_struct *vma = NULL; + + page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + if (IS_ERR(page)) { + /* + * Treat as a total failure for now until we decide how + * to handle the CONFIG_HAVE_IOREMAP_PROT case and + * stack expansion. 
+ */ + *(char *)buf = '\0'; + err = -EFAULT; + goto out; + } + + bytes = len; + offset = addr & (PAGE_SIZE - 1); + if (bytes > PAGE_SIZE - offset) + bytes = PAGE_SIZE - offset; + + maddr = kmap_local_page(page); + retval = strscpy(buf, maddr + offset, bytes); + if (retval >= 0) { + /* Found the end of the string */ + buf += retval; + unmap_and_put_page(page, maddr); + break; + } + + buf += bytes - 1; + /* + * Because strscpy always NUL terminates we need to + * copy the last byte in the page if we are going to + * load more pages + */ + if (bytes != len) { + addr += bytes - 1; + copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1); + buf += 1; + addr += 1; + } + len -= bytes; + + unmap_and_put_page(page, maddr); + } + +out: + mmap_read_unlock(mm); + if (err) + return err; + return buf - old_buf; +} + +/** + * copy_remote_vm_str - copy a string from another process's address space. + * @tsk: the task of the target address space + * @addr: start address to read from + * @buf: destination buffer + * @len: number of bytes to copy + * @gup_flags: flags modifying lookup behaviour + * + * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from @addr (source) to @buf (destination); + * not including the trailing NUL. Always guaranteed to leave NUL-terminated + * buffer. On any error, return -EFAULT. + */ +int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + if (unlikely(len == 0)) + return 0; + + mm = get_task_mm(tsk); + if (!mm) { + *(char *)buf = '\0'; + return -EFAULT; + } + + ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags); + + mmput(mm); + + return ret; +} +EXPORT_SYMBOL_GPL(copy_remote_vm_str); +#endif /* CONFIG_BPF_SYSCALL */ + /* * Print the name of a VMA. 
 */ @@ -7044,10 +7146,8 @@ void __might_fault(const char *file, int line) if (pagefault_disabled()) return; __might_sleep(file, line); -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) if (current->mm) might_lock_read(&current->mm->mmap_lock); -#endif } EXPORT_SYMBOL(__might_fault); #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 530e71fe9147..b28a1e6ae096 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state) } EXPORT_SYMBOL_GPL(numa_nearest_node); +/** + * nearest_node_nodemask - Find the node in @mask at the nearest distance + * from @node. + * + * @node: a valid node ID to start the search from. + * @mask: a pointer to a nodemask representing the allowed nodes. + * + * This function iterates over all nodes in @mask and calculates the + * distance from the starting @node, then it returns the node ID that is + * the closest to @node, or MAX_NUMNODES if no node is found. + * + * Note that @node must be a valid node ID usable with node_distance(), + * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes + * or unexpected behavior. + */ +int nearest_node_nodemask(int node, nodemask_t *mask) +{ + int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES; + + for_each_node_mask(n, *mask) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(nearest_node_nodemask); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 7d0d64f67cdf..3158afe7eb23 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; + struct folio *fault_folio = migrate->fault_page ? 
+ page_folio(migrate->fault_page) : NULL; struct vm_area_struct *vma = walk->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = start, unmapped = 0; @@ -88,11 +90,16 @@ again: folio_get(folio); spin_unlock(ptl); + /* FIXME: we don't expect THP for fault_folio */ + if (WARN_ON_ONCE(fault_folio == folio)) + return migrate_vma_collect_skip(start, end, + walk); if (unlikely(!folio_trylock(folio))) return migrate_vma_collect_skip(start, end, walk); ret = split_folio(folio); - folio_unlock(folio); + if (fault_folio != folio) + folio_unlock(folio); folio_put(folio); if (ret) return migrate_vma_collect_skip(start, end, @@ -198,7 +205,7 @@ again: * optimisation to avoid walking the rmap later with * try_to_migrate(). */ - if (folio_trylock(folio)) { + if (fault_folio == folio || folio_trylock(folio)) { bool anon_exclusive; pte_t swp_pte; @@ -210,7 +217,8 @@ again: if (folio_try_share_anon_rmap_pte(folio, page)) { set_pte_at(mm, addr, ptep, pte); - folio_unlock(folio); + if (fault_folio != folio) + folio_unlock(folio); folio_put(folio); mpfn = 0; goto next; @@ -369,6 +377,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, unsigned long npages, struct page *fault_page) { + struct folio *fault_folio = fault_page ? 
+ page_folio(fault_page) : NULL; unsigned long i, restore = 0; bool allow_drain = true; unsigned long unmapped = 0; @@ -433,7 +443,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, remove_migration_ptes(folio, folio, 0); src_pfns[i] = 0; - folio_unlock(folio); + if (fault_folio != folio) + folio_unlock(folio); folio_put(folio); restore--; } @@ -542,6 +553,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (args->fault_page && !is_device_private_page(args->fault_page)) return -EINVAL; + if (args->fault_page && !PageLocked(args->fault_page)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -805,19 +818,13 @@ void migrate_vma_pages(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_pages); -/* - * migrate_device_finalize() - complete page migration - * @src_pfns: src_pfns returned from migrate_device_range() - * @dst_pfns: array of pfns allocated by the driver to migrate memory to - * @npages: number of pages in the range - * - * Completes migration of the page by removing special migration entries. - * Drivers must ensure copying of page data is complete and visible to the CPU - * before calling this. - */ -void migrate_device_finalize(unsigned long *src_pfns, - unsigned long *dst_pfns, unsigned long npages) +static void __migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, + unsigned long npages, + struct page *fault_page) { + struct folio *fault_folio = fault_page ? 
+ page_folio(fault_page) : NULL; unsigned long i; for (i = 0; i < npages; i++) { @@ -830,6 +837,7 @@ void migrate_device_finalize(unsigned long *src_pfns, if (!page) { if (dst) { + WARN_ON_ONCE(fault_folio == dst); folio_unlock(dst); folio_put(dst); } @@ -840,6 +848,7 @@ void migrate_device_finalize(unsigned long *src_pfns, if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) { if (dst) { + WARN_ON_ONCE(fault_folio == dst); folio_unlock(dst); folio_put(dst); } @@ -849,15 +858,33 @@ void migrate_device_finalize(unsigned long *src_pfns, if (!folio_is_zone_device(dst)) folio_add_lru(dst); remove_migration_ptes(src, dst, 0); - folio_unlock(src); + if (fault_folio != src) + folio_unlock(src); folio_put(src); if (dst != src) { + WARN_ON_ONCE(fault_folio == dst); folio_unlock(dst); folio_put(dst); } } } + +/* + * migrate_device_finalize() - complete page migration + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Completes migration of the page by removing special migration entries. + * Drivers must ensure copying of page data is complete and visible to the CPU + * before calling this. 
+ */ +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) +{ + return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL); +} EXPORT_SYMBOL(migrate_device_finalize); /** @@ -873,10 +900,27 @@ EXPORT_SYMBOL(migrate_device_finalize); */ void migrate_vma_finalize(struct migrate_vma *migrate) { - migrate_device_finalize(migrate->src, migrate->dst, migrate->npages); + __migrate_device_finalize(migrate->src, migrate->dst, migrate->npages, + migrate->fault_page); } EXPORT_SYMBOL(migrate_vma_finalize); +static unsigned long migrate_device_pfn_lock(unsigned long pfn) +{ + struct folio *folio; + + folio = folio_get_nontail_page(pfn_to_page(pfn)); + if (!folio) + return 0; + + if (!folio_trylock(folio)) { + folio_put(folio); + return 0; + } + + return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; +} + /** * migrate_device_range() - migrate device private pfns to normal memory. * @src_pfns: array large enough to hold migrating source device private pfns. @@ -901,29 +945,35 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start, { unsigned long i, pfn; - for (pfn = start, i = 0; i < npages; pfn++, i++) { - struct folio *folio; + for (pfn = start, i = 0; i < npages; pfn++, i++) + src_pfns[i] = migrate_device_pfn_lock(pfn); - folio = folio_get_nontail_page(pfn_to_page(pfn)); - if (!folio) { - src_pfns[i] = 0; - continue; - } + migrate_device_unmap(src_pfns, npages, NULL); - if (!folio_trylock(folio)) { - src_pfns[i] = 0; - folio_put(folio); - continue; - } + return 0; +} +EXPORT_SYMBOL(migrate_device_range); - src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; - } +/** + * migrate_device_pfns() - migrate device private pfns to normal memory. + * @src_pfns: pre-popluated array of source device private pfns to migrate. + * @npages: number of pages to migrate. + * + * Similar to migrate_device_range() but supports non-contiguous pre-popluated + * array of device pages to migrate. 
+ */ +int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages) +{ + unsigned long i; + + for (i = 0; i < npages; i++) + src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]); migrate_device_unmap(src_pfns, npages, NULL); return 0; } -EXPORT_SYMBOL(migrate_device_range); +EXPORT_SYMBOL(migrate_device_pfns); /* * Migrate a device coherent folio back to normal memory. The caller should have diff --git a/mm/mmap.c b/mm/mmap.c index efcc4ca7500d..bd210aaf7ebd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1544,6 +1544,57 @@ struct vm_area_struct *_install_special_mapping( &special_mapping_vmops); } +#ifdef CONFIG_SYSCTL +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) +int sysctl_legacy_va_layout; +#endif + +static const struct ctl_table mmap_table[] = { + { + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ + defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) + { + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif +}; +#endif /* CONFIG_SYSCTL */ + /* * initialise the 
percpu counter for VM */ @@ -1553,6 +1604,9 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", mmap_table); +#endif } /* diff --git a/mm/nommu.c b/mm/nommu.c index 15a396ce2553..617e7ba8022f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -43,7 +43,6 @@ #include "internal.h" unsigned long highest_memmap_pfn; -int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -386,6 +385,19 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) return mm->brk = brk; } +static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; + +static const struct ctl_table nommu_table[] = { + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +}; + /* * initialise the percpu counter for VM and region record slabs */ @@ -396,6 +408,7 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); + register_sysctl_init("vm", nommu_table); } /* @@ -1607,13 +1620,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL_GPL(filemap_fsnotify_fault); - vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); @@ -1702,6 +1708,85 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in } EXPORT_SYMBOL_GPL(access_process_vm); +#ifdef CONFIG_BPF_SYSCALL +/* + * Copy a string from another process's address space as given in mm. + * If there is any error return -EFAULT. 
+ */ +static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr, + void *buf, int len) +{ + unsigned long addr_end; + struct vm_area_struct *vma; + int ret = -EFAULT; + + *(char *)buf = '\0'; + + if (mmap_read_lock_killable(mm)) + return ret; + + /* the access must start within one of the target process's mappings */ + vma = find_vma(mm, addr); + if (!vma) + goto out; + + if (check_add_overflow(addr, len, &addr_end)) + goto out; + + /* don't overrun this mapping */ + if (addr_end > vma->vm_end) + len = vma->vm_end - addr; + + /* only read mappings where it is permitted */ + if (vma->vm_flags & VM_MAYREAD) { + ret = strscpy(buf, (char *)addr, len); + if (ret < 0) + ret = len - 1; + } + +out: + mmap_read_unlock(mm); + return ret; +} + +/** + * copy_remote_vm_str - copy a string from another process's address space. + * @tsk: the task of the target address space + * @addr: start address to read from + * @buf: destination buffer + * @len: number of bytes to copy + * @gup_flags: flags modifying lookup behaviour (unused) + * + * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from @addr (source) to @buf (destination); + * not including the trailing NUL. Always guaranteed to leave NUL-terminated + * buffer. On any error, return -EFAULT. 
+ */ +int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + if (unlikely(len == 0)) + return 0; + + mm = get_task_mm(tsk); + if (!mm) { + *(char *)buf = '\0'; + return -EFAULT; + } + + ret = __copy_remote_vm_str(mm, addr, buf, len); + + mmput(mm); + + return ret; +} +EXPORT_SYMBOL_GPL(copy_remote_vm_str); +#endif /* CONFIG_BPF_SYSCALL */ + /** * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode * @inode: The inode to check diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c01998cb3a0..f51aa6051a99 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -88,6 +88,9 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* Free the page without taking locks. Rely on trylock only. */ +#define FPI_TRYLOCK ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -1381,13 +1384,44 @@ static void split_large_buddy(struct zone *zone, struct page *page, } while (1); } +static void add_page_to_zone_llist(struct zone *zone, struct page *page, + unsigned int order) +{ + /* Remember the order */ + page->order = order; + /* Add the page to the free list */ + llist_add(&page->pcp_llist, &zone->trylock_free_pages); +} + static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, fpi_t fpi_flags) { + struct llist_head *llhead; unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + add_page_to_zone_llist(zone, page, order); + return; + } + spin_lock_irqsave(&zone->lock, flags); + } + + /* The lock succeeded. Process deferred pages. 
*/ + llhead = &zone->trylock_free_pages; + if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) { + struct llist_node *llnode; + struct page *p, *tmp; + + llnode = llist_del_all(llhead); + llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { + unsigned int p_order = p->order; + + split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); + __count_vm_events(PGFREE, 1 << p_order); + } + } split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); @@ -2280,7 +2314,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long flags; int i; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + return 0; + spin_lock_irqsave(&zone->lock, flags); + } for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, alloc_flags); @@ -2568,7 +2606,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, static void free_frozen_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, - unsigned int order) + unsigned int order, fpi_t fpi_flags) { int high, batch; int pindex; @@ -2603,6 +2641,14 @@ static void free_frozen_page_commit(struct zone *zone, } if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) pcp->free_count += (1 << order); + + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + /* + * Do not attempt to take a zone lock. Let pcp->count get + * over high mark temporarily. 
+ */ + return; + } high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), @@ -2617,7 +2663,8 @@ static void free_frozen_page_commit(struct zone *zone, /* * Free a pcp page */ -void free_frozen_pages(struct page *page, unsigned int order) +static void __free_frozen_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2626,7 +2673,7 @@ void free_frozen_pages(struct page *page, unsigned int order) int migratetype; if (!pcp_allowed_order(order)) { - __free_pages_ok(page, order, FPI_NONE); + __free_pages_ok(page, order, fpi_flags); return; } @@ -2644,23 +2691,33 @@ void free_frozen_pages(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, fpi_flags); return; } migratetype = MIGRATE_MOVABLE; } + if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT) + && (in_nmi() || in_hardirq()))) { + add_page_to_zone_llist(zone, page, order); + return; + } pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_frozen_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, fpi_flags); } pcp_trylock_finish(UP_flags); } +void free_frozen_pages(struct page *page, unsigned int order) +{ + __free_frozen_pages(page, order, FPI_NONE); +} + /* * Free a batch of folios */ @@ -2749,7 +2806,7 @@ void free_unref_folios(struct folio_batch *folios) trace_mm_page_free_batched(&folio->page); free_frozen_page_commit(zone, pcp, &folio->page, migratetype, - order); + 
order, FPI_NONE); } if (pcp) { @@ -2880,7 +2937,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, do { page = NULL; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + return NULL; + spin_lock_irqsave(&zone->lock, flags); + } if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { @@ -4640,7 +4701,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, might_alloc(gfp_mask); - if (should_fail_alloc_page(gfp_mask, order)) + /* + * Don't invoke should_fail logic, since it may call + * get_random_u32() and printk() which need to spin_lock. + */ + if (!(*alloc_flags & ALLOC_TRYLOCK) && + should_fail_alloc_page(gfp_mask, order)) return false; *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); @@ -4938,9 +5004,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) EXPORT_SYMBOL(get_zeroed_page_noprof); /** - * __free_pages - Free pages allocated with alloc_pages(). + * ___free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). * @order: The order of the allocation. + * @fpi_flags: Free Page Internal flags. * * This function can free multi-page allocations that are not compound * pages. It does not check that the @order passed in matches that of @@ -4957,21 +5024,36 @@ EXPORT_SYMBOL(get_zeroed_page_noprof); * Context: May be called in interrupt context or while holding a normal * spinlock, but not in NMI context or while holding a raw spinlock. 
*/ -void __free_pages(struct page *page, unsigned int order) +static void ___free_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) { /* get PageHead before we drop reference */ int head = PageHead(page); if (put_page_testzero(page)) - free_frozen_pages(page, order); + __free_frozen_pages(page, order, fpi_flags); else if (!head) { pgalloc_tag_sub_pages(page, (1 << order) - 1); while (order-- > 0) - free_frozen_pages(page + (1 << order), order); + __free_frozen_pages(page + (1 << order), order, + fpi_flags); } } +void __free_pages(struct page *page, unsigned int order) +{ + ___free_pages(page, order, FPI_NONE); +} EXPORT_SYMBOL(__free_pages); +/* + * Can be called while holding raw_spin_lock or from IRQ and NMI for any + * page type (not only those that came from try_alloc_pages) + */ +void free_pages_nolock(struct page *page, unsigned int order) +{ + ___free_pages(page, order, FPI_TRYLOCK); +} + void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { @@ -7221,3 +7303,94 @@ static bool __free_unaccepted(struct page *page) } #endif /* CONFIG_UNACCEPTED_MEMORY */ + +/** + * try_alloc_pages - opportunistic reentrant allocation from any context + * @nid: node to allocate from + * @order: allocation order size + * + * Allocates pages of a given order from the given node. This is safe to + * call from any context (from atomic, NMI, and also reentrant + * allocator -> tracepoint -> try_alloc_pages_noprof). + * Allocation is best effort and to be expected to fail easily so nobody should + * rely on the success. Failures are not reported via warn_alloc(). + * See always fail conditions below. + * + * Return: allocated page or NULL on failure. + */ +struct page *try_alloc_pages_noprof(int nid, unsigned int order) +{ + /* + * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. + * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd + * is not safe in arbitrary context. 
+ * + * These two are the conditions for gfpflags_allow_spinning() being true. + * + * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason + * to warn. Also warn would trigger printk() which is unsafe from + * various contexts. We cannot use printk_deferred_enter() to mitigate, + * since the running context is unknown. + * + * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below + * is safe in any context. Also zeroing the page is mandatory for + * BPF use cases. + * + * Though __GFP_NOMEMALLOC is not checked in the code path below, + * specify it here to highlight that try_alloc_pages() + * doesn't want to deplete reserves. + */ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC + | __GFP_ACCOUNT; + unsigned int alloc_flags = ALLOC_TRYLOCK; + struct alloc_context ac = { }; + struct page *page; + + /* + * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is + * unsafe in NMI. If spin_trylock() is called from hard IRQ the current + * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will + * mark the task as the owner of another rt_spin_lock which will + * confuse PI logic, so return immediately if called form hard IRQ or + * NMI. + * + * Note, irqs_disabled() case is ok. This function can be called + * from raw_spin_lock_irqsave region. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + return NULL; + if (!pcp_allowed_order(order)) + return NULL; + +#ifdef CONFIG_UNACCEPTED_MEMORY + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ + if (has_unaccepted_memory()) + return NULL; +#endif + /* Bailout, since _deferred_grow_zone() needs to take a lock */ + if (deferred_pages_enabled()) + return NULL; + + if (nid == NUMA_NO_NODE) + nid = numa_node_id(); + + prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac, + &alloc_gfp, &alloc_flags); + + /* + * Best effort allocation from percpu free list. + * If it's empty attempt to spin_trylock zone->lock. 
+ */ + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); + + /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ + + if (memcg_kmem_online() && page && + unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { + free_pages_nolock(page, order); + page = NULL; + } + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); + return page; +} diff --git a/mm/page_owner.c b/mm/page_owner.c index 849d4a471b6c..cc4a6916eec6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -299,7 +299,13 @@ void __reset_page_owner(struct page *page, unsigned short order) alloc_handle = page_owner->handle; page_ext_put(page_ext); - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + /* + * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false + * to prevent issues in stack_depot_save(). + * This is similar to try_alloc_pages() gfp flags, but only used + * to signal stack_depot to avoid spin_locks. + */ + handle = save_stack(__GFP_NOWARN); __update_page_owner_free_handle(page, handle, order, current->pid, current->tgid, free_ts_nsec); diff --git a/mm/percpu.c b/mm/percpu.c index 027fb6497495..b35494c8ede2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3077,7 +3077,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, continue; } /* copy and return the unused part */ - memcpy(ptr, __per_cpu_load, ai->static_size); + memcpy(ptr, __per_cpu_start, ai->static_size); pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3246,7 +3246,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size); /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); + memcpy((void *)unit_addr, __per_cpu_start, ai->static_size); } /* we're ready, commit */ diff --git a/mm/readahead.c b/mm/readahead.c index 220155a5c964..6a4e96b69702 100644 --- 
a/mm/readahead.c +++ b/mm/readahead.c @@ -128,7 +128,6 @@ #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> -#include <linux/fsnotify.h> #include "internal.h" @@ -559,15 +558,6 @@ void page_cache_sync_ra(struct readahead_control *ractl, pgoff_t prev_index, miss; /* - * If we have pre-content watches we need to disable readahead to make - * sure that we don't find 0 filled pages in cache that we never emitted - * events for. Filesystems supporting HSM must make sure to not call - * this function with ractl->file unset for files handled by HSM. - */ - if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode))) - return; - - /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the @@ -645,10 +635,6 @@ void page_cache_async_ra(struct readahead_control *ractl, if (!ra->ra_pages) return; - /* See the comment in page_cache_sync_ra. */ - if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode))) - return; - /* * Same bit is used for PG_readahead and PG_reclaim. 
*/ diff --git a/mm/shmem.c b/mm/shmem.c index 17f27d92c664..99327c30507c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3899,16 +3899,16 @@ out_iput: return error; } -static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) - return error; + return ERR_PTR(error); inc_nlink(dir); - return 0; + return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/mm/slab.h b/mm/slab.h index e9fd9bf0bfa6..05a21dc796e0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } -/* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ - SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ + SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) -#ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) -#else -#define SLAB_DEBUG_FLAGS (0) -#endif -#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | \ - SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) - -/* Common flags available with current configuration */ -#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) - -/* Common flags permitted for kmem_cache_create */ -#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ - SLAB_RED_ZONE | \ - SLAB_POISON | \ - SLAB_STORE_USER | \ - SLAB_TRACE | \ - SLAB_CONSISTENCY_CHECKS | \ - 
SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | \ - SLAB_ACCOUNT | \ - SLAB_KMALLOC | \ - SLAB_NO_MERGE | \ - SLAB_NO_USER_FLAGS) +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); @@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif +void kvfree_rcu_cb(struct rcu_head *head); + size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index 4c9f0a87f733..5be257e03c7c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -298,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, static_branch_enable(&slub_debug_enabled); if (flags & SLAB_STORE_USER) stack_depot_init(); +#else + flags &= ~SLAB_DEBUG_FLAGS; #endif mutex_lock(&slab_mutex); @@ -307,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, goto out_unlock; } - /* Refuse requests with allocator specific flags */ if (flags & ~SLAB_FLAGS_PERMITTED) { err = -EINVAL; goto out_unlock; } - /* - * Some allocators will constraint the set of valid flags to a subset - * of all flags. We expect them to define CACHE_CREATE_MASK in this - * case, and we'll just provide them with a sanitized version of the - * passed flags. - */ - flags &= CACHE_CREATE_MASK; - /* Fail closed on bad usersize of useroffset values. */ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || WARN_ON(!args->usersize && args->useroffset) || @@ -1284,6 +1277,29 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +#ifndef CONFIG_KVFREE_RCU_BATCHED + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + if (head) { + kasan_record_aux_stack(ptr); + call_rcu(head, kvfree_rcu_cb); + return; + } + + // kvfree_rcu(one_arg) call. 
+ might_sleep(); + synchronize_rcu(); + kvfree(ptr); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +void __init kvfree_rcu_init(void) +{ +} + +#else /* CONFIG_KVFREE_RCU_BATCHED */ + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached @@ -1534,8 +1550,7 @@ kvfree_rcu_list(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kvfree_callback("slab", head, offset); - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -1863,8 +1878,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return true; } -#if !defined(CONFIG_TINY_RCU) - static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t) { @@ -1889,8 +1902,8 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) &krcp->page_cache_work, msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); } } @@ -2073,8 +2086,6 @@ void kvfree_rcu_barrier(void) } EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); -#endif /* #if !defined(CONFIG_TINY_RCU) */ - static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -2168,3 +2179,6 @@ void __init kvfree_rcu_init(void) shrinker_register(kfree_rcu_shrinker); } + +#endif /* CONFIG_KVFREE_RCU_BATCHED */ + diff --git a/mm/slub.c b/mm/slub.c index 184fd2b14758..b46f87662e71 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" +#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> @@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object) 
set_orig_size(s, (void *)object, s->object_size); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) +static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { struct va_format vaf; va_list args; - va_start(args, fmt); + va_copy(args, argsp); vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); } +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); +} + __printf(2, 3) -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); - - dump_stack(); } static void object_err(struct kmem_cache *s, struct slab *slab, - u8 *object, char *reason) + u8 *object, const char *reason) { if (slab_add_kunit_errors()) return; - slab_bug(s, "%s", reason); + slab_bug(s, reason); print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, @@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, return false; } +static void __slab_err(struct slab *slab) +{ + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, 
const char *fmt, ...) { va_list args; - char buf[100]; if (slab_add_kunit_errors()) return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); - print_slab_info(slab); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); @@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data, static pad_check_attributes int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) + u8 *object, const char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; @@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab, if (slab_add_kunit_errors()) goto skip_bug_print; - slab_bug(s, "%s overwritten", what); - pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault - addr, - fault[0], value); + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) return 1; return check_bytes_and_report(s, slab, p, "Object padding", - p + off, POISON_INUSE, size_from_object(s) - off); + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ @@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", - fault, end - 1, fault - start); + slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); } @@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", - object - s->red_left_pad, val, s->red_left_pad)) + object - s->red_left_pad, val, s->red_left_pad, ret)) ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", - endobject, val, s->inuse - s->object_size)) + endobject, val, s->inuse - s->object_size, ret)) ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { @@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->object_size > orig_size && !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, - val, s->object_size - orig_size)) { + val, s->object_size - orig_size, ret)) { ret = 0; } } @@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { if 
(!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size)) + s->inuse - s->object_size, ret)) ret = 0; } } @@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (kasan_meta_size < s->object_size - 1 && !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, - s->object_size - kasan_meta_size - 1)) + s->object_size - kasan_meta_size - 1, ret)) ret = 0; if (kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End Poison", - p + s->object_size - 1, POISON_END, 1)) + p + s->object_size - 1, POISON_END, 1, ret)) ret = 0; } /* @@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab, ret = 0; } - if (!ret && !slab_in_kunit_test()) { - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); - } - return ret; } @@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. 
*/ -static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; @@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) fp = slab->freelist; while (fp && nr <= slab->objects) { if (fp == search) - return 1; + return true; if (!check_valid_pointer(s, slab, fp)) { if (object) { object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); + break; } else { slab_err(s, slab, "Freepointer corrupt"); slab->freelist = NULL; slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s, slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", object); } else if (!slab->slab_cache) { - pr_err("SLUB <none>: no slab for object 0x%p.\n", - object); - dump_stack(); - } else + slab_err(NULL, slab, "No slab cache for object 0x%p", + object); + } else { object_err(s, slab, object, - "page slab pointer corrupt."); + "page slab pointer corrupt."); + } return 0; } return 1; @@ -4254,6 +4277,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); + __folio_set_large_kmalloc(folio); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -4729,6 +4753,11 @@ static void free_large_kmalloc(struct folio *folio, void *object) { unsigned int order = folio_order(folio); + if 
(WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { + dump_page(&folio->page, "Not a kmalloc allocation"); + return; + } + if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); @@ -4738,9 +4767,55 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __folio_clear_large_kmalloc(folio); folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. + */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() @@ -4891,6 +4966,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc_noprof); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. 
+ * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. 
+ * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); + if (ret || size <= PAGE_SIZE) + return ret; + + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. 
+ */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. 
*/ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; @@ -5583,14 +5820,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) return !!oo_objects(s->oo); } -static void list_slab_objects(struct kmem_cache *s, struct slab *slab, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); void *p; - slab_err(s, slab, text, s->name); + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); spin_lock(&object_map_lock); __fill_map(object_map, s, slab); @@ -5605,6 +5842,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, } } spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } @@ -5625,8 +5864,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) remove_partial(n, slab); list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, slab, - "Objects remaining in %s on __kmem_cache_shutdown()"); + list_slab_objects(s, slab); } } spin_unlock_irq(&n->list_lock); diff --git a/mm/swap.c b/mm/swap.c index 7523b65d8caa..77b2d5997873 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -45,7 +45,7 @@ /* How many pages do we try to swap or page in/out together? 
As a power of 2 */ int page_cluster; -const int page_cluster_max = 31; +static const int page_cluster_max = 31; struct cpu_fbatches { /* @@ -1074,6 +1074,18 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } +static const struct ctl_table swap_sysctl_table[] = { + { + .procname = "page-cluster", + .data = &page_cluster, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&page_cluster_max, + } +}; + /* * Perform any setup for the swap system */ @@ -1090,4 +1102,6 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ + + register_sysctl_init("vm", swap_sysctl_table); } diff --git a/mm/swap.h b/mm/swap.h index 0abb68091b4f..6f4a3f927edb 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -3,6 +3,7 @@ #define _MM_SWAP_H struct mempolicy; +extern int page_cluster; #ifdef CONFIG_SWAP #include <linux/swapops.h> /* for swp_offset */ diff --git a/mm/usercopy.c b/mm/usercopy.c index 83c164aba6e0..dbdcc43964fb 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -17,7 +17,7 @@ #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> -#include <linux/thread_info.h> +#include <linux/ucopysize.h> #include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> @@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n, } } -static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); +DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, + validate_usercopy_range); +EXPORT_SYMBOL(validate_usercopy_range); /* * Validates that the given object is: @@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); */ void __check_object_size(const void *ptr, unsigned long n, bool to_user) { - if (static_branch_unlikely(&bypass_usercopy_checks)) - return; - /* Skip all tests if size is zero. 
*/ if (!n) return; @@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) } EXPORT_SYMBOL(__check_object_size); -static bool enable_checks __initdata = true; +static bool enable_checks __initdata = + IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON); static int __init parse_hardened_usercopy(char *str) { @@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy); static int __init set_hardened_usercopy(void) { - if (enable_checks == false) - static_branch_enable(&bypass_usercopy_checks); + if (enable_checks) + static_branch_enable(&validate_usercopy_range); + else + static_branch_disable(&validate_usercopy_range); return 1; } diff --git a/mm/util.c b/mm/util.c index b6b9684a1438..448117da071f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -12,6 +12,7 @@ #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/sysctl.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> @@ -23,6 +24,7 @@ #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> +#include <linux/fsnotify.h> #include <linux/uaccess.h> @@ -569,6 +571,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); + if (!ret) + ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; @@ -612,168 +616,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); -static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) -{ - /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. 
- */ - if (size > PAGE_SIZE) { - flags |= __GFP_NOWARN; - - if (!(flags & __GFP_RETRY_MAYFAIL)) - flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - flags &= ~__GFP_NOFAIL; - } - - return flags; -} - -/** - * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @b: which set of kmalloc buckets to allocate from. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. - * @node: numa node to allocate from - * - * Uses kmalloc to get the memory but if the allocation fails then falls back - * to the vmalloc allocator. Use kvfree for freeing the memory. - * - * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. - * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is - * preferable to the vmalloc fallback, due to visible performance drawbacks. - * - * Return: pointer to the allocated memory of %NULL in case of failure - */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) -{ - void *ret; - - /* - * It doesn't really make sense to fallback to vmalloc for sub page - * requests - */ - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), - kmalloc_gfp_adjust(flags, size), - node); - if (ret || size <= PAGE_SIZE) - return ret; - - /* non-sleeping allocations are not supported by vmalloc */ - if (!gfpflags_allow_blocking(flags)) - return NULL; - - /* Don't even allow crazy sizes */ - if (unlikely(size > INT_MAX)) { - WARN_ON_ONCE(!(flags & __GFP_NOWARN)); - return NULL; - } - - /* - * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, - * since the callers already cannot assume anything - * about the resulting pointer, and cannot play - * protection games. 
- */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - node, __builtin_return_address(0)); -} -EXPORT_SYMBOL(__kvmalloc_node_noprof); - -/** - * kvfree() - Free memory. - * @addr: Pointer to allocated memory. - * - * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). - * It is slightly more efficient to use kfree() or vfree() if you are certain - * that you know which one to use. - * - * Context: Either preemptible task context or not-NMI interrupt. - */ -void kvfree(const void *addr) -{ - if (is_vmalloc_addr(addr)) - vfree(addr); - else - kfree(addr); -} -EXPORT_SYMBOL(kvfree); - -/** - * kvfree_sensitive - Free a data object containing sensitive information. - * @addr: address of the data object to be freed. - * @len: length of the data object. - * - * Use the special memzero_explicit() function to clear the content of a - * kvmalloc'ed object containing sensitive data to make sure that the - * compiler won't optimize out the data clearing. - */ -void kvfree_sensitive(const void *addr, size_t len) -{ - if (likely(!ZERO_OR_NULL_PTR(addr))) { - memzero_explicit((void *)addr, len); - kvfree(addr); - } -} -EXPORT_SYMBOL(kvfree_sensitive); - -/** - * kvrealloc - reallocate memory; contents remain unchanged - * @p: object to reallocate memory for - * @size: the size to reallocate - * @flags: the flags for the page level allocator - * - * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 - * and @p is not a %NULL pointer, the object pointed to is freed. - * - * If __GFP_ZERO logic is requested, callers must ensure that, starting with the - * initial memory allocation, every subsequent call to this API for the same - * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that - * __GFP_ZERO is not fully honored by this API. 
- * - * In any case, the contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. - * - * This function must not be called concurrently with itself or kvfree() for the - * same memory allocation. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) -{ - void *n; - - if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); - - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); - if (!n) { - /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); - if (!n) - return NULL; - - if (p) { - /* We already know that `p` is not a vmalloc address. */ - kasan_disable_current(); - memcpy(n, kasan_reset_tag(p), ksize(p)); - kasan_enable_current(); - - kfree(p); - } - } - - return n; -} -EXPORT_SYMBOL(kvrealloc_noprof); - /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. 
@@ -906,14 +748,16 @@ int folio_mc_copy(struct folio *dst, struct folio *src) EXPORT_SYMBOL(folio_mc_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; -int sysctl_overcommit_ratio __read_mostly = 50; -unsigned long sysctl_overcommit_kbytes __read_mostly; +static int sysctl_overcommit_ratio __read_mostly = 50; +static unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SYSCTL + +static int overcommit_ratio_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -928,8 +772,8 @@ static void sync_overcommit_as(struct work_struct *dummy) percpu_counter_sync(&vm_committed_as); } -int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int overcommit_policy_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; @@ -964,8 +808,8 @@ int overcommit_policy_handler(const struct ctl_table *table, int write, void *bu return ret; } -int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int overcommit_kbytes_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -975,6 +819,54 @@ int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *bu return ret; } +static const struct ctl_table util_sysctl_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = 
overcommit_policy_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, + }, + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +}; + +static int __init init_vm_util_sysctls(void) +{ + register_sysctl_init("vm", util_sysctl_table); + return 0; +} +subsys_initcall(init_vm_util_sysctls); +#endif /* CONFIG_SYSCTL */ + /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 2b2ab386cab5..b620d74b0f66 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7432,6 +7432,28 @@ void __meminit kswapd_stop(int nid) pgdat_kswapd_unlock(pgdat); } +static const struct ctl_table vmscan_sysctl_table[] = { + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO_HUNDRED, + }, +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &node_reclaim_mode, + .maxlen = sizeof(node_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + } +#endif +}; + static int __init kswapd_init(void) { int nid; @@ -7439,6 +7461,7 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); + 
register_sysctl_init("vm", vmscan_sysctl_table); return 0; } diff --git a/mm/vmstat.c b/mm/vmstat.c index ab5c840941f3..4c268ce39ff2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -31,8 +31,10 @@ #include "internal.h" +#ifdef CONFIG_PROC_FS #ifdef CONFIG_NUMA -int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; +#define ENABLE_NUMA_STAT 1 +static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; /* zero numa counters within a zone */ static void zero_zone_numa_counters(struct zone *zone) @@ -74,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); -int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, +static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -102,6 +104,7 @@ out: return ret; } #endif +#endif /* CONFIG_PROC_FS */ #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -1440,6 +1443,8 @@ const char * const vmstat_text[] = { #ifdef CONFIG_X86 "direct_map_level2_splits", "direct_map_level3_splits", + "direct_map_level2_collapses", + "direct_map_level3_collapses", #endif #ifdef CONFIG_PER_VMA_LOCK_STATS "vma_lock_success", @@ -1943,7 +1948,7 @@ static const struct seq_operations vmstat_op = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); -int sysctl_stat_interval __read_mostly = HZ; +static int sysctl_stat_interval __read_mostly = HZ; static int vmstat_late_init_done; #ifdef CONFIG_PROC_FS @@ -1952,7 +1957,7 @@ static void refresh_vm_stats(struct work_struct *work) refresh_cpu_vm_stats(true); } -int vmstat_refresh(const struct ctl_table *table, int write, +static int vmstat_refresh(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { long val; @@ -2201,6 +2206,38 @@ static int __init vmstat_late_init(void) late_initcall(vmstat_late_init); #endif +#ifdef CONFIG_PROC_FS +static const struct ctl_table vmstat_table[] = { 
+#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif +}; +#endif + struct workqueue_struct *mm_percpu_wq; void __init init_mm_internals(void) @@ -2232,6 +2269,7 @@ void __init init_mm_internals(void) proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); proc_create_seq("vmstat", 0444, NULL, &vmstat_op); proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); + register_sysctl_init("vm", vmstat_table); #endif } |