summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   4
-rw-r--r--  mm/execmem.c        |  39
-rw-r--r--  mm/filemap.c        | 106
-rw-r--r--  mm/folio-compat.c   |  14
-rw-r--r--  mm/gup.c            |   6
-rw-r--r--  mm/internal.h       |  11
-rw-r--r--  mm/memblock.c       |  66
-rw-r--r--  mm/memcontrol-v1.c  |   6
-rw-r--r--  mm/memcontrol.c     |  57
-rw-r--r--  mm/memory.c         | 168
-rw-r--r--  mm/mempolicy.c      |  31
-rw-r--r--  mm/migrate_device.c | 116
-rw-r--r--  mm/mmap.c           |  54
-rw-r--r--  mm/nommu.c          | 101
-rw-r--r--  mm/page_alloc.c     | 203
-rw-r--r--  mm/page_owner.c     |   8
-rw-r--r--  mm/percpu.c         |   4
-rw-r--r--  mm/readahead.c      |  14
-rw-r--r--  mm/shmem.c          |   8
-rw-r--r--  mm/slab.h           |  34
-rw-r--r--  mm/slab_common.c    |  48
-rw-r--r--  mm/slub.c           | 336
-rw-r--r--  mm/swap.c           |  16
-rw-r--r--  mm/swap.h           |   1
-rw-r--r--  mm/usercopy.c       |  18
-rw-r--r--  mm/util.c           | 232
-rw-r--r--  mm/vmscan.c         |  23
-rw-r--r--  mm/vmstat.c         |  46
28 files changed, 1231 insertions, 539 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 4a4e7b63d30a..d3fb3762887b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -195,6 +195,10 @@ menu "Slab allocator options"
config SLUB
def_bool y
+config KVFREE_RCU_BATCHED
+ def_bool y
+ depends on !SLUB_TINY && !TINY_RCU
+
config SLUB_TINY
bool "Configure for minimal memory footprint"
depends on EXPERT
diff --git a/mm/execmem.c b/mm/execmem.c
index 317b6a8d35be..e6c4f5076ca8 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -257,7 +257,6 @@ out_unlock:
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
- unsigned long start, end;
struct vm_struct *vm;
size_t alloc_size;
int err = -ENOMEM;
@@ -275,26 +274,18 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- start = (unsigned long)p;
- end = start + alloc_size;
-
- vunmap_range(start, end);
-
- err = execmem_set_direct_map_valid(vm, false);
- if (err)
- goto err_free_mem;
-
- err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages,
- PMD_SHIFT);
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
if (err)
goto err_free_mem;
err = execmem_cache_add(p, alloc_size);
if (err)
- goto err_free_mem;
+ goto err_reset_direct_map;
return 0;
+err_reset_direct_map:
+ execmem_set_direct_map_valid(vm, true);
err_free_mem:
vfree(p);
return err;
@@ -344,6 +335,28 @@ static bool execmem_cache_free(void *ptr)
return true;
}
+
+int execmem_make_temp_rw(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+ int ret;
+
+ ret = set_memory_nx(addr, nr);
+ if (ret)
+ return ret;
+
+ return set_memory_rw(addr, nr);
+}
+
+int execmem_restore_rox(void *ptr, size_t size)
+{
+ unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ unsigned long addr = (unsigned long)ptr;
+
+ return set_memory_rox(addr, nr);
+}
+
#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index cc69f174f76b..b5e784f34d98 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,7 +47,7 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
+#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -1066,6 +1066,19 @@ static wait_queue_head_t *folio_waitqueue(struct folio *folio)
return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}
+/* How many times do we accept lock stealing from under a waiter? */
+static int sysctl_page_lock_unfairness = 5;
+static const struct ctl_table filemap_sysctl_table[] = {
+ {
+ .procname = "page_lock_unfairness",
+ .data = &sysctl_page_lock_unfairness,
+ .maxlen = sizeof(sysctl_page_lock_unfairness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ }
+};
+
void __init pagecache_init(void)
{
int i;
@@ -1074,6 +1087,7 @@ void __init pagecache_init(void)
init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
+ register_sysctl_init("vm", filemap_sysctl_table);
}
/*
@@ -1221,9 +1235,6 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
return true;
}
-/* How many times do we accept lock stealing from under a waiter? */
-int sysctl_page_lock_unfairness = 5;
-
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
int state, enum behavior behavior)
{
@@ -1367,7 +1378,7 @@ repeat:
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
- * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
+ * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
* this can be called without taking a reference on the page. Instead this
* should be called while holding the ptl for the migration entry referencing
* the page.
@@ -3197,14 +3208,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
- /*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't populate our mapping with 0 filled pages that we
- * never emitted an event for.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@@ -3273,10 +3276,6 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
unsigned int mmap_miss;
- /* See comment in do_sync_mmap_readahead. */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
@@ -3336,48 +3335,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
}
/**
- * filemap_fsnotify_fault - maybe emit a pre-content event.
- * @vmf: struct vm_fault containing details of the fault.
- *
- * If we have a pre-content watch on this file we will emit an event for this
- * range. If we return anything the fault caller should return immediately, we
- * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
- * fault again and then the fault handler will run the second time through.
- *
- * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
- */
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- struct file *fpin = NULL;
- int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
- loff_t pos = vmf->pgoff >> PAGE_SHIFT;
- size_t count = PAGE_SIZE;
- int err;
-
- /*
- * We already did this and now we're retrying with everything locked,
- * don't emit the event and continue.
- */
- if (vmf->flags & FAULT_FLAG_TRIED)
- return 0;
-
- /* No watches, we're done. */
- if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
- return 0;
-
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- return VM_FAULT_SIGBUS;
-
- err = fsnotify_file_area_perm(fpin, mask, &pos, count);
- fput(fpin);
- if (err)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_RETRY;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
-/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
@@ -3481,37 +3438,6 @@ retry_find:
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
- * If this is a precontent file we have can now emit an event to
- * try and populate the folio.
- */
- if (!(vmf->flags & FAULT_FLAG_TRIED) &&
- unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
- loff_t pos = folio_pos(folio);
- size_t count = folio_size(folio);
-
- /* We're NOWAIT, we have to retry. */
- if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
- folio_unlock(folio);
- goto out_retry;
- }
-
- if (mapping_locked)
- filemap_invalidate_unlock_shared(mapping);
- mapping_locked = false;
-
- folio_unlock(folio);
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- goto out_retry;
-
- error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
- count);
- if (error)
- ret = VM_FAULT_SIGBUS;
- goto out_retry;
- }
-
- /*
* If the invalidate lock is not held, the folio was in cache
* and uptodate and now it is not. Strange but possible since we
* didn't hold the page lock all the time. Let's drop
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 1d1832e2a599..45540942d148 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -28,12 +28,6 @@ void wait_on_page_writeback(struct page *page)
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);
-void wait_for_stable_page(struct page *page)
-{
- return folio_wait_stable(page_folio(page));
-}
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
-
void mark_page_accessed(struct page *page)
{
folio_mark_accessed(page_folio(page));
@@ -90,11 +84,3 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
return folio_file_page(folio, index);
}
EXPORT_SYMBOL(pagecache_get_page);
-
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
- pgoff_t index)
-{
- return pagecache_get_page(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/gup.c b/mm/gup.c
index 2944fe8cf317..92351e2fa876 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
+ * @locked: a pointer to an int denoting whether the mmap sem is held
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by put_page().
@@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable);
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
+struct page *get_dump_page(unsigned long addr, int *locked)
{
struct page *page;
- int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
diff --git a/mm/internal.h b/mm/internal.h
index 21f2643f3d95..50c2f590b2d0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1165,9 +1165,13 @@ static inline void mminit_verify_zonelist(void)
#define NODE_RECLAIM_SUCCESS 1
#ifdef CONFIG_NUMA
+extern int node_reclaim_mode;
+
extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
+#define node_reclaim_mode 0
+
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
unsigned int order)
{
@@ -1179,6 +1183,12 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
}
#endif
+static inline bool node_reclaim_enabled(void)
+{
+ /* Is any node_reclaim_mode bit set? */
+ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
+}
+
/*
* mm/memory-failure.c
*/
@@ -1256,6 +1266,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
+#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
/* Flags that allow allocations below the min watermark. */
diff --git a/mm/memblock.c b/mm/memblock.c
index 64ae678cd1d1..284154445409 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -16,6 +16,7 @@
#include <linux/kmemleak.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <linux/mutex.h>
#include <asm/sections.h>
#include <linux/io.h>
@@ -2282,6 +2283,7 @@ struct reserve_mem_table {
};
static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
static int reserved_mem_count;
+static DEFINE_MUTEX(reserve_mem_lock);
/* Add wildcard region with a lookup name */
static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
@@ -2295,6 +2297,21 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
strscpy(map->name, name);
}
+static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name)
+{
+ struct reserve_mem_table *map;
+ int i;
+
+ for (i = 0; i < reserved_mem_count; i++) {
+ map = &reserved_mem_table[i];
+ if (!map->size)
+ continue;
+ if (strcmp(name, map->name) == 0)
+ return map;
+ }
+ return NULL;
+}
+
/**
* reserve_mem_find_by_name - Find reserved memory region with a given name
* @name: The name that is attached to a reserved memory region
@@ -2308,22 +2325,47 @@ static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size)
{
struct reserve_mem_table *map;
- int i;
- for (i = 0; i < reserved_mem_count; i++) {
- map = &reserved_mem_table[i];
- if (!map->size)
- continue;
- if (strcmp(name, map->name) == 0) {
- *start = map->start;
- *size = map->size;
- return 1;
- }
- }
- return 0;
+ guard(mutex)(&reserve_mem_lock);
+ map = reserve_mem_find_by_name_nolock(name);
+ if (!map)
+ return 0;
+
+ *start = map->start;
+ *size = map->size;
+ return 1;
}
EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
+/**
+ * reserve_mem_release_by_name - Release reserved memory region with a given name
+ * @name: The name that is attached to a reserved memory region
+ *
+ * Forcibly release the pages in the reserved memory region so that the memory
+ * can be used as free memory. After release, the reserved region size becomes 0.
+ *
+ * Returns: 1 if released or 0 if not found.
+ */
+int reserve_mem_release_by_name(const char *name)
+{
+ char buf[RESERVE_MEM_NAME_SIZE + 12];
+ struct reserve_mem_table *map;
+ void *start, *end;
+
+ guard(mutex)(&reserve_mem_lock);
+ map = reserve_mem_find_by_name_nolock(name);
+ if (!map)
+ return 0;
+
+ start = phys_to_virt(map->start);
+ end = start + map->size - 1;
+ snprintf(buf, sizeof(buf), "reserve_mem:%s", name);
+ free_reserved_area(start, end, 0, buf);
+ map->size = 0;
+
+ return 1;
+}
+
/*
* Parse reserve_mem=nn:align:name
*/
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index c1feb3945350..8660908850dc 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1955,9 +1955,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
if (val > MAX_SWAPPINESS)
return -EINVAL;
- if (!mem_cgroup_is_root(memcg))
+ if (!mem_cgroup_is_root(memcg)) {
+ pr_info_once("Per memcg swappiness does not exist in cgroup v2. "
+ "See memory.reclaim or memory.swap.max there\n ");
WRITE_ONCE(memcg->swappiness, val);
- else
+ } else
WRITE_ONCE(vm_swappiness, val);
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 40c07b8699ae..421740f1bcdc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1759,7 +1759,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
struct memcg_stock_pcp {
- local_lock_t stock_lock;
+ localtry_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -1774,7 +1774,7 @@ struct memcg_stock_pcp {
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+ .stock_lock = INIT_LOCALTRY_LOCK(stock_lock),
};
static DEFINE_MUTEX(percpu_charge_mutex);
@@ -1786,6 +1786,7 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
+ * @gfp_mask: allocation mask.
*
* The charges will only happen if @memcg matches the current cpu's memcg
* stock, and at least @nr_pages are available in that stock. Failure to
@@ -1793,7 +1794,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
*
* returns true if successful, false otherwise.
*/
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+ gfp_t gfp_mask)
{
struct memcg_stock_pcp *stock;
unsigned int stock_pages;
@@ -1803,7 +1805,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ if (!gfpflags_allow_spinning(gfp_mask))
+ return ret;
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ }
stock = this_cpu_ptr(&memcg_stock);
stock_pages = READ_ONCE(stock->nr_pages);
@@ -1812,7 +1818,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
ret = true;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -1851,14 +1857,14 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -1888,9 +1894,20 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long flags;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ /*
+ * In case of unlikely failure to lock percpu stock_lock
+ * uncharge memcg directly.
+ */
+ if (mem_cgroup_is_root(memcg))
+ return;
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+ return;
+ }
__refill_stock(memcg, nr_pages);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
/*
@@ -1947,9 +1964,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
stock = &per_cpu(memcg_stock, cpu);
/* drain_obj_stock requires stock_lock */
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
old = drain_obj_stock(stock);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
drain_stock(stock);
obj_cgroup_put(old);
@@ -2242,9 +2259,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long pflags;
retry:
- if (consume_stock(memcg, nr_pages))
+ if (consume_stock(memcg, nr_pages, gfp_mask))
return 0;
+ if (!gfpflags_allow_spinning(gfp_mask))
+ /* Avoid the refill and flush of the older stock */
+ batch = nr_pages;
+
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@@ -2766,7 +2787,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
unsigned long flags;
int *bytes;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
/*
@@ -2815,7 +2836,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
__mod_objcg_mlstate(objcg, pgdat, idx, nr);
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -2825,7 +2846,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
@@ -2833,7 +2854,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
ret = true;
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -2925,7 +2946,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
unsigned long flags;
unsigned int nr_pages = 0;
- local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
@@ -2939,7 +2960,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
if (nr_pages)
diff --git a/mm/memory.c b/mm/memory.c
index 3900225d99c5..6ea3551eb2df 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -76,7 +76,6 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
-#include <linux/fsnotify.h>
#include <trace/events/kmem.h>
@@ -1357,12 +1356,12 @@ int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
- unsigned long next;
unsigned long addr = src_vma->vm_start;
unsigned long end = src_vma->vm_end;
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
+ unsigned long next, pfn;
bool is_cow;
int ret;
@@ -1373,11 +1372,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- /*
- * We do not free on error cases below as remove_vma
- * gets called on error from higher level routine
- */
- ret = track_pfn_copy(src_vma);
+ ret = track_pfn_copy(dst_vma, src_vma, &pfn);
if (ret)
return ret;
}
@@ -1414,7 +1409,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
continue;
if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
addr, next))) {
- untrack_pfn_clear(dst_vma);
ret = -ENOMEM;
break;
}
@@ -1424,6 +1418,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
+ if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
+ untrack_pfn_copy(dst_vma, pfn);
return ret;
}
@@ -4432,7 +4428,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
- struct dev_pagemap *pgmap;
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
/*
* migrate_to_ram is not yet ready to operate
@@ -4455,11 +4450,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Get a page reference while we know the page can't be
* freed.
*/
- get_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- pgmap = page_pgmap(vmf->page);
- ret = pgmap->ops->migrate_to_ram(vmf);
- put_page(vmf->page);
+ if (trylock_page(vmf->page)) {
+ struct dev_pagemap *pgmap;
+
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ pgmap = page_pgmap(vmf->page);
+ ret = pgmap->ops->migrate_to_ram(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ } else {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_pte_marker_entry(entry)) {
@@ -5856,17 +5858,8 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
-
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
- /*
- * Currently we just emit PAGE_SIZE for our fault events, so don't allow
- * a huge fault if we have a pre content watch on this file. This would
- * be trivial to support, but there would need to be tests to ensure
- * this works properly and those don't exist currently.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
@@ -5890,9 +5883,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -5915,9 +5905,6 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -5935,9 +5922,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vma))
goto split;
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -7012,6 +6996,124 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ void *old_buf = buf;
+ int err = 0;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return -EFAULT;
+
+ addr = untagged_addr_remote(mm, addr);
+
+ /* Avoid triggering the temporary warning in __get_user_pages */
+ if (!vma_lookup(mm, addr)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ while (len) {
+ int bytes, offset, retval;
+ void *maddr;
+ struct page *page;
+ struct vm_area_struct *vma = NULL;
+
+ page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
+ if (IS_ERR(page)) {
+ /*
+ * Treat as a total failure for now until we decide how
+ * to handle the CONFIG_HAVE_IOREMAP_PROT case and
+ * stack expansion.
+ */
+ *(char *)buf = '\0';
+ err = -EFAULT;
+ goto out;
+ }
+
+ bytes = len;
+ offset = addr & (PAGE_SIZE - 1);
+ if (bytes > PAGE_SIZE - offset)
+ bytes = PAGE_SIZE - offset;
+
+ maddr = kmap_local_page(page);
+ retval = strscpy(buf, maddr + offset, bytes);
+ if (retval >= 0) {
+ /* Found the end of the string */
+ buf += retval;
+ unmap_and_put_page(page, maddr);
+ break;
+ }
+
+ buf += bytes - 1;
+ /*
+ * Because strscpy always NUL terminates we need to
+ * copy the last byte in the page if we are going to
+ * load more pages
+ */
+ if (bytes != len) {
+ addr += bytes - 1;
+ copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1);
+ buf += 1;
+ addr += 1;
+ }
+ len -= bytes;
+
+ unmap_and_put_page(page, maddr);
+ }
+
+out:
+ mmap_read_unlock(mm);
+ if (err)
+ return err;
+ return buf - old_buf;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour
+ *
+ * The mm of @tsk is pinned with get_task_mm() and released with mmput() internally.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination);
+ * not including the trailing NUL. Always guaranteed to leave NUL-terminated
+ * buffer. On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
+
/*
* Print the name of a VMA.
*/
@@ -7044,10 +7146,8 @@ void __might_fault(const char *file, int line)
if (pagefault_disabled())
return;
__might_sleep(file, line);
-#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
-#endif
}
EXPORT_SYMBOL(__might_fault);
#endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 530e71fe9147..b28a1e6ae096 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state)
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ * from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance(),
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+ for_each_node_mask(n, *mask) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 7d0d64f67cdf..3158afe7eb23 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start, unmapped = 0;
@@ -88,11 +90,16 @@ again:
folio_get(folio);
spin_unlock(ptl);
+ /* FIXME: we don't expect THP for fault_folio */
+ if (WARN_ON_ONCE(fault_folio == folio))
+ return migrate_vma_collect_skip(start, end,
+ walk);
if (unlikely(!folio_trylock(folio)))
return migrate_vma_collect_skip(start, end,
walk);
ret = split_folio(folio);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
if (ret)
return migrate_vma_collect_skip(start, end,
@@ -198,7 +205,7 @@ again:
* optimisation to avoid walking the rmap later with
* try_to_migrate().
*/
- if (folio_trylock(folio)) {
+ if (fault_folio == folio || folio_trylock(folio)) {
bool anon_exclusive;
pte_t swp_pte;
@@ -210,7 +217,8 @@ again:
if (folio_try_share_anon_rmap_pte(folio, page)) {
set_pte_at(mm, addr, ptep, pte);
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
mpfn = 0;
goto next;
@@ -369,6 +377,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
unsigned long npages,
struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i, restore = 0;
bool allow_drain = true;
unsigned long unmapped = 0;
@@ -433,7 +443,8 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0;
- folio_unlock(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
folio_put(folio);
restore--;
}
@@ -542,6 +553,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (args->fault_page && !is_device_private_page(args->fault_page))
return -EINVAL;
+ if (args->fault_page && !PageLocked(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -805,19 +818,13 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
EXPORT_SYMBOL(migrate_vma_pages);
-/*
- * migrate_device_finalize() - complete page migration
- * @src_pfns: src_pfns returned from migrate_device_range()
- * @dst_pfns: array of pfns allocated by the driver to migrate memory to
- * @npages: number of pages in the range
- *
- * Completes migration of the page by removing special migration entries.
- * Drivers must ensure copying of page data is complete and visible to the CPU
- * before calling this.
- */
-void migrate_device_finalize(unsigned long *src_pfns,
- unsigned long *dst_pfns, unsigned long npages)
+static void __migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
+ struct folio *fault_folio = fault_page ?
+ page_folio(fault_page) : NULL;
unsigned long i;
for (i = 0; i < npages; i++) {
@@ -830,6 +837,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!page) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
@@ -840,6 +848,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
if (dst) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
@@ -849,15 +858,33 @@ void migrate_device_finalize(unsigned long *src_pfns,
if (!folio_is_zone_device(dst))
folio_add_lru(dst);
remove_migration_ptes(src, dst, 0);
- folio_unlock(src);
+ if (fault_folio != src)
+ folio_unlock(src);
folio_put(src);
if (dst != src) {
+ WARN_ON_ONCE(fault_folio == dst);
folio_unlock(dst);
folio_put(dst);
}
}
}
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
+ */
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
+{
+ return __migrate_device_finalize(src_pfns, dst_pfns, npages, NULL);
+}
EXPORT_SYMBOL(migrate_device_finalize);
/**
@@ -873,10 +900,27 @@ EXPORT_SYMBOL(migrate_device_finalize);
*/
void migrate_vma_finalize(struct migrate_vma *migrate)
{
- migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+ __migrate_device_finalize(migrate->src, migrate->dst, migrate->npages,
+ migrate->fault_page);
}
EXPORT_SYMBOL(migrate_vma_finalize);
+static unsigned long migrate_device_pfn_lock(unsigned long pfn)
+{
+ struct folio *folio;
+
+ folio = folio_get_nontail_page(pfn_to_page(pfn));
+ if (!folio)
+ return 0;
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ return 0;
+ }
+
+ return migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+}
+
/**
* migrate_device_range() - migrate device private pfns to normal memory.
* @src_pfns: array large enough to hold migrating source device private pfns.
@@ -901,29 +945,35 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
{
unsigned long i, pfn;
- for (pfn = start, i = 0; i < npages; pfn++, i++) {
- struct folio *folio;
+ for (pfn = start, i = 0; i < npages; pfn++, i++)
+ src_pfns[i] = migrate_device_pfn_lock(pfn);
- folio = folio_get_nontail_page(pfn_to_page(pfn));
- if (!folio) {
- src_pfns[i] = 0;
- continue;
- }
+ migrate_device_unmap(src_pfns, npages, NULL);
- if (!folio_trylock(folio)) {
- src_pfns[i] = 0;
- folio_put(folio);
- continue;
- }
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
- src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
- }
+/**
+ * migrate_device_pfns() - migrate device private pfns to normal memory.
+ * @src_pfns: pre-populated array of source device private pfns to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * Similar to migrate_device_range() but supports non-contiguous pre-populated
+ * array of device pages to migrate.
+ */
+int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; i++)
+ src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
migrate_device_unmap(src_pfns, npages, NULL);
return 0;
}
-EXPORT_SYMBOL(migrate_device_range);
+EXPORT_SYMBOL(migrate_device_pfns);
/*
* Migrate a device coherent folio back to normal memory. The caller should have
diff --git a/mm/mmap.c b/mm/mmap.c
index efcc4ca7500d..bd210aaf7ebd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1544,6 +1544,57 @@ struct vm_area_struct *_install_special_mapping(
&special_mapping_vmops);
}
+#ifdef CONFIG_SYSCTL
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+int sysctl_legacy_va_layout;
+#endif
+
+static const struct ctl_table mmap_table[] = {
+ {
+ .procname = "max_map_count",
+ .data = &sysctl_max_map_count,
+ .maxlen = sizeof(sysctl_max_map_count),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
+ defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
+ {
+ .procname = "legacy_va_layout",
+ .data = &sysctl_legacy_va_layout,
+ .maxlen = sizeof(sysctl_legacy_va_layout),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
+};
+#endif /* CONFIG_SYSCTL */
+
/*
* initialise the percpu counter for VM
*/
@@ -1553,6 +1604,9 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
+#ifdef CONFIG_SYSCTL
+ register_sysctl_init("vm", mmap_table);
+#endif
}
/*
diff --git a/mm/nommu.c b/mm/nommu.c
index 15a396ce2553..617e7ba8022f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -43,7 +43,6 @@
#include "internal.h"
unsigned long highest_memmap_pfn;
-int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
@@ -386,6 +385,19 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
return mm->brk = brk;
}
+static int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
+
+static const struct ctl_table nommu_table[] = {
+ {
+ .procname = "nr_trim_pages",
+ .data = &sysctl_nr_trim_pages,
+ .maxlen = sizeof(sysctl_nr_trim_pages),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+};
+
/*
* initialise the percpu counter for VM and region record slabs
*/
@@ -396,6 +408,7 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
+ register_sysctl_init("vm", nommu_table);
}
/*
@@ -1607,13 +1620,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- BUG();
- return 0;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
@@ -1702,6 +1708,85 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
}
EXPORT_SYMBOL_GPL(access_process_vm);
+#ifdef CONFIG_BPF_SYSCALL
+/*
+ * Copy a string from another process's address space as given in mm.
+ * If there is any error return -EFAULT.
+ */
+static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len)
+{
+ unsigned long addr_end;
+ struct vm_area_struct *vma;
+ int ret = -EFAULT;
+
+ *(char *)buf = '\0';
+
+ if (mmap_read_lock_killable(mm))
+ return ret;
+
+ /* the access must start within one of the target process's mappings */
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto out;
+
+ if (check_add_overflow(addr, len, &addr_end))
+ goto out;
+
+ /* don't overrun this mapping */
+ if (addr_end > vma->vm_end)
+ len = vma->vm_end - addr;
+
+ /* only read mappings where it is permitted */
+ if (vma->vm_flags & VM_MAYREAD) {
+ ret = strscpy(buf, (char *)addr, len);
+ if (ret < 0)
+ ret = len - 1;
+ }
+
+out:
+ mmap_read_unlock(mm);
+ return ret;
+}
+
+/**
+ * copy_remote_vm_str - copy a string from another process's address space.
+ * @tsk: the task of the target address space
+ * @addr: start address to read from
+ * @buf: destination buffer
+ * @len: number of bytes to copy
+ * @gup_flags: flags modifying lookup behaviour (unused)
+ *
+ * Takes its own reference on the mm of @tsk for the duration of the copy.
+ *
+ * Return: number of bytes copied from @addr (source) to @buf (destination);
+ * not including the trailing NUL. Always guaranteed to leave NUL-terminated
+ * buffer. On any error, return -EFAULT.
+ */
+int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm) {
+ *(char *)buf = '\0';
+ return -EFAULT;
+ }
+
+ ret = __copy_remote_vm_str(mm, addr, buf, len);
+
+ mmput(mm);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(copy_remote_vm_str);
+#endif /* CONFIG_BPF_SYSCALL */
+
/**
* nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
* @inode: The inode to check
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0c01998cb3a0..f51aa6051a99 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -1381,13 +1384,44 @@ static void split_large_buddy(struct zone *zone, struct page *page,
} while (1);
}
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+ unsigned int order)
+{
+ /* Remember the order */
+ page->order = order;
+ /* Add the page to the free list */
+ llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
+ struct llist_head *llhead;
unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
+ spin_lock_irqsave(&zone->lock, flags);
+ }
+
+ /* The lock succeeded. Process deferred pages. */
+ llhead = &zone->trylock_free_pages;
+ if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+ struct llist_node *llnode;
+ struct page *p, *tmp;
+
+ llnode = llist_del_all(llhead);
+ llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+ unsigned int p_order = p->order;
+
+ split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+ __count_vm_events(PGFREE, 1 << p_order);
+ }
+ }
split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -2280,7 +2314,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long flags;
int i;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ return 0;
+ spin_lock_irqsave(&zone->lock, flags);
+ }
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
@@ -2568,7 +2606,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
static void free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
- unsigned int order)
+ unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int pindex;
@@ -2603,6 +2641,14 @@ static void free_frozen_page_commit(struct zone *zone,
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
+
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ /*
+ * Do not attempt to take a zone lock. Let pcp->count get
+ * over high mark temporarily.
+ */
+ return;
+ }
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2617,7 +2663,8 @@ static void free_frozen_page_commit(struct zone *zone,
/*
* Free a pcp page
*/
-void free_frozen_pages(struct page *page, unsigned int order)
+static void __free_frozen_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
@@ -2626,7 +2673,7 @@ void free_frozen_pages(struct page *page, unsigned int order)
int migratetype;
if (!pcp_allowed_order(order)) {
- __free_pages_ok(page, order, FPI_NONE);
+ __free_pages_ok(page, order, fpi_flags);
return;
}
@@ -2644,23 +2691,33 @@ void free_frozen_pages(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
return;
}
migratetype = MIGRATE_MOVABLE;
}
+ if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+ && (in_nmi() || in_hardirq()))) {
+ add_page_to_zone_llist(zone, page, order);
+ return;
+ }
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_frozen_page_commit(zone, pcp, page, migratetype, order);
+ free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
pcp_spin_unlock(pcp);
} else {
- free_one_page(zone, page, pfn, order, FPI_NONE);
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
pcp_trylock_finish(UP_flags);
}
+void free_frozen_pages(struct page *page, unsigned int order)
+{
+ __free_frozen_pages(page, order, FPI_NONE);
+}
+
/*
* Free a batch of folios
*/
@@ -2749,7 +2806,7 @@ void free_unref_folios(struct folio_batch *folios)
trace_mm_page_free_batched(&folio->page);
free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
- order);
+ order, FPI_NONE);
}
if (pcp) {
@@ -2880,7 +2937,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ return NULL;
+ spin_lock_irqsave(&zone->lock, flags);
+ }
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
@@ -4640,7 +4701,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
might_alloc(gfp_mask);
- if (should_fail_alloc_page(gfp_mask, order))
+ /*
+ * Don't invoke should_fail logic, since it may call
+ * get_random_u32() and printk() which need to spin_lock.
+ */
+ if (!(*alloc_flags & ALLOC_TRYLOCK) &&
+ should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@@ -4938,9 +5004,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
EXPORT_SYMBOL(get_zeroed_page_noprof);
/**
- * __free_pages - Free pages allocated with alloc_pages().
+ * ___free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
* @order: The order of the allocation.
+ * @fpi_flags: Free Page Internal flags.
*
* This function can free multi-page allocations that are not compound
* pages. It does not check that the @order passed in matches that of
@@ -4957,21 +5024,36 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
* Context: May be called in interrupt context or while holding a normal
* spinlock, but not in NMI context or while holding a raw spinlock.
*/
-void __free_pages(struct page *page, unsigned int order)
+static void ___free_pages(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
/* get PageHead before we drop reference */
int head = PageHead(page);
if (put_page_testzero(page))
- free_frozen_pages(page, order);
+ __free_frozen_pages(page, order, fpi_flags);
else if (!head) {
pgalloc_tag_sub_pages(page, (1 << order) - 1);
while (order-- > 0)
- free_frozen_pages(page + (1 << order), order);
+ __free_frozen_pages(page + (1 << order), order,
+ fpi_flags);
}
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_NONE);
+}
EXPORT_SYMBOL(__free_pages);
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * page type (not only those that came from try_alloc_pages)
+ */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+ ___free_pages(page, order, FPI_TRYLOCK);
+}
+
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@@ -7221,3 +7303,94 @@ static bool __free_unaccepted(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+
+/**
+ * try_alloc_pages - opportunistic reentrant allocation from any context
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> try_alloc_pages_noprof).
+ * Allocation is best effort and to be expected to fail easily so nobody should
+ * rely on the success. Failures are not reported via warn_alloc().
+ * See always fail conditions below.
+ *
+ * Return: allocated page or NULL on failure.
+ */
+struct page *try_alloc_pages_noprof(int nid, unsigned int order)
+{
+ /*
+ * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
+ * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd
+ * is not safe in arbitrary context.
+ *
+ * These two are the conditions for gfpflags_allow_spinning() being true.
+ *
+ * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
+ * to warn. Also warn would trigger printk() which is unsafe from
+ * various contexts. We cannot use printk_deferred_enter() to mitigate,
+ * since the running context is unknown.
+ *
+ * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
+ * is safe in any context. Also zeroing the page is mandatory for
+ * BPF use cases.
+ *
+ * Though __GFP_NOMEMALLOC is not checked in the code path below,
+ * specify it here to highlight that try_alloc_pages()
+ * doesn't want to deplete reserves.
+ */
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
+ | __GFP_ACCOUNT;
+ unsigned int alloc_flags = ALLOC_TRYLOCK;
+ struct alloc_context ac = { };
+ struct page *page;
+
+ /*
+ * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
+ * unsafe in NMI. If spin_trylock() is called from hard IRQ the current
+ * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
+ * mark the task as the owner of another rt_spin_lock which will
+ * confuse PI logic, so return immediately if called from hard IRQ or
+ * NMI.
+ *
+ * Note, irqs_disabled() case is ok. This function can be called
+ * from raw_spin_lock_irqsave region.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
+ return NULL;
+ if (!pcp_allowed_order(order))
+ return NULL;
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (has_unaccepted_memory())
+ return NULL;
+#endif
+ /* Bailout, since _deferred_grow_zone() needs to take a lock */
+ if (deferred_pages_enabled())
+ return NULL;
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_node_id();
+
+ prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
+ &alloc_gfp, &alloc_flags);
+
+ /*
+ * Best effort allocation from percpu free list.
+ * If it's empty attempt to spin_trylock zone->lock.
+ */
+ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
+
+ /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
+
+ if (memcg_kmem_online() && page &&
+ unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
+ free_pages_nolock(page, order);
+ page = NULL;
+ }
+ trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
+ return page;
+}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 849d4a471b6c..cc4a6916eec6 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -299,7 +299,13 @@ void __reset_page_owner(struct page *page, unsigned short order)
alloc_handle = page_owner->handle;
page_ext_put(page_ext);
- handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+ /*
+ * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
+ * to prevent issues in stack_depot_save().
+ * This is similar to try_alloc_pages() gfp flags, but only used
+ * to signal stack_depot to avoid spin_locks.
+ */
+ handle = save_stack(__GFP_NOWARN);
__update_page_owner_free_handle(page, handle, order, current->pid,
current->tgid, free_ts_nsec);
diff --git a/mm/percpu.c b/mm/percpu.c
index 027fb6497495..b35494c8ede2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3077,7 +3077,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
continue;
}
/* copy and return the unused part */
- memcpy(ptr, __per_cpu_load, ai->static_size);
+ memcpy(ptr, __per_cpu_start, ai->static_size);
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@@ -3246,7 +3246,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
- memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
}
/* we're ready, commit */
diff --git a/mm/readahead.c b/mm/readahead.c
index 220155a5c964..6a4e96b69702 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,7 +128,6 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
#include "internal.h"
@@ -559,15 +558,6 @@ void page_cache_sync_ra(struct readahead_control *ractl,
pgoff_t prev_index, miss;
/*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't find 0 filled pages in cache that we never emitted
- * events for. Filesystems supporting HSM must make sure to not call
- * this function with ractl->file unset for files handled by HSM.
- */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
- /*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
* readahead will do the right thing and limit the read to just the
@@ -645,10 +635,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (!ra->ra_pages)
return;
- /* See the comment in page_cache_sync_ra. */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index 17f27d92c664..99327c30507c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3899,16 +3899,16 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);
if (error)
- return error;
+ return ERR_PTR(error);
inc_nlink(dir);
- return 0;
+ return NULL;
}
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/mm/slab.h b/mm/slab.h
index e9fd9bf0bfa6..05a21dc796e0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -457,39 +457,17 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT));
}
-/* Legal flag mask for kmem_cache_create(), for various configurations */
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_PANIC | \
- SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
+ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
+ SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-#ifdef CONFIG_SLUB_DEBUG
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
-#else
-#define SLAB_DEBUG_FLAGS (0)
-#endif
-#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT | \
- SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE)
-
-/* Common flags available with current configuration */
-#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
-
-/* Common flags permitted for kmem_cache_create */
-#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
- SLAB_RED_ZONE | \
- SLAB_POISON | \
- SLAB_STORE_USER | \
- SLAB_TRACE | \
- SLAB_CONSISTENCY_CHECKS | \
- SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | \
- SLAB_ACCOUNT | \
- SLAB_KMALLOC | \
- SLAB_NO_MERGE | \
- SLAB_NO_USER_FLAGS)
+#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
@@ -604,6 +582,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects, struct slabobj_ext *obj_exts);
#endif
+void kvfree_rcu_cb(struct rcu_head *head);
+
size_t __ksize(const void *objp);
static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 4c9f0a87f733..5be257e03c7c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -298,6 +298,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
static_branch_enable(&slub_debug_enabled);
if (flags & SLAB_STORE_USER)
stack_depot_init();
+#else
+ flags &= ~SLAB_DEBUG_FLAGS;
#endif
mutex_lock(&slab_mutex);
@@ -307,20 +309,11 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
goto out_unlock;
}
- /* Refuse requests with allocator specific flags */
if (flags & ~SLAB_FLAGS_PERMITTED) {
err = -EINVAL;
goto out_unlock;
}
- /*
- * Some allocators will constraint the set of valid flags to a subset
- * of all flags. We expect them to define CACHE_CREATE_MASK in this
- * case, and we'll just provide them with a sanitized version of the
- * passed flags.
- */
- flags &= CACHE_CREATE_MASK;
-
/* Fail closed on bad usersize or useroffset values. */
if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) ||
WARN_ON(!args->usersize && args->useroffset) ||
@@ -1284,6 +1277,29 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
+#ifndef CONFIG_KVFREE_RCU_BATCHED
+
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head) {
+ kasan_record_aux_stack(ptr);
+ call_rcu(head, kvfree_rcu_cb);
+ return;
+ }
+
+ // kvfree_rcu(one_arg) call.
+ might_sleep();
+ synchronize_rcu();
+ kvfree(ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+
+void __init kvfree_rcu_init(void)
+{
+}
+
+#else /* CONFIG_KVFREE_RCU_BATCHED */
+
/*
* This rcu parameter is runtime-read-only. It reflects
* a minimum allowed number of objects which can be cached
@@ -1534,8 +1550,7 @@ kvfree_rcu_list(struct rcu_head *head)
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_kvfree_callback("slab", head, offset);
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
+ kvfree(ptr);
rcu_lock_release(&rcu_callback_map);
cond_resched_tasks_rcu_qs();
@@ -1863,8 +1878,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
return true;
}
-#if !defined(CONFIG_TINY_RCU)
-
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
@@ -1889,8 +1902,8 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
&krcp->page_cache_work,
msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
} else {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_setup(&krcp->hrtimer, schedule_page_work_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
}
}
@@ -2073,8 +2086,6 @@ void kvfree_rcu_barrier(void)
}
EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
-#endif /* #if !defined(CONFIG_TINY_RCU) */
-
static unsigned long
kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -2168,3 +2179,6 @@ void __init kvfree_rcu_init(void)
shrinker_register(kfree_rcu_shrinker);
}
+
+#endif /* CONFIG_KVFREE_RCU_BATCHED */
+
diff --git a/mm/slub.c b/mm/slub.c
index 184fd2b14758..b46f87662e71 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -19,6 +19,7 @@
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
+#include <linux/vmalloc.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
@@ -1017,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object)
set_orig_size(s, (void *)object, s->object_size);
}
-static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
{
struct va_format vaf;
va_list args;
- va_start(args, fmt);
+ va_copy(args, argsp);
vaf.fmt = fmt;
vaf.va = &args;
pr_err("=============================================================================\n");
- pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+ pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
pr_err("-----------------------------------------------------------------------------\n\n");
va_end(args);
}
+static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ __slab_bug(s, fmt, args);
+ va_end(args);
+}
+
__printf(2, 3)
-static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -1085,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
/* Beginning of the filler is the free pointer */
print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
-
- dump_stack();
}
static void object_err(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *reason)
+ u8 *object, const char *reason)
{
if (slab_add_kunit_errors())
return;
- slab_bug(s, "%s", reason);
+ slab_bug(s, reason);
print_trailer(s, slab, object);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
}
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
@@ -1114,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
return false;
}
+static void __slab_err(struct slab *slab)
+{
+ if (slab_in_kunit_test())
+ return;
+
+ print_slab_info(slab);
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ WARN_ON(1);
+}
+
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
const char *fmt, ...)
{
va_list args;
- char buf[100];
if (slab_add_kunit_errors())
return;
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
+ __slab_bug(s, fmt, args);
va_end(args);
- slab_bug(s, "%s", buf);
- print_slab_info(slab);
- dump_stack();
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ __slab_err(slab);
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -1166,7 +1184,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
s->inuse - poison_size);
}
-static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
+static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
void *from, void *to)
{
slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
@@ -1181,8 +1199,8 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
static pad_check_attributes int
check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
- u8 *object, char *what,
- u8 *start, unsigned int value, unsigned int bytes)
+ u8 *object, const char *what, u8 *start, unsigned int value,
+ unsigned int bytes, bool slab_obj_print)
{
u8 *fault;
u8 *end;
@@ -1201,10 +1219,11 @@ check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
if (slab_add_kunit_errors())
goto skip_bug_print;
- slab_bug(s, "%s overwritten", what);
- pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
- fault, end - 1, fault - addr,
- fault[0], value);
+ pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
+ what, fault, end - 1, fault - addr, fault[0], value);
+
+ if (slab_obj_print)
+ object_err(s, slab, object, "Object corrupt");
skip_bug_print:
restore_bytes(s, what, value, fault, end);
@@ -1268,7 +1287,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
return 1;
return check_bytes_and_report(s, slab, p, "Object padding",
- p + off, POISON_INUSE, size_from_object(s) - off);
+ p + off, POISON_INUSE, size_from_object(s) - off, true);
}
/* Check the pad bytes at the end of a slab page */
@@ -1301,9 +1320,10 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
- fault, end - 1, fault - start);
+ slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
+ __slab_err(slab);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
}
@@ -1318,11 +1338,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, slab, object, "Left Redzone",
- object - s->red_left_pad, val, s->red_left_pad))
+ object - s->red_left_pad, val, s->red_left_pad, ret))
ret = 0;
if (!check_bytes_and_report(s, slab, object, "Right Redzone",
- endobject, val, s->inuse - s->object_size))
+ endobject, val, s->inuse - s->object_size, ret))
ret = 0;
if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
@@ -1331,7 +1351,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (s->object_size > orig_size &&
!check_bytes_and_report(s, slab, object,
"kmalloc Redzone", p + orig_size,
- val, s->object_size - orig_size)) {
+ val, s->object_size - orig_size, ret)) {
ret = 0;
}
}
@@ -1339,7 +1359,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
if (!check_bytes_and_report(s, slab, p, "Alignment padding",
endobject, POISON_INUSE,
- s->inuse - s->object_size))
+ s->inuse - s->object_size, ret))
ret = 0;
}
}
@@ -1355,11 +1375,11 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
if (kasan_meta_size < s->object_size - 1 &&
!check_bytes_and_report(s, slab, p, "Poison",
p + kasan_meta_size, POISON_FREE,
- s->object_size - kasan_meta_size - 1))
+ s->object_size - kasan_meta_size - 1, ret))
ret = 0;
if (kasan_meta_size < s->object_size &&
!check_bytes_and_report(s, slab, p, "End Poison",
- p + s->object_size - 1, POISON_END, 1))
+ p + s->object_size - 1, POISON_END, 1, ret))
ret = 0;
}
/*
@@ -1385,11 +1405,6 @@ static int check_object(struct kmem_cache *s, struct slab *slab,
ret = 0;
}
- if (!ret && !slab_in_kunit_test()) {
- print_trailer(s, slab, object);
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- }
-
return ret;
}
@@ -1427,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab)
* Determine if a certain object in a slab is on the freelist. Must hold the
* slab lock to guarantee that the chains are in a consistent state.
*/
-static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
+static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
int nr = 0;
void *fp;
@@ -1437,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
fp = slab->freelist;
while (fp && nr <= slab->objects) {
if (fp == search)
- return 1;
+ return true;
if (!check_valid_pointer(s, slab, fp)) {
if (object) {
object_err(s, slab, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
+ break;
} else {
slab_err(s, slab, "Freepointer corrupt");
slab->freelist = NULL;
slab->inuse = slab->objects;
slab_fix(s, "Freelist cleared");
- return 0;
+ return false;
}
- break;
}
object = fp;
fp = get_freepointer(s, object);
nr++;
}
+ if (nr > slab->objects) {
+ slab_err(s, slab, "Freelist cycle detected");
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab_fix(s, "Freelist cleared");
+ return false;
+ }
+
max_objects = order_objects(slab_order(slab), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -1624,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s,
slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
object);
} else if (!slab->slab_cache) {
- pr_err("SLUB <none>: no slab for object 0x%p.\n",
- object);
- dump_stack();
- } else
+ slab_err(NULL, slab, "No slab cache for object 0x%p",
+ object);
+ } else {
object_err(s, slab, object,
- "page slab pointer corrupt.");
+ "page slab pointer corrupt.");
+ }
return 0;
}
return 1;
@@ -4254,6 +4277,7 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
ptr = folio_address(folio);
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
+ __folio_set_large_kmalloc(folio);
}
ptr = kasan_kmalloc_large(ptr, size, flags);
@@ -4729,6 +4753,11 @@ static void free_large_kmalloc(struct folio *folio, void *object)
{
unsigned int order = folio_order(folio);
+ if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) {
+ dump_page(&folio->page, "Not a kmalloc allocation");
+ return;
+ }
+
if (WARN_ON_ONCE(order == 0))
pr_warn_once("object pointer: 0x%p\n", object);
@@ -4738,9 +4767,55 @@ static void free_large_kmalloc(struct folio *folio, void *object)
lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
+ __folio_clear_large_kmalloc(folio);
folio_put(folio);
}
+/*
+ * Free the kvmalloc()ed object that embeds @head. Only the rcu_head address is
+ * known, so the object start is recovered first; @head must sit at offset < 4k.
+ */
+void kvfree_rcu_cb(struct rcu_head *head)
+{
+ void *obj = head; /* address inside the object, not yet its start */
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+ void *slab_addr;
+
+ if (is_vmalloc_addr(obj)) { /* vmalloc case: free from the enclosing page boundary */
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ vfree(obj);
+ return;
+ }
+
+ folio = virt_to_folio(obj);
+ if (!folio_test_slab(folio)) { /* not a slab folio: handled as a large kmalloc */
+ /*
+ * The rcu_head offset is always smaller than the page size, so
+ * rounding down to the page suffices; folio order is irrelevant.
+ */
+ obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
+ free_large_kmalloc(folio, obj);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ slab_addr = folio_address(folio);
+
+ if (is_kfence_address(obj)) {
+ obj = kfence_object_start(obj); /* KFENCE object: its start is looked up by KFENCE */
+ } else {
+ unsigned int idx = __obj_to_index(s, slab_addr, obj); /* index of the slot containing obj */
+
+ obj = slab_addr + s->size * idx; /* base of that slot */
+ obj = fixup_red_left(s, obj); /* presumably skips the left red zone, if any - confirm */
+ }
+
+ slab_free(s, slab, obj, _RET_IP_);
+}
+
/**
* kfree - free previously allocated memory
* @object: pointer returned by kmalloc() or kmem_cache_alloc()
@@ -4891,6 +4966,168 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
}
EXPORT_SYMBOL(krealloc_noprof);
+static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) /* returns the gfp mask to use for the kmalloc attempt */
+{
+ /*
+ * Try a large physically contiguous block first: it is less likely to
+ * fragment multiple larger blocks, and so contributes less to long-term
+ * fragmentation than the vmalloc fallback does. Still, keep such larger
+ * requests from being too disruptive - no OOM killer and no allocation
+ * failure warnings, since a fallback exists.
+ */
+ if (size > PAGE_SIZE) { /* requests of at most one page keep their flags untouched */
+ flags |= __GFP_NOWARN;
+
+ if (!(flags & __GFP_RETRY_MAYFAIL)) /* an explicit __GFP_RETRY_MAYFAIL from the caller is respected */
+ flags |= __GFP_NORETRY;
+
+ /* nofail semantics are implemented by the vmalloc fallback */
+ flags &= ~__GFP_NOFAIL;
+ }
+
+ return flags;
+}
+
+/**
+ * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @b: which set of kmalloc buckets to allocate from.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure
+ */
+void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
+{
+ void *ret;
+
+ /*
+ * Try kmalloc first. A sub-page request that fails is not retried
+ * with vmalloc: falling back would not really make sense there.
+ */
+ ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
+ kmalloc_gfp_adjust(flags, size),
+ node, _RET_IP_);
+ if (ret || size <= PAGE_SIZE) /* success, or a request of at most one page: no fallback */
+ return ret;
+
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
+ /* Don't even allow crazy sizes: anything above INT_MAX is refused */
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN)); /* warn once unless the caller passed __GFP_NOWARN */
+ return NULL;
+ }
+
+ /*
+ * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+ * since the callers already cannot assume anything
+ * about the resulting pointer, and cannot play
+ * protection games.
+ */
+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+ flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, /* the caller's original flags, not the adjusted kmalloc mask */
+ node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(__kvmalloc_node_noprof);
+
+/**
+ * kvfree() - Free memory.
+ * @addr: Pointer to allocated memory.
+ *
+ * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
+ * It is slightly more efficient to use kfree() or vfree() if you are certain
+ * that you know which one to use.
+ *
+ * Context: Either preemptible task context or non-NMI interrupt.
+ */
+void kvfree(const void *addr)
+{
+ if (is_vmalloc_addr(addr)) /* pick the matching free routine from the address alone */
+ vfree(addr);
+ else
+ kfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) { /* nothing is wiped or freed for pointers ZERO_OR_NULL_PTR() rejects */
+ memzero_explicit((void *)addr, len); /* clears exactly @len bytes; the caller supplies the length */
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
+/**
+ * kvrealloc - reallocate memory; contents remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @flags: the flags for the page level allocator
+ *
+ * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
+ * and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or kvfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
+{
+ void *n;
+
+ if (is_vmalloc_addr(p)) /* already vmalloc-backed: resize within vmalloc */
+ return vrealloc_noprof(p, size, flags);
+
+ n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); /* first try to resize as a kmalloc object */
+ if (!n) {
+ /* We failed to krealloc(), fall back to kvmalloc(). */
+ n = kvmalloc_noprof(size, flags);
+ if (!n)
+ return NULL; /* @p is neither freed nor modified on this path */
+
+ if (p) {
+ /* We already know that `p` is not a vmalloc address. */
+ kasan_disable_current();
+ memcpy(n, kasan_reset_tag(p), ksize(p)); /* NOTE(review): copies ksize(p) bytes; presumably only reached when growing, so n is large enough - confirm */
+ kasan_enable_current();
+
+ kfree(p);
+ }
+ }
+
+ return n;
+}
+EXPORT_SYMBOL(kvrealloc_noprof);
+
struct detached_freelist {
struct slab *slab;
void *tail;
@@ -5583,14 +5820,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
return !!oo_objects(s->oo);
}
-static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
- const char *text)
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
void *p;
- slab_err(s, slab, text, s->name);
+ if (!slab_add_kunit_errors())
+ slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
spin_lock(&object_map_lock);
__fill_map(object_map, s, slab);
@@ -5605,6 +5842,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
}
}
spin_unlock(&object_map_lock);
+
+ __slab_err(slab);
#endif
}
@@ -5625,8 +5864,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
remove_partial(n, slab);
list_add(&slab->slab_list, &discard);
} else {
- list_slab_objects(s, slab,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ list_slab_objects(s, slab);
}
}
spin_unlock_irq(&n->list_lock);
diff --git a/mm/swap.c b/mm/swap.c
index 7523b65d8caa..77b2d5997873 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -45,7 +45,7 @@
/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
-const int page_cluster_max = 31;
+static const int page_cluster_max = 31;
struct cpu_fbatches {
/*
@@ -1074,6 +1074,18 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
fbatch->nr = j;
}
+static const struct ctl_table swap_sysctl_table[] = { /* registered under "vm" by swap_setup() */
+ {
+ .procname = "page-cluster",
+ .data = &page_cluster,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO, /* lower bound: 0 */
+ .extra2 = (void *)&page_cluster_max, /* upper bound: page_cluster_max (31); cast drops const for the void * field */
+ }
+};
+
/*
* Perform any setup for the swap system
*/
@@ -1090,4 +1102,6 @@ void __init swap_setup(void)
* Right now other parts of the system means that we
* _really_ don't want to cluster much more
*/
+
+ register_sysctl_init("vm", swap_sysctl_table);
}
diff --git a/mm/swap.h b/mm/swap.h
index 0abb68091b4f..6f4a3f927edb 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -3,6 +3,7 @@
#define _MM_SWAP_H
struct mempolicy;
+extern int page_cluster;
#ifdef CONFIG_SWAP
#include <linux/swapops.h> /* for swp_offset */
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 83c164aba6e0..dbdcc43964fb 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,7 +17,7 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
+#include <linux/ucopysize.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
@@ -201,7 +201,9 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
}
}
-static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
+DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
+ validate_usercopy_range);
+EXPORT_SYMBOL(validate_usercopy_range);
/*
* Validates that the given object is:
@@ -212,9 +214,6 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
{
- if (static_branch_unlikely(&bypass_usercopy_checks))
- return;
-
/* Skip all tests if size is zero. */
if (!n)
return;
@@ -255,7 +254,8 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
}
EXPORT_SYMBOL(__check_object_size);
-static bool enable_checks __initdata = true;
+static bool enable_checks __initdata =
+ IS_ENABLED(CONFIG_HARDENED_USERCOPY_DEFAULT_ON);
static int __init parse_hardened_usercopy(char *str)
{
@@ -269,8 +269,10 @@ __setup("hardened_usercopy=", parse_hardened_usercopy);
static int __init set_hardened_usercopy(void)
{
- if (enable_checks == false)
- static_branch_enable(&bypass_usercopy_checks);
+ if (enable_checks)
+ static_branch_enable(&validate_usercopy_range);
+ else
+ static_branch_disable(&validate_usercopy_range);
return 1;
}
diff --git a/mm/util.c b/mm/util.c
index b6b9684a1438..448117da071f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -12,6 +12,7 @@
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
@@ -23,6 +24,7 @@
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/uaccess.h>
@@ -569,6 +571,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
+ if (!ret)
+ ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -612,168 +616,6 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
-static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
-{
- /*
- * We want to attempt a large physically contiguous block first because
- * it is less likely to fragment multiple larger blocks and therefore
- * contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
- */
- if (size > PAGE_SIZE) {
- flags |= __GFP_NOWARN;
-
- if (!(flags & __GFP_RETRY_MAYFAIL))
- flags |= __GFP_NORETRY;
-
- /* nofail semantic is implemented by the vmalloc fallback */
- flags &= ~__GFP_NOFAIL;
- }
-
- return flags;
-}
-
-/**
- * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
- * failure, fall back to non-contiguous (vmalloc) allocation.
- * @size: size of the request.
- * @b: which set of kmalloc buckets to allocate from.
- * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
- * @node: numa node to allocate from
- *
- * Uses kmalloc to get the memory but if the allocation fails then falls back
- * to the vmalloc allocator. Use kvfree for freeing the memory.
- *
- * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
- * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
- * preferable to the vmalloc fallback, due to visible performance drawbacks.
- *
- * Return: pointer to the allocated memory of %NULL in case of failure
- */
-void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
-{
- void *ret;
-
- /*
- * It doesn't really make sense to fallback to vmalloc for sub page
- * requests
- */
- ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b),
- kmalloc_gfp_adjust(flags, size),
- node);
- if (ret || size <= PAGE_SIZE)
- return ret;
-
- /* non-sleeping allocations are not supported by vmalloc */
- if (!gfpflags_allow_blocking(flags))
- return NULL;
-
- /* Don't even allow crazy sizes */
- if (unlikely(size > INT_MAX)) {
- WARN_ON_ONCE(!(flags & __GFP_NOWARN));
- return NULL;
- }
-
- /*
- * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
- * since the callers already cannot assume anything
- * about the resulting pointer, and cannot play
- * protection games.
- */
- return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- node, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(__kvmalloc_node_noprof);
-
-/**
- * kvfree() - Free memory.
- * @addr: Pointer to allocated memory.
- *
- * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
- * It is slightly more efficient to use kfree() or vfree() if you are certain
- * that you know which one to use.
- *
- * Context: Either preemptible task context or not-NMI interrupt.
- */
-void kvfree(const void *addr)
-{
- if (is_vmalloc_addr(addr))
- vfree(addr);
- else
- kfree(addr);
-}
-EXPORT_SYMBOL(kvfree);
-
-/**
- * kvfree_sensitive - Free a data object containing sensitive information.
- * @addr: address of the data object to be freed.
- * @len: length of the data object.
- *
- * Use the special memzero_explicit() function to clear the content of a
- * kvmalloc'ed object containing sensitive data to make sure that the
- * compiler won't optimize out the data clearing.
- */
-void kvfree_sensitive(const void *addr, size_t len)
-{
- if (likely(!ZERO_OR_NULL_PTR(addr))) {
- memzero_explicit((void *)addr, len);
- kvfree(addr);
- }
-}
-EXPORT_SYMBOL(kvfree_sensitive);
-
-/**
- * kvrealloc - reallocate memory; contents remain unchanged
- * @p: object to reallocate memory for
- * @size: the size to reallocate
- * @flags: the flags for the page level allocator
- *
- * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
- * and @p is not a %NULL pointer, the object pointed to is freed.
- *
- * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
- * initial memory allocation, every subsequent call to this API for the same
- * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
- * __GFP_ZERO is not fully honored by this API.
- *
- * In any case, the contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.
- *
- * This function must not be called concurrently with itself or kvfree() for the
- * same memory allocation.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
-{
- void *n;
-
- if (is_vmalloc_addr(p))
- return vrealloc_noprof(p, size, flags);
-
- n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
- if (!n) {
- /* We failed to krealloc(), fall back to kvmalloc(). */
- n = kvmalloc_noprof(size, flags);
- if (!n)
- return NULL;
-
- if (p) {
- /* We already know that `p` is not a vmalloc address. */
- kasan_disable_current();
- memcpy(n, kasan_reset_tag(p), ksize(p));
- kasan_enable_current();
-
- kfree(p);
- }
- }
-
- return n;
-}
-EXPORT_SYMBOL(kvrealloc_noprof);
-
/**
* __vmalloc_array - allocate memory for a virtually contiguous array.
* @n: number of elements.
@@ -906,14 +748,16 @@ int folio_mc_copy(struct folio *dst, struct folio *src)
EXPORT_SYMBOL(folio_mc_copy);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
-int sysctl_overcommit_ratio __read_mostly = 50;
-unsigned long sysctl_overcommit_kbytes __read_mostly;
+static int sysctl_overcommit_ratio __read_mostly = 50;
+static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+#ifdef CONFIG_SYSCTL
+
+static int overcommit_ratio_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -928,8 +772,8 @@ static void sync_overcommit_as(struct work_struct *dummy)
percpu_counter_sync(&vm_committed_as);
}
-int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int overcommit_policy_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int new_policy = -1;
@@ -964,8 +808,8 @@ int overcommit_policy_handler(const struct ctl_table *table, int write, void *bu
return ret;
}
-int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
+static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -975,6 +819,54 @@ int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *bu
return ret;
}
+static const struct ctl_table util_sysctl_table[] = { /* registered under "vm" by init_vm_util_sysctls() */
+ {
+ .procname = "overcommit_memory",
+ .data = &sysctl_overcommit_memory,
+ .maxlen = sizeof(sysctl_overcommit_memory),
+ .mode = 0644,
+ .proc_handler = overcommit_policy_handler,
+ .extra1 = SYSCTL_ZERO, /* extra1/extra2 describe the range 0..2 */
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "overcommit_ratio",
+ .data = &sysctl_overcommit_ratio,
+ .maxlen = sizeof(sysctl_overcommit_ratio),
+ .mode = 0644,
+ .proc_handler = overcommit_ratio_handler, /* custom handler; no extra1/extra2 bounds are set */
+ },
+ {
+ .procname = "overcommit_kbytes",
+ .data = &sysctl_overcommit_kbytes,
+ .maxlen = sizeof(sysctl_overcommit_kbytes),
+ .mode = 0644,
+ .proc_handler = overcommit_kbytes_handler, /* custom handler; no extra1/extra2 bounds are set */
+ },
+ {
+ .procname = "user_reserve_kbytes",
+ .data = &sysctl_user_reserve_kbytes,
+ .maxlen = sizeof(sysctl_user_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax, /* generic unsigned long handler, no bounds given */
+ },
+ {
+ .procname = "admin_reserve_kbytes",
+ .data = &sysctl_admin_reserve_kbytes,
+ .maxlen = sizeof(sysctl_admin_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax, /* generic unsigned long handler, no bounds given */
+ },
+};
+
+static int __init init_vm_util_sysctls(void) /* boot-time registration of util_sysctl_table */
+{
+ register_sysctl_init("vm", util_sysctl_table); /* entries are registered under the "vm" sysctl path */
+ return 0; /* always 0: no result of the registration is checked here */
+}
+subsys_initcall(init_vm_util_sysctls);
+#endif /* CONFIG_SYSCTL */
+
/*
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
*/
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2b2ab386cab5..b620d74b0f66 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7432,6 +7432,28 @@ void __meminit kswapd_stop(int nid)
pgdat_kswapd_unlock(pgdat);
}
+static const struct ctl_table vmscan_sysctl_table[] = { /* registered under "vm" by kswapd_init() */
+ {
+ .procname = "swappiness",
+ .data = &vm_swappiness,
+ .maxlen = sizeof(vm_swappiness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO, /* accepted range: 0..200 */
+ .extra2 = SYSCTL_TWO_HUNDRED,
+ },
+#ifdef CONFIG_NUMA
+ {
+ .procname = "zone_reclaim_mode",
+ .data = &node_reclaim_mode,
+ .maxlen = sizeof(node_reclaim_mode),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO, /* only a lower bound (0) is set; no extra2 */
+ }
+#endif /* CONFIG_NUMA: zone_reclaim_mode exists only on NUMA builds */
+};
+
static int __init kswapd_init(void)
{
int nid;
@@ -7439,6 +7461,7 @@ static int __init kswapd_init(void)
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
+ register_sysctl_init("vm", vmscan_sysctl_table);
return 0;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ab5c840941f3..4c268ce39ff2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -31,8 +31,10 @@
#include "internal.h"
+#ifdef CONFIG_PROC_FS
#ifdef CONFIG_NUMA
-int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
+#define ENABLE_NUMA_STAT 1
+static int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
@@ -74,7 +76,7 @@ static void invalid_numa_statistics(void)
static DEFINE_MUTEX(vm_numa_stat_lock);
-int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
+static int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int ret, oldval;
@@ -102,6 +104,7 @@ out:
return ret;
}
#endif
+#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -1440,6 +1443,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_X86
"direct_map_level2_splits",
"direct_map_level3_splits",
+ "direct_map_level2_collapses",
+ "direct_map_level3_collapses",
#endif
#ifdef CONFIG_PER_VMA_LOCK_STATS
"vma_lock_success",
@@ -1943,7 +1948,7 @@ static const struct seq_operations vmstat_op = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
-int sysctl_stat_interval __read_mostly = HZ;
+static int sysctl_stat_interval __read_mostly = HZ;
static int vmstat_late_init_done;
#ifdef CONFIG_PROC_FS
@@ -1952,7 +1957,7 @@ static void refresh_vm_stats(struct work_struct *work)
refresh_cpu_vm_stats(true);
}
-int vmstat_refresh(const struct ctl_table *table, int write,
+static int vmstat_refresh(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
long val;
@@ -2201,6 +2206,38 @@ static int __init vmstat_late_init(void)
late_initcall(vmstat_late_init);
#endif
+#ifdef CONFIG_PROC_FS
+static const struct ctl_table vmstat_table[] = { /* registered under "vm" from init_mm_internals() */
+#ifdef CONFIG_SMP
+ {
+ .procname = "stat_interval",
+ .data = &sysctl_stat_interval,
+ .maxlen = sizeof(sysctl_stat_interval),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies, /* presumably converts between seconds and jiffies - confirm */
+ },
+ {
+ .procname = "stat_refresh",
+ .data = NULL, /* no backing variable: vmstat_refresh() does the work itself */
+ .maxlen = 0,
+ .mode = 0600, /* owner-only access, unlike the 0644 entries */
+ .proc_handler = vmstat_refresh,
+ },
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA
+ {
+ .procname = "numa_stat",
+ .data = &sysctl_vm_numa_stat,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_vm_numa_stat_handler,
+ .extra1 = SYSCTL_ZERO, /* extra1/extra2 describe the range 0..1 */
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_NUMA */
+};
+#endif /* CONFIG_PROC_FS */
+
struct workqueue_struct *mm_percpu_wq;
void __init init_mm_internals(void)
@@ -2232,6 +2269,7 @@ void __init init_mm_internals(void)
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
+ register_sysctl_init("vm", vmstat_table);
#endif
}