Diffstat (limited to 'mm')
-rw-r--r--  mm/cma.c                5
-rw-r--r--  mm/dmapool.c           15
-rw-r--r--  mm/execmem.c           40
-rw-r--r--  mm/filemap.c           39
-rw-r--r--  mm/hmm.c              262
-rw-r--r--  mm/hugetlb.c           24
-rw-r--r--  mm/kasan/Makefile       3
-rw-r--r--  mm/kasan/shadow.c      92
-rw-r--r--  mm/maccess.c            2
-rw-r--r--  mm/memcontrol.c        10
-rw-r--r--  mm/migrate.c           60
-rw-r--r--  mm/mm_init.c            6
-rw-r--r--  mm/mremap.c             3
-rw-r--r--  mm/nommu.c             18
-rw-r--r--  mm/page-writeback.c    28
-rw-r--r--  mm/page_alloc.c        16
-rw-r--r--  mm/page_io.c            3
-rw-r--r--  mm/page_table_check.c  34
-rw-r--r--  mm/readahead.c         20
-rw-r--r--  mm/shmem.c             33
-rw-r--r--  mm/show_mem.c           4
-rw-r--r--  mm/swap.h               4
-rw-r--r--  mm/swap_state.c         1
-rw-r--r--  mm/swapfile.c          11
-rw-r--r--  mm/truncate.c          20
-rw-r--r--  mm/vma.c                1
-rw-r--r--  mm/vmalloc.c           64
-rw-r--r--  mm/vmscan.c            41
28 files changed, 615 insertions, 244 deletions
diff --git a/mm/cma.c b/mm/cma.c
index ec4b9a401b7d..397567883a10 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -609,7 +609,10 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
* complain. Find the boundary by adding one to the last valid
* address.
*/
- highmem_start = __pa(high_memory - 1) + 1;
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ highmem_start = __pa(high_memory - 1) + 1;
+ else
+ highmem_start = memblock_end_of_DRAM();
pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
__func__, &size, &base, &limit, &alignment);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..5be8cc1c6529 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -56,6 +56,7 @@ struct dma_pool { /* the pool */
unsigned int size;
unsigned int allocation;
unsigned int boundary;
+ int node;
char name[32];
struct list_head pools;
};
@@ -199,12 +200,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma.
* @name: name of pool, for diagnostics
* @dev: device that will be doing the DMA
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
* Context: not in_interrupt()
*
* Given one of these pools, dma_pool_alloc()
@@ -221,8 +223,8 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
* Return: a dma allocation pool with the requested characteristics, or
* %NULL if one can't be created.
*/
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node)
{
struct dma_pool *retval;
size_t allocation;
@@ -251,7 +253,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
boundary = min(boundary, allocation);
- retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+ retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
@@ -264,6 +266,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->node = node;
INIT_LIST_HEAD(&retval->pools);
/*
@@ -295,7 +298,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
mutex_unlock(&pools_reg_lock);
return retval;
}
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
@@ -335,7 +338,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
{
struct dma_page *page;
- page = kmalloc(sizeof(*page), mem_flags);
+ page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
if (!page)
return NULL;
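For context, a hedged sketch of how callers are expected to migrate: existing dma_pool_create() users presumably keep working through a thin wrapper, while NUMA-aware drivers pass an explicit node so the pool metadata lands near the device (the wrapper's exact form in include/linux/dmapool.h is an assumption, not part of this diff):

static inline struct dma_pool *dma_pool_create(const char *name,
		struct device *dev, size_t size, size_t align, size_t boundary)
{
	/* old API: no NUMA preference for the pool/page structs */
	return dma_pool_create_node(name, dev, size, align, boundary,
				    NUMA_NO_NODE);
}

/* a NUMA-aware caller allocates the pool metadata on the device's node */
struct dma_pool *pool = dma_pool_create_node("rx-desc", dev, 256, 64, 0,
					     dev_to_node(dev));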
diff --git a/mm/execmem.c b/mm/execmem.c
index 2b683e7d864d..9720ac2dfa41 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -254,6 +254,34 @@ out_unlock:
return ptr;
}
+static bool execmem_cache_rox = false;
+
+void execmem_cache_make_ro(void)
+{
+ struct maple_tree *free_areas = &execmem_cache.free_areas;
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
+ MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
+ struct mutex *mutex = &execmem_cache.mutex;
+ void *area;
+
+ execmem_cache_rox = true;
+
+ mutex_lock(mutex);
+
+ mas_for_each(&mas_free, area, ULONG_MAX) {
+ unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
+ set_memory_ro(mas_free.index, pages);
+ }
+
+ mas_for_each(&mas_busy, area, ULONG_MAX) {
+ unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
+ set_memory_ro(mas_busy.index, pages);
+ }
+
+ mutex_unlock(mutex);
+}
+
static int execmem_cache_populate(struct execmem_range *range, size_t size)
{
unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
@@ -274,9 +302,15 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
/* fill memory with instructions that will trap */
execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
- err = set_memory_rox((unsigned long)p, vm->nr_pages);
- if (err)
- goto err_free_mem;
+ if (execmem_cache_rox) {
+ err = set_memory_rox((unsigned long)p, vm->nr_pages);
+ if (err)
+ goto err_free_mem;
+ } else {
+ err = set_memory_x((unsigned long)p, vm->nr_pages);
+ if (err)
+ goto err_free_mem;
+ }
err = execmem_cache_add(p, alloc_size);
if (err)
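A hedged sketch of the intended lifecycle: an architecture boots with the cache executable but still writable, then calls execmem_cache_make_ro() once full write protection is enabled; the mark_rodata_ro() call site below is an assumption, not taken from this diff:

void mark_rodata_ro(void)
{
	/* ... write-protect kernel text and rodata ... */

	/*
	 * From here on, existing execmem cache areas become read-only and
	 * execmem_cache_populate() switches from set_memory_x() to
	 * set_memory_rox() for new ones.
	 */
	execmem_cache_make_ro();
}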
diff --git a/mm/filemap.c b/mm/filemap.c
index bc244a2edc93..bada249b9fb7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1589,13 +1589,30 @@ int folio_wait_private_2_killable(struct folio *folio)
}
EXPORT_SYMBOL(folio_wait_private_2_killable);
+static void filemap_end_dropbehind(struct folio *folio)
+{
+ struct address_space *mapping = folio->mapping;
+
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+ if (folio_test_writeback(folio) || folio_test_dirty(folio))
+ return;
+ if (!folio_test_clear_dropbehind(folio))
+ return;
+ if (mapping)
+ folio_unmap_invalidate(mapping, folio, 0);
+}
+
/*
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and later completions should invalidate.
*/
-static void folio_end_dropbehind_write(struct folio *folio)
+static void filemap_end_dropbehind_write(struct folio *folio)
{
+ if (!folio_test_dropbehind(folio))
+ return;
+
/*
* Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
* but can happen if normal writeback just happens to find dirty folios
@@ -1604,8 +1621,7 @@ static void folio_end_dropbehind_write(struct folio *folio)
* invalidation in that case.
*/
if (in_task() && folio_trylock(folio)) {
- if (folio->mapping)
- folio_unmap_invalidate(folio->mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -1620,8 +1636,6 @@ static void folio_end_dropbehind_write(struct folio *folio)
*/
void folio_end_writeback(struct folio *folio)
{
- bool folio_dropbehind = false;
-
VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);
/*
@@ -1643,14 +1657,11 @@ void folio_end_writeback(struct folio *folio)
* reused before the folio_wake_bit().
*/
folio_get(folio);
- if (!folio_test_dirty(folio))
- folio_dropbehind = folio_test_clear_dropbehind(folio);
if (__folio_end_writeback(folio))
folio_wake_bit(folio, PG_writeback);
- acct_reclaim_writeback(folio);
- if (folio_dropbehind)
- folio_end_dropbehind_write(folio);
+ filemap_end_dropbehind_write(folio);
+ acct_reclaim_writeback(folio);
folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);
@@ -2635,16 +2646,14 @@ static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
return (pos1 >> shift == pos2 >> shift);
}
-static void filemap_end_dropbehind_read(struct address_space *mapping,
- struct folio *folio)
+static void filemap_end_dropbehind_read(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
if (folio_test_writeback(folio) || folio_test_dirty(folio))
return;
if (folio_trylock(folio)) {
- if (folio_test_clear_dropbehind(folio))
- folio_unmap_invalidate(mapping, folio, 0);
+ filemap_end_dropbehind(folio);
folio_unlock(folio);
}
}
@@ -2765,7 +2774,7 @@ put_folios:
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
- filemap_end_dropbehind_read(mapping, folio);
+ filemap_end_dropbehind_read(folio);
folio_put(folio);
}
folio_batch_init(&fbatch);
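The dropbehind folios consolidated above back uncached buffered I/O. A hedged userspace sketch of the feature from the other side (assuming a kernel and libc that expose RWF_DONTCACHE and preadv2(2)):

#define _GNU_SOURCE
#include <sys/uio.h>

/* read without leaving the data cached; the kernel drops the folios
 * once the read (and any pending writeback) completes */
static ssize_t read_dontcache(int fd, void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };

	return preadv2(fd, &iov, 1, off, RWF_DONTCACHE);
}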
diff --git a/mm/hmm.c b/mm/hmm.c
index 082f7b7c0b9e..feac86196a65 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -39,13 +41,21 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
+enum {
+ /* These flags are carried from input-to-output */
+ HMM_PFN_INOUT_FLAGS = HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA |
+ HMM_PFN_P2PDMA_BUS,
+};
+
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, unsigned long cpu_flags)
{
unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++)
- range->hmm_pfns[i] = cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= cpu_flags;
+ }
return 0;
}
@@ -202,8 +212,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -230,14 +242,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long cpu_flags;
pte_t pte = ptep_get(ptep);
uint64_t pfn_req_flags = *hmm_pfn;
+ uint64_t new_pfn_flags = 0;
if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *hmm_pfn = 0;
- return 0;
+ goto out;
}
if (!pte_present(pte)) {
@@ -253,16 +265,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
- return 0;
+ new_pfn_flags = swp_offset_pfn(entry) | cpu_flags;
+ goto out;
}
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
- if (!required_fault) {
- *hmm_pfn = 0;
- return 0;
- }
+ if (!required_fault)
+ goto out;
if (!non_swap_entry(entry))
goto fault;
@@ -304,11 +314,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_unmap(ptep);
return -EFAULT;
}
- *hmm_pfn = HMM_PFN_ERROR;
- return 0;
+ new_pfn_flags = HMM_PFN_ERROR;
+ goto out;
}
- *hmm_pfn = pte_pfn(pte) | cpu_flags;
+ new_pfn_flags = pte_pfn(pte) | cpu_flags;
+out:
+ *hmm_pfn = (*hmm_pfn & HMM_PFN_INOUT_FLAGS) | new_pfn_flags;
return 0;
fault:
@@ -448,8 +460,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
}
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- for (i = 0; i < npages; ++i, ++pfn)
- hmm_pfns[i] = pfn | cpu_flags;
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ hmm_pfns[i] |= pfn | cpu_flags;
+ }
goto out_unlock;
}
@@ -507,8 +521,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
}
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
- for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->hmm_pfns[i] = pfn | cpu_flags;
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ range->hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS;
+ range->hmm_pfns[i] |= pfn | cpu_flags;
+ }
spin_unlock(ptl);
return 0;
@@ -607,3 +623,211 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return: 0 on success, -EOPNOTSUPP if the device needs DMA syncing or is
+ * addressing-limited, -ENOMEM on allocation failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ WARN_ON_ONCE(!(nr_entries * PAGE_SIZE / dma_entry_size));
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
+/**
+ * hmm_dma_map_free - Free HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and dma address arrays
+ * @p2pdma_state: PCI P2P state.
+ *
+ * dma_iova_try_alloc() allocates IOVA space sized to cover the whole map.
+ * Call this function after that allocation to link the page at @idx into
+ * the IOVA space and obtain its DMA address. Each page is linked at the
+ * offset idx * dma_entry_size within the range, so the very first page
+ * lands at offset 0 and every subsequent page at the next dma_entry_size
+ * boundary.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx,
+ struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ unsigned long attrs = 0;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when page was already linked in prefetch call
+ * with READ flag and now we need to add WRITE flag
+ *
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+ * was a mapping. But !dma_need_unmap implies that the
+ * mapping is stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+ /* Continue to remapping */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ pfns[idx] |= HMM_PFN_P2PDMA;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED;
+ return pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL,
+ attrs);
+ if (ret)
+ goto error;
+
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size);
+ if (ret) {
+ dma_iova_unlink(dev, state, offset, map->dma_entry_size,
+ DMA_BIDIRECTIONAL, attrs);
+ goto error;
+ }
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ goto error;
+
+ dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, dma_addr))
+ goto error;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+error:
+ pfns[idx] &= ~HMM_PFN_P2PDMA;
+ return DMA_MAPPING_ERROR;
+
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Return: true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ const unsigned long valid_dma = HMM_PFN_VALID | HMM_PFN_DMA_MAPPED;
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ unsigned long attrs = 0;
+
+ if ((pfns[idx] & valid_dma) != valid_dma)
+ return false;
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state)) {
+ if (pfns[idx] & HMM_PFN_P2PDMA)
+ attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, attrs);
+ } else if (dma_need_unmap(dev))
+ dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+
+ pfns[idx] &=
+ ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
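A hedged end-to-end sketch of the new API; the driver-side glue (range setup, error codes) is schematic, only the hmm_dma_* calls come from this diff:

struct hmm_dma_map map;
struct pci_p2pdma_map_state p2pdma_state = {};
size_t i;
int ret;

ret = hmm_dma_map_alloc(dev, &map, nr_entries, PAGE_SIZE);
if (ret)
	return ret;

range.hmm_pfns = map.pfn_list;	/* hmm_range_fault() fills these */
ret = hmm_range_fault(&range);
if (ret)
	goto free;

for (i = 0; i < nr_entries; i++) {
	dma_addr_t dma = hmm_dma_map_pfn(dev, &map, i, &p2pdma_state);

	if (dma == DMA_MAPPING_ERROR) {
		ret = -EIO;
		goto unmap;
	}
	/* program 'dma' into the device's page tables */
}
return 0;

unmap:
while (i--)
	hmm_dma_unmap_pfn(dev, &map, i);
free:
hmm_dma_map_free(dev, &map);
return ret;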
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5a0bf1ea48f9..f0b1d53079f9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1251,7 +1251,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
/*
* Reset and decrement one ref on hugepage private reservation.
* Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should only be used by mremap() and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
*/
@@ -2924,12 +2924,20 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
while (start_pfn < end_pfn) {
folio = pfn_folio(start_pfn);
+
+ /*
+ * The folio might have been dissolved from under our feet, so make sure
+ * to carefully check the state under the lock.
+ */
+ spin_lock_irq(&hugetlb_lock);
if (folio_test_hugetlb(folio)) {
h = folio_hstate(folio);
} else {
+ spin_unlock_irq(&hugetlb_lock);
start_pfn++;
continue;
}
+ spin_unlock_irq(&hugetlb_lock);
if (!folio_ref_count(folio)) {
ret = alloc_and_dissolve_hugetlb_folio(h, folio,
@@ -7932,3 +7940,17 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
ALIGN_DOWN(vma->vm_end, PUD_SIZE));
}
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+ if (is_vm_hugetlb_page(vma))
+ clear_vma_resv_huge_pages(vma);
+}
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1a958e7c8a46..dd93ae8a6beb 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) $(call cc-disable-warning, vla)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN)
ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
# If compiler instruments memintrinsics by prefixing them with __asan/__hwasan,
# we need to treat them normally (as builtins), otherwise the compiler won't
@@ -44,6 +44,7 @@ ifndef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX
CFLAGS_KASAN_TEST += -fno-builtin
endif
+CFLAGS_REMOVE_kasan_test_c.o += $(call cc-option, -Wvla-larger-than=1)
CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST)
RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN)
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 88d1c9dcb507..d2c70cd2afb1 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -292,33 +292,99 @@ void __init __weak kasan_populate_early_vm_area_shadow(void *start,
{
}
+struct vmalloc_populate_data {
+ unsigned long start;
+ struct page **pages;
+};
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
+ void *_data)
{
- unsigned long page;
+ struct vmalloc_populate_data *data = _data;
+ struct page *page;
pte_t pte;
+ int index;
if (likely(!pte_none(ptep_get(ptep))))
return 0;
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+ index = PFN_DOWN(addr - data->start);
+ page = data->pages[index];
+ __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);
spin_lock(&init_mm.page_table_lock);
if (likely(pte_none(ptep_get(ptep)))) {
set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
+ data->pages[index] = NULL;
}
spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
+
+ return 0;
+}
+
+static void ___free_pages_bulk(struct page **pages, int nr_pages)
+{
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i]) {
+ __free_pages(pages[i], 0);
+ pages[i] = NULL;
+ }
+ }
+}
+
+static int ___alloc_pages_bulk(struct page **pages, int nr_pages)
+{
+ unsigned long nr_populated, nr_total = nr_pages;
+ struct page **page_array = pages;
+
+ while (nr_pages) {
+ nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages);
+ if (!nr_populated) {
+ ___free_pages_bulk(page_array, nr_total - nr_pages);
+ return -ENOMEM;
+ }
+ pages += nr_populated;
+ nr_pages -= nr_populated;
+ }
+
return 0;
}
+static int __kasan_populate_vmalloc(unsigned long start, unsigned long end)
+{
+ unsigned long nr_pages, nr_total = PFN_UP(end - start);
+ struct vmalloc_populate_data data;
+ int ret = 0;
+
+ data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!data.pages)
+ return -ENOMEM;
+
+ while (nr_total) {
+ nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0]));
+ ret = ___alloc_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ data.start = start;
+ ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE,
+ kasan_populate_vmalloc_pte, &data);
+ ___free_pages_bulk(data.pages, nr_pages);
+ if (ret)
+ break;
+
+ start += nr_pages * PAGE_SIZE;
+ nr_total -= nr_pages;
+ }
+
+ free_page((unsigned long)data.pages);
+
+ return ret;
+}
+
int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
{
unsigned long shadow_start, shadow_end;
@@ -348,9 +414,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
shadow_start = PAGE_ALIGN_DOWN(shadow_start);
shadow_end = PAGE_ALIGN(shadow_end);
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
+ ret = __kasan_populate_vmalloc(shadow_start, shadow_end);
if (ret)
return ret;
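A hedged arithmetic note on the batch sizing above (assuming 4 KiB pages and 8-byte pointers):

/*
 * nr_pages per batch = PAGE_SIZE / sizeof(struct page *) = 4096 / 8 = 512,
 * i.e. up to 2 MiB of shadow populated per iteration, which at generic
 * KASAN's 1/8 shadow-to-memory ratio covers 16 MiB of vmalloc space.
 */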
diff --git a/mm/maccess.c b/mm/maccess.c
index 8f0906180a94..831b4dd7296c 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -196,7 +196,7 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
if (ret >= count) {
ret = count;
dst[ret - 1] = '\0';
- } else if (ret > 0) {
+ } else if (ret >= 0) {
ret++;
}
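A hedged example of the off-by-one this fixes: strncpy_from_user() reports the string length excluding the NUL, so an empty source string returns 0 and the old 'ret > 0' test never counted the copied terminator (empty_uaddr below names a hypothetical user pointer to ""):

char dst[8];
long n = strncpy_from_user_nofault(dst, empty_uaddr, sizeof(dst));
/* before the fix: n == 0; after: n == 1 (the copied NUL is counted) */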
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4e9771e6e340..902da8a9c643 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -575,7 +575,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val,
/* TODO: add to cgroup update tree once it is nmi-safe. */
if (!in_nmi())
- cgroup_rstat_updated(memcg->css.cgroup, cpu);
+ css_rstat_updated(&memcg->css, cpu);
statc_pcpu = memcg->vmstats_percpu;
for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
statc = this_cpu_ptr(statc_pcpu);
@@ -610,7 +610,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force)
if (mem_cgroup_is_root(memcg))
WRITE_ONCE(flush_last_time, jiffies_64);
- cgroup_rstat_flush(memcg->css.cgroup);
+ css_rstat_flush(&memcg->css);
}
/*
@@ -1157,7 +1157,6 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
struct mem_cgroup *iter;
int ret = 0;
- int i = 0;
BUG_ON(mem_cgroup_is_root(memcg));
@@ -1167,10 +1166,9 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it))) {
- /* Avoid potential softlockup warning */
- if ((++i & 1023) == 0)
- cond_resched();
ret = fn(task, arg);
+ /* Avoid potential softlockup warning */
+ cond_resched();
}
css_task_iter_end(&it);
if (ret) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 784ac2256d08..8cf0f9c9599d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -934,66 +934,20 @@ int filemap_migrate_folio(struct address_space *mapping,
EXPORT_SYMBOL_GPL(filemap_migrate_folio);
/*
- * Writeback a folio to clean the dirty state
- */
-static int writeout(struct address_space *mapping, struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- .nr_to_write = 1,
- .range_start = 0,
- .range_end = LLONG_MAX,
- .for_reclaim = 1
- };
- int rc;
-
- if (!mapping->a_ops->writepage)
- /* No write method for the address space */
- return -EINVAL;
-
- if (!folio_clear_dirty_for_io(folio))
- /* Someone else already triggered a write */
- return -EAGAIN;
-
- /*
- * A dirty folio may imply that the underlying filesystem has
- * the folio on some queue. So the folio must be clean for
- * migration. Writeout may mean we lose the lock and the
- * folio state is no longer what we checked for earlier.
- * At this point we know that the migration attempt cannot
- * be successful.
- */
- remove_migration_ptes(folio, folio, 0);
-
- rc = mapping->a_ops->writepage(&folio->page, &wbc);
-
- if (rc != AOP_WRITEPAGE_ACTIVATE)
- /* unlocked. Relock */
- folio_lock(folio);
-
- return (rc < 0) ? -EIO : -EAGAIN;
-}
-
-/*
* Default handling if a filesystem does not provide a migration function.
*/
static int fallback_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (folio_test_dirty(src)) {
- /* Only writeback folios in full synchronous migration */
- switch (mode) {
- case MIGRATE_SYNC:
- break;
- default:
- return -EBUSY;
- }
- return writeout(mapping, src);
- }
+ WARN_ONCE(mapping->a_ops->writepages,
+ "%ps does not implement migrate_folio\n",
+ mapping->a_ops);
+ if (folio_test_dirty(src))
+ return -EBUSY;
/*
- * Buffers may be managed in a filesystem specific way.
- * We must have no buffers or drop them.
+ * Filesystem may have private data at folio->private that we
+ * can't migrate automatically.
*/
if (!filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
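A hedged sketch of how a filesystem avoids both the new WARN and the -EBUSY fallback: wire a migration method into its address_space operations, typically one of the generic helpers (the myfs_* names are placeholders):

static const struct address_space_operations myfs_aops = {
	.writepages	= myfs_writepages,
	/* lets migration move folios without hitting the fallback */
	.migrate_folio	= filemap_migrate_folio,
	.dirty_folio	= filemap_dirty_folio,
};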
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 8684fa851b84..f2944748f526 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2666,12 +2666,6 @@ static void __init report_meminit(void)
stack = "all(pattern)";
else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
stack = "all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
- stack = "byref_all(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
- stack = "byref(zero)";
- else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
- stack = "__user(zero)";
else
stack = "off";
diff --git a/mm/mremap.c b/mm/mremap.c
index 6e78e02f74bd..83e359754961 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1188,8 +1188,7 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
mremap_userfaultfd_prep(new_vma, vrm->uf);
}
- if (is_vm_hugetlb_page(vma))
- clear_vma_resv_huge_pages(vma);
+ fixup_hugetlb_reservations(vma);
*new_vma_ptr = new_vma;
return err;
diff --git a/mm/nommu.c b/mm/nommu.c
index 0bf4849b8204..b624acec6d2e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -200,7 +200,23 @@ void *vmalloc_noprof(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_noprof);
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
+/*
+ * vmalloc_huge_node - allocate virtually contiguous memory, on a node
+ *
+ * @size: allocation size
+ * @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Due to NOMMU implications the node argument and HUGE page attribute are
+ * ignored.
+ */
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
+{
+ return __vmalloc_noprof(size, gfp_mask);
+}
/*
* vzalloc - allocate virtually contiguous memory with zero fill
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e8641aa1043e..b8eea5b3c064 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2622,27 +2622,6 @@ int write_cache_pages(struct address_space *mapping,
}
EXPORT_SYMBOL(write_cache_pages);
-static int writeback_use_writepage(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- struct folio *folio = NULL;
- struct blk_plug plug;
- int err;
-
- blk_start_plug(&plug);
- while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
- err = mapping->a_ops->writepage(&folio->page, wbc);
- if (err == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- err = 0;
- }
- mapping_set_error(mapping, err);
- }
- blk_finish_plug(&plug);
-
- return err;
-}
-
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
@@ -2653,14 +2632,11 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
wb = inode_to_wb_wbc(mapping->host, wbc);
wb_bandwidth_estimate_start(wb);
while (1) {
- if (mapping->a_ops->writepages) {
+ if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
- } else if (mapping->a_ops->writepage) {
- ret = writeback_use_writepage(mapping, wbc);
- } else {
+ else
/* deal with chardevs and other special files */
ret = 0;
- }
if (ret != -ENOMEM || wbc->sync_mode != WB_SYNC_ALL)
break;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d578c6f7ee29..2ef3c07266b3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -898,9 +898,7 @@ static inline bool page_expected_state(struct page *page,
#ifdef CONFIG_MEMCG
page->memcg_data |
#endif
-#ifdef CONFIG_PAGE_POOL
- ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) |
-#endif
+ page_pool_page_is_pp(page) |
(page->flags & check_flags)))
return false;
@@ -927,10 +925,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
if (unlikely(page->memcg_data))
bad_reason = "page still charged to cgroup";
#endif
-#ifdef CONFIG_PAGE_POOL
- if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE))
+ if (unlikely(page_pool_page_is_pp(page)))
bad_reason = "page_pool leak";
-#endif
return bad_reason;
}
@@ -4551,6 +4547,14 @@ restart:
}
retry:
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * infinite retries.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
+
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
diff --git a/mm/page_io.c b/mm/page_io.c
index 4bce19df557b..f7716b6569fa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -237,9 +237,8 @@ static void swap_zeromap_folio_clear(struct folio *folio)
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
*/
-int swap_writepage(struct page *page, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
int ret;
if (folio_free_swap(folio)) {
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 68109ee93841..4eeca782b888 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -218,33 +218,39 @@ static inline void page_table_check_pmd_flags(pmd_t pmd)
WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}
-void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
+void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd,
+ unsigned int nr)
{
+ unsigned long stride = PMD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
page_table_check_pmd_flags(pmd);
- __page_table_check_pmd_clear(mm, *pmdp);
- if (pmd_user_accessible_page(pmd)) {
- page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
- pmd_write(pmd));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pmd_clear(mm, *(pmdp + i));
+ if (pmd_user_accessible_page(pmd))
+ page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd));
}
-EXPORT_SYMBOL(__page_table_check_pmd_set);
+EXPORT_SYMBOL(__page_table_check_pmds_set);
-void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
+void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud,
+ unsigned int nr)
{
+ unsigned long stride = PUD_SIZE >> PAGE_SHIFT;
+ unsigned int i;
+
if (&init_mm == mm)
return;
- __page_table_check_pud_clear(mm, *pudp);
- if (pud_user_accessible_page(pud)) {
- page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
- pud_write(pud));
- }
+ for (i = 0; i < nr; i++)
+ __page_table_check_pud_clear(mm, *(pudp + i));
+ if (pud_user_accessible_page(pud))
+ page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud));
}
-EXPORT_SYMBOL(__page_table_check_pud_set);
+EXPORT_SYMBOL(__page_table_check_puds_set);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
unsigned long addr,
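A hedged sketch of the header-side wrapper the renamed functions imply, assuming it mirrors the existing single-entry pattern around the page_table_check_disabled static key (the exact form in include/linux/page_table_check.h is not part of this diff):

static inline void page_table_check_pmds_set(struct mm_struct *mm,
					     pmd_t *pmdp, pmd_t pmd,
					     unsigned int nr)
{
	if (static_branch_likely(&page_table_check_disabled))
		return;

	__page_table_check_pmds_set(mm, pmdp, pmd, nr);
}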
diff --git a/mm/readahead.c b/mm/readahead.c
index 6a4e96b69702..20d36d6b055e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -690,9 +690,15 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
+ struct file *file;
+ const struct inode *inode;
+
CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ file = fd_file(f);
+ if (!(file->f_mode & FMODE_READ))
return -EBADF;
/*
@@ -700,9 +706,15 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
- (!S_ISREG(file_inode(fd_file(f))->i_mode) &&
- !S_ISBLK(file_inode(fd_file(f))->i_mode)))
+ if (!file->f_mapping)
+ return -EINVAL;
+ if (!file->f_mapping->a_ops)
+ return -EINVAL;
+
+ inode = file_inode(file);
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ return -EINVAL;
+ if (IS_ANON_FILE(inode))
return -EINVAL;
return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
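A hedged userspace sketch of the syscall these reordered checks guard:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	int fd = open("data.bin", O_RDONLY);

	/* fails with EBADF for a bad or unreadable fd, EINVAL for files
	 * that cannot do readahead (now including anonymous files) */
	if (fd < 0 || readahead(fd, 0, 1 << 20) < 0)
		perror("readahead");
	return 0;
}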
diff --git a/mm/shmem.c b/mm/shmem.c
index 4b42419ce6b2..0c5fb4ffa03a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -98,7 +98,7 @@ static struct vfsmount *shm_mnt __ro_after_init;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * shmem_fallocate communicates with shmem_fault or shmem_writeout via
* inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
@@ -107,7 +107,7 @@ struct shmem_falloc {
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
- pgoff_t nr_unswapped; /* how often writepage refused to swap out */
+ pgoff_t nr_unswapped; /* how often writeout refused to swap out */
};
struct shmem_options {
@@ -446,7 +446,7 @@ static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
/*
* Special case: whereas normally shmem_recalc_inode() is called
* after i_mapping->nrpages has already been adjusted (up or down),
- * shmem_writepage() has to raise swapped before nrpages is lowered -
+ * shmem_writeout() has to raise swapped before nrpages is lowered -
* to stop a racing shmem_recalc_inode() from thinking that a page has
* been freed. Compensate here, to avoid the need for a followup call.
*/
@@ -1537,12 +1537,15 @@ start_over:
return error;
}
-/*
- * Move the page from the page cache to the swap cache.
+/**
+ * shmem_writeout - Write the folio to swap
+ * @folio: The folio to write
+ * @wbc: How writeback is to be done
+ *
+ * Move the folio from the page cache to the swap cache.
*/
-static int shmem_writepage(struct page *page, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1551,13 +1554,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
int nr_pages;
bool split = false;
- /*
- * Our capabilities prevent regular writeback or sync from ever calling
- * shmem_writepage; but a stacking filesystem might use ->writepage of
- * its underlying filesystem, in which case tmpfs should write out to
- * swap only in response to memory pressure, and not for the writeback
- * threads or sync.
- */
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
@@ -1587,9 +1583,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page_to_list_to_order(page, wbc->list, 0))
+ if (split_folio_to_list(folio, wbc->list))
goto redirty;
- folio = page_folio(page);
folio_clear_dirty(folio);
}
@@ -1647,7 +1642,7 @@ try_split:
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(folio_mapped(folio));
- return swap_writepage(&folio->page, wbc);
+ return swap_writeout(folio, wbc);
}
if (!info->swapped)
list_del_init(&info->swaplist);
@@ -1661,6 +1656,7 @@ redirty:
folio_unlock(folio);
return 0;
}
+EXPORT_SYMBOL_GPL(shmem_writeout);
#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
@@ -3771,7 +3767,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
index--;
/*
- * Inform shmem_writepage() how far we have reached.
+ * Inform shmem_writeout() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
if (!folio_test_uptodate(folio))
@@ -5194,7 +5190,6 @@ static int shmem_error_remove_folio(struct address_space *mapping,
}
static const struct address_space_operations shmem_aops = {
- .writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 03e8d968fd1a..0cf8bf5d832d 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -217,7 +217,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
global_node_page_state(NR_SECONDARY_PAGETABLE),
- global_zone_page_state(NR_BOUNCE),
+ 0UL,
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
@@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_BOUNCE)),
+ 0UL,
K(free_pcp),
K(this_cpu_read(zone->per_cpu_pageset->count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
diff --git a/mm/swap.h b/mm/swap.h
index 521bf510ec75..2269eb9df0af 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -20,7 +20,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
__swap_read_unplug(plug);
}
void swap_write_unplug(struct swap_iocb *sio);
-int swap_writepage(struct page *page, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct writeback_control *wbc);
void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
@@ -141,7 +141,7 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
return NULL;
}
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline int swap_writeout(struct folio *f, struct writeback_control *wbc)
{
return 0;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ac4e0994931c..c354435a0923 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -30,7 +30,6 @@
* vmscan's shrink_folio_list.
*/
static const struct address_space_operations swap_aops = {
- .writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migrate_folio = migrate_folio,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 026090bf3efe..68ce283e84be 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2371,7 +2371,7 @@ retry:
* Limit the number of retries? No: when mmget_not_zero()
* above fails, that mm is likely to be freeing swap from
* exit_mmap(), which proceeds at its own independent pace;
- * and even shmem_writepage() could have been preempted after
+ * and even shmem_writeout() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
@@ -3335,6 +3335,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/*
+ * The swap subsystem needs a major overhaul to support this.
+ * It doesn't work yet so just disable it for now.
+ */
+ if (mapping_min_folio_order(mapping) > 0) {
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ /*
* Read the swap header.
*/
if (!mapping->a_ops->read_folio) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 3c1bcdc3a3e9..91eb92a5ce4f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -191,6 +191,7 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
+ size_t size = folio_size(folio);
unsigned int offset, length;
struct page *split_at, *split_at2;
@@ -198,14 +199,13 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
offset = start - pos;
else
offset = 0;
- length = folio_size(folio);
- if (pos + length <= (u64)end)
- length = length - offset;
+ if (pos + size <= (u64)end)
+ length = size - offset;
else
length = end + 1 - pos - offset;
folio_wait_writeback(folio);
- if (length == folio_size(folio)) {
+ if (length == size) {
truncate_inode_folio(folio->mapping, folio);
return true;
}
@@ -224,16 +224,20 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
return true;
split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
- split_at2 = folio_page(folio,
- PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
-
if (!try_folio_split(folio, split_at, NULL)) {
/*
* try to split at offset + length to make sure folios within
* the range can be dropped, especially to avoid memory waste
* for shmem truncate
*/
- struct folio *folio2 = page_folio(split_at2);
+ struct folio *folio2;
+
+ if (offset + length == size)
+ goto no_split;
+
+ split_at2 = folio_page(folio,
+ PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
+ folio2 = page_folio(split_at2);
if (!folio_try_get(folio2))
goto no_split;
diff --git a/mm/vma.c b/mm/vma.c
index 3ff6cfbe3338..1c6595f282e5 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1874,6 +1874,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
+ fixup_hugetlb_reservations(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 49df04e1fbe1..ab986dd09b6a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -104,6 +104,9 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
@@ -127,6 +130,8 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -350,12 +355,30 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
pte_t *pte;
+ pte_t ptent;
+ unsigned long size = PAGE_SIZE;
pte = pte_offset_kernel(pmd, addr);
+ arch_enter_lazy_mmu_mode();
+
do {
- pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_unmap_size(addr, pte);
+ if (size != PAGE_SIZE) {
+ if (WARN_ON(!IS_ALIGNED(addr, size))) {
+ addr = ALIGN_DOWN(addr, size);
+ pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
+ }
+ ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
+ if (WARN_ON(end - addr < size))
+ size = end - addr;
+ } else
+#endif
+ ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
}
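The arch_vmap_pte_range_unmap_size() hook used above presumably mirrors the existing arch_vmap_pte_range_map_size() convention; a hedged sketch of the generic fallback (the actual header definition is an assumption):

#ifndef arch_vmap_pte_range_unmap_size
static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr,
							   pte_t *ptep)
{
	/* no contiguous/huge PTE mappings: always unmap a single page */
	return PAGE_SIZE;
}
#endif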
@@ -374,8 +397,10 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
if (cleared || pmd_bad(*pmd))
*mask |= PGTBL_PMD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PMD_SIZE);
continue;
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next, mask);
@@ -399,8 +424,10 @@ static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
if (cleared || pud_bad(*pud))
*mask |= PGTBL_PUD_MODIFIED;
- if (cleared)
+ if (cleared) {
+ WARN_ON(next - addr < PUD_SIZE);
continue;
+ }
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next, mask);
@@ -497,6 +524,9 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
+
+ arch_enter_lazy_mmu_mode();
+
do {
struct page *page = pages[*nr];
@@ -510,6 +540,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+
+ arch_leave_lazy_mmu_mode();
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -3952,9 +3984,10 @@ void *vmalloc_noprof(unsigned long size)
EXPORT_SYMBOL(vmalloc_noprof);
/**
- * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages
* @size: allocation size
* @gfp_mask: flags for the page level allocator
+ * @node: node to use for allocation or NUMA_NO_NODE
*
* Allocate enough pages to cover @size from the page level
* allocator and map them into contiguous kernel virtual space.
@@ -3963,13 +3996,13 @@ EXPORT_SYMBOL(vmalloc_noprof);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node)
{
return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
- gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
- NUMA_NO_NODE, __builtin_return_address(0));
+ gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+ node, __builtin_return_address(0));
}
-EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
+EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof);
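A hedged usage sketch (the vmalloc_huge_node() wrapper over the _noprof symbol is assumed to follow the usual alloc_hooks pattern):

/* a large, node-local buffer that may be mapped with huge pages */
void *buf = vmalloc_huge_node(16 * 1024 * 1024, GFP_KERNEL, node);

if (!buf)
	return -ENOMEM;
/* ... use buf ... */
vfree(buf);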
/**
* vzalloc - allocate virtually contiguous memory with zero fill
@@ -4101,8 +4134,8 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
* would be a good heuristic for when to shrink the vm_area?
*/
if (size <= old_size) {
- /* Zero out "freed" memory. */
- if (want_init_on_free())
+ /* Zero out "freed" memory, potentially for future realloc. */
+ if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
vm->requested_size = size;
kasan_poison_vmalloc(p + size, old_size - size);
@@ -4115,10 +4148,13 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
if (size <= alloced_size) {
kasan_unpoison_vmalloc(p + old_size, size - old_size,
KASAN_VMALLOC_PROT_NORMAL);
- /* Zero out "alloced" memory. */
- if (want_init_on_alloc(flags))
- memset((void *)p + old_size, 0, size - old_size);
+ /*
+ * No need to zero memory here, as unused memory will have
+ * already been zeroed at initial allocation time or during
+ * realloc shrink time.
+ */
vm->requested_size = size;
+ return (void *)p;
}
/* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
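A hedged example of the invariant the two vrealloc hunks maintain: memory beyond the requested size is zeroed at shrink time, so a later grow within the same allocation never needs re-zeroing:

void *p = vmalloc(64);

memset(p, 0xff, 64);
/* shrink: the discarded tail (bytes 16..63) is zeroed right here when
 * init_on_free/init_on_alloc is enabled or __GFP_ZERO is passed */
p = vrealloc(p, 16, GFP_KERNEL | __GFP_ZERO);
/* grow back: no memset needed, the tail is known to be zero already */
p = vrealloc(p, 64, GFP_KERNEL | __GFP_ZERO);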
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 07c51fa03434..f8dfd2864bbf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -654,21 +654,20 @@ typedef enum {
/*
* pageout is called by shrink_folio_list() for each dirty folio.
- * Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
struct swap_iocb **plug, struct list_head *folio_list)
{
+ int (*writeout)(struct folio *, struct writeback_control *);
+
/*
- * If the folio is dirty, only perform writeback if that write
- * will be non-blocking. To prevent this allocation from being
- * stalled by pagecache activity. But note that there may be
- * stalls if we need to run get_block(). We could test
- * PagePrivate for that.
- *
- * If this process is currently in __generic_file_write_iter() against
- * this folio's queue, we can perform writeback even if that
- * will block.
+ * We no longer attempt to writeback filesystem folios here, other
+ * than tmpfs/shmem. That's taken care of in page-writeback.
+ * If we find a dirty filesystem folio at the end of the LRU list,
+ * typically that means the filesystem is saturating the storage
+ * with contiguous writes and telling it to write a folio here
+ * would only make the situation worse by injecting an element
+ * of random access.
*
* If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
@@ -691,7 +690,11 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
return PAGE_KEEP;
}
- if (mapping->a_ops->writepage == NULL)
+ if (shmem_mapping(mapping))
+ writeout = shmem_writeout;
+ else if (folio_test_anon(folio))
+ writeout = swap_writeout;
+ else
return PAGE_ACTIVATE;
if (folio_clear_dirty_for_io(folio)) {
@@ -714,7 +717,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
wbc.list = folio_list;
folio_set_reclaim(folio);
- res = mapping->a_ops->writepage(&folio->page, &wbc);
+ res = writeout(folio, &wbc);
if (res < 0)
handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
@@ -723,7 +726,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
if (!folio_test_writeback(folio)) {
- /* synchronous write or broken a_ops? */
+ /* synchronous write? */
folio_clear_reclaim(folio);
}
trace_mm_vmscan_write_folio(folio);
@@ -1194,8 +1197,10 @@ retry:
* 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the folio for immediate
- * reclaim and continue scanning.
+ * not to fs), or the folio belongs to a mapping where
+ * waiting on writeback during reclaim may lead to a deadlock.
+ * In this case mark the folio for immediate reclaim and
+ * continue scanning.
*
* Require may_enter_fs() because we would wait on fs, which
* may not have submitted I/O yet. And the loop driver might
@@ -1220,6 +1225,8 @@ retry:
* takes to write them to disk.
*/
if (folio_test_writeback(folio)) {
+ mapping = folio_mapping(folio);
+
/* Case 1 above */
if (current_is_kswapd() &&
folio_test_reclaim(folio) &&
@@ -1230,7 +1237,9 @@ retry:
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
!folio_test_reclaim(folio) ||
- !may_enter_fs(folio, sc->gfp_mask)) {
+ !may_enter_fs(folio, sc->gfp_mask) ||
+ (mapping &&
+ mapping_writeback_may_deadlock_on_reclaim(mapping))) {
/*
* This is slightly racy -
* folio_end_writeback() might have