Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               6
-rw-r--r--  mm/Makefile              1
-rw-r--r--  mm/damon/core.c          5
-rw-r--r--  mm/fadvise.c            10
-rw-r--r--  mm/filemap.c            21
-rw-r--r--  mm/gup.c               140
-rw-r--r--  mm/huge_memory.c         4
-rw-r--r--  mm/ksm.c                 3
-rw-r--r--  mm/memcontrol-v1.c      44
-rw-r--r--  mm/migrate.c             4
-rw-r--r--  mm/mremap.c              2
-rw-r--r--  mm/nommu.c               2
-rw-r--r--  mm/page-writeback.c      4
-rw-r--r--  mm/page_alloc.c        154
-rw-r--r--  mm/page_frag_cache.c   171
-rw-r--r--  mm/readahead.c          17
-rw-r--r--  mm/shmem.c             269
-rw-r--r--  mm/slab_common.c        50
-rw-r--r--  mm/swap.c               14
-rw-r--r--  mm/swapfile.c           24
-rw-r--r--  mm/truncate.c           16
21 files changed, 669 insertions, 292 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 33fa51d608dc..84000b016808 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1295,6 +1295,12 @@ config NUMA_EMU
into virtual nodes when booted with "numa=fake=N", where N is the
number of nodes. This is only useful for debugging.
+config ARCH_HAS_USER_SHADOW_STACK
+ bool
+ help
+ The architecture has hardware support for userspace shadow call
+ stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index d5639b036166..dba52bb0da8a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -65,6 +65,7 @@ page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-y += page-alloc.o
+obj-y += page_frag_cache.o
obj-y += init-mm.o
obj-y += memblock.o
obj-y += $(memory-hotplug-y)
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 511c3f61ab44..8b8e2933dcd4 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1906,11 +1906,10 @@ static unsigned long damos_wmark_wait_us(struct damos *scheme)
static void kdamond_usleep(unsigned long usecs)
{
- /* See Documentation/timers/timers-howto.rst for the thresholds */
- if (usecs > 20 * USEC_PER_MSEC)
+ if (usecs >= USLEEP_RANGE_UPPER_BOUND)
schedule_timeout_idle(usecs_to_jiffies(usecs));
else
- usleep_idle_range(usecs, usecs + 1);
+ usleep_range_idle(usecs, usecs + 1);
}
/* Returns negative error code if it's not activated but should return */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 532dee205c6e..588fe76c5a14 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -190,16 +190,12 @@ EXPORT_SYMBOL(vfs_fadvise);
int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
- struct fd f = fdget(fd);
- int ret;
+ CLASS(fd, f)(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- ret = vfs_fadvise(fd_file(f), offset, len, advice);
-
- fdput(f);
- return ret;
+ return vfs_fadvise(fd_file(f), offset, len, advice);
}
SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
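
The fadvise, filemap, readahead and memcontrol-v1 hunks in this series all replace manual fdget()/fdput() pairing with the scope-based CLASS(fd, f)(fd) guard, so every early return drops the file reference automatically. A minimal userspace sketch of the same scope-based-cleanup idea, using the compiler's cleanup attribute rather than the kernel's cleanup.h machinery; the helper and file names here are illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* close the descriptor when the variable goes out of scope */
static void auto_close(int *fdp)
{
	if (*fdp >= 0)
		close(*fdp);
}

static long file_size_hint(const char *path)
{
	int fd __attribute__((cleanup(auto_close))) = open(path, O_RDONLY);

	if (fd < 0)
		return -1;	/* early return: no explicit close() needed */

	return (long)lseek(fd, 0, SEEK_END);
}

int main(void)
{
	printf("%ld\n", file_size_hint("/etc/hostname"));
	return 0;
}
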
diff --git a/mm/filemap.c b/mm/filemap.c
index e582a1545d2a..7c76a123ba18 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2619,12 +2619,14 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
loff_t isize, end_offset;
loff_t last_pos = ra->prev_pos;
+ if (unlikely(iocb->ki_pos < 0))
+ return -EINVAL;
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
if (unlikely(!iov_iter_count(iter)))
return 0;
- iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
+ iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
folio_batch_init(&fbatch);
do {
@@ -4420,31 +4422,25 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
struct cachestat_range __user *, cstat_range,
struct cachestat __user *, cstat, unsigned int, flags)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct address_space *mapping;
struct cachestat_range csr;
struct cachestat cs;
pgoff_t first_index, last_index;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
if (copy_from_user(&csr, cstat_range,
- sizeof(struct cachestat_range))) {
- fdput(f);
+ sizeof(struct cachestat_range)))
return -EFAULT;
- }
/* hugetlbfs is not supported */
- if (is_file_hugepages(fd_file(f))) {
- fdput(f);
+ if (is_file_hugepages(fd_file(f)))
return -EOPNOTSUPP;
- }
- if (flags != 0) {
- fdput(f);
+ if (flags != 0)
return -EINVAL;
- }
first_index = csr.off >> PAGE_SHIFT;
last_index =
@@ -4452,7 +4448,6 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
memset(&cs, 0, sizeof(struct cachestat));
mapping = fd_file(f)->f_mapping;
filemap_cachestat(mapping, first_index, last_index, &cs);
- fdput(f);
if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
return -EFAULT;
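
The filemap_read() hunk above rejects a negative ki_pos and then truncates the iterator to the space remaining below s_maxbytes rather than to s_maxbytes itself, because the limit applies to the end offset of the read, not to its length. A small sketch with made-up numbers showing the difference between the two clamps:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t s_maxbytes = 1ULL << 20;	/* assumed per-fs size limit */
	uint64_t ki_pos = (1ULL << 20) - 512;	/* read starts 512 bytes below it */
	uint64_t count = 4096;			/* bytes requested */

	uint64_t old_clamp = count < s_maxbytes ? count : s_maxbytes;
	uint64_t new_clamp = count < s_maxbytes - ki_pos ? count : s_maxbytes - ki_pos;

	/* the old clamp lets the read end past s_maxbytes, the new one does not */
	printf("old end %llu, new end %llu, limit %llu\n",
	       (unsigned long long)(ki_pos + old_clamp),
	       (unsigned long long)(ki_pos + new_clamp),
	       (unsigned long long)s_maxbytes);
	return 0;
}
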
diff --git a/mm/gup.c b/mm/gup.c
index 28ae330ec4dd..746070a1d8bf 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2273,20 +2273,57 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
#ifdef CONFIG_MIGRATION
+
+/*
+ * An array of either pages or folios ("pofs"). Although it may seem tempting to
+ * avoid this complication, by simply interpreting a list of folios as a list of
+ * pages, that approach won't work in the longer term, because eventually the
+ * layouts of struct page and struct folio will become completely different.
+ * Furthermore, this pof approach avoids excessive page_folio() calls.
+ */
+struct pages_or_folios {
+ union {
+ struct page **pages;
+ struct folio **folios;
+ void **entries;
+ };
+ bool has_folios;
+ long nr_entries;
+};
+
+static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
+{
+ if (pofs->has_folios)
+ return pofs->folios[i];
+ return page_folio(pofs->pages[i]);
+}
+
+static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
+{
+ pofs->entries[i] = NULL;
+}
+
+static void pofs_unpin(struct pages_or_folios *pofs)
+{
+ if (pofs->has_folios)
+ unpin_folios(pofs->folios, pofs->nr_entries);
+ else
+ unpin_user_pages(pofs->pages, pofs->nr_entries);
+}
+
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
static unsigned long collect_longterm_unpinnable_folios(
- struct list_head *movable_folio_list,
- unsigned long nr_folios,
- struct folio **folios)
+ struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
{
unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = folios[i];
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
if (folio == prev_folio)
continue;
@@ -2327,16 +2364,15 @@ static unsigned long collect_longterm_unpinnable_folios(
* Returns -EAGAIN if all folios were successfully migrated or -errno for
* failure (or partial success).
*/
-static int migrate_longterm_unpinnable_folios(
- struct list_head *movable_folio_list,
- unsigned long nr_folios,
- struct folio **folios)
+static int
+migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
{
int ret;
unsigned long i;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = folios[i];
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
if (folio_is_device_coherent(folio)) {
/*
@@ -2344,7 +2380,7 @@ static int migrate_longterm_unpinnable_folios(
* convert the pin on the source folio to a normal
* reference.
*/
- folios[i] = NULL;
+ pofs_clear_entry(pofs, i);
folio_get(folio);
gup_put_folio(folio, 1, FOLL_PIN);
@@ -2363,8 +2399,8 @@ static int migrate_longterm_unpinnable_folios(
* calling folio_isolate_lru() which takes a reference so the
* folio won't be freed if it's migrating.
*/
- unpin_folio(folios[i]);
- folios[i] = NULL;
+ unpin_folio(folio);
+ pofs_clear_entry(pofs, i);
}
if (!list_empty(movable_folio_list)) {
@@ -2387,12 +2423,26 @@ static int migrate_longterm_unpinnable_folios(
return -EAGAIN;
err:
- unpin_folios(folios, nr_folios);
+ pofs_unpin(pofs);
putback_movable_pages(movable_folio_list);
return ret;
}
+static long
+check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
+{
+ LIST_HEAD(movable_folio_list);
+ unsigned long collected;
+
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
+}
+
/*
* Check whether all folios are *allowed* to be pinned indefinitely (long term).
* Rather confusingly, all folios in the range are required to be pinned via
@@ -2417,16 +2467,13 @@ err:
static long check_and_migrate_movable_folios(unsigned long nr_folios,
struct folio **folios)
{
- unsigned long collected;
- LIST_HEAD(movable_folio_list);
-
- collected = collect_longterm_unpinnable_folios(&movable_folio_list,
- nr_folios, folios);
- if (!collected)
- return 0;
+ struct pages_or_folios pofs = {
+ .folios = folios,
+ .has_folios = true,
+ .nr_entries = nr_folios,
+ };
- return migrate_longterm_unpinnable_folios(&movable_folio_list,
- nr_folios, folios);
+ return check_and_migrate_movable_pages_or_folios(&pofs);
}
/*
@@ -2436,22 +2483,13 @@ static long check_and_migrate_movable_folios(unsigned long nr_folios,
static long check_and_migrate_movable_pages(unsigned long nr_pages,
struct page **pages)
{
- struct folio **folios;
- long i, ret;
-
- folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
- if (!folios) {
- unpin_user_pages(pages, nr_pages);
- return -ENOMEM;
- }
-
- for (i = 0; i < nr_pages; i++)
- folios[i] = page_folio(pages[i]);
+ struct pages_or_folios pofs = {
+ .pages = pages,
+ .has_folios = false,
+ .nr_entries = nr_pages,
+ };
- ret = check_and_migrate_movable_folios(nr_pages, folios);
-
- kfree(folios);
- return ret;
+ return check_and_migrate_movable_pages_or_folios(&pofs);
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
@@ -3722,3 +3760,27 @@ err:
return ret;
}
EXPORT_SYMBOL_GPL(memfd_pin_folios);
+
+/**
+ * folio_add_pins() - add pins to an already-pinned folio
+ * @folio: the folio to add more pins to
+ * @pins: number of pins to add
+ *
+ * Try to add more pins to an already-pinned folio. The semantics
+ * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot
+ * be changed.
+ *
+ * This function is helpful when having obtained a pin on a large folio
+ * using memfd_pin_folios(), but wanting to logically unpin parts
+ * (e.g., individual pages) of the folio later, for example, using
+ * unpin_user_page_range_dirty_lock().
+ *
+ * This is not the right interface to initially pin a folio.
+ */
+int folio_add_pins(struct folio *folio, unsigned int pins)
+{
+ VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));
+
+ return try_grab_folio(folio, pins, FOLL_PIN);
+}
+EXPORT_SYMBOL_GPL(folio_add_pins);
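
A userspace sketch of the "pofs" idea introduced above: one struct carries either array type behind a union, a flag picks the accessor, and the shared entries view lets common code clear a slot without caring which type it holds. The struct page/struct folio definitions below are stand-ins, not the kernel's:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct page  { int id; };
struct folio { struct page page; };	/* first page of the folio */

struct pages_or_folios {
	union {
		struct page  **pages;
		struct folio **folios;
		void         **entries;
	};
	bool has_folios;
	long nr_entries;
};

static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
{
	if (pofs->has_folios)
		return pofs->folios[i];
	/* page_folio() stand-in: the page is embedded at the start of the folio */
	return (struct folio *)pofs->pages[i];
}

static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
{
	pofs->entries[i] = NULL;	/* works for both array types */
}

int main(void)
{
	struct folio f = { .page = { .id = 42 } };
	struct folio *folios[] = { &f };
	struct pages_or_folios pofs = {
		.folios = folios, .has_folios = true, .nr_entries = 1,
	};

	printf("folio id %d\n", pofs_get_folio(&pofs, 0)->page.id);
	pofs_clear_entry(&pofs, 0);
	return 0;
}
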
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1ebe18ec4560..ee335d96fc39 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3863,7 +3863,9 @@ next:
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
- if (!did_split && !folio_test_partially_mapped(folio)) {
+ if (did_split) {
+ ; /* folio already removed from list */
+ } else if (!folio_test_partially_mapped(folio)) {
list_del_init(&folio->_deferred_list);
removed++;
} else {
diff --git a/mm/ksm.c b/mm/ksm.c
index 7ac59cde626c..31a9bc365437 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2261,8 +2261,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
/* Start by searching for the folio in the stable tree */
kfolio = stable_tree_search(page);
- if (!IS_ERR_OR_NULL(kfolio) && &kfolio->page == page &&
- rmap_item->head == stable_node) {
+ if (&kfolio->page == page && rmap_item->head == stable_node) {
folio_put(kfolio);
return;
}
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 539ceefa9d2d..a071fa43d479 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -958,8 +958,6 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
struct mem_cgroup_event *event;
struct cgroup_subsys_state *cfile_css;
unsigned int efd, cfd;
- struct fd efile;
- struct fd cfile;
struct dentry *cdentry;
const char *name;
char *endp;
@@ -983,6 +981,12 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
else
return -EINVAL;
+ CLASS(fd, efile)(efd);
+ if (fd_empty(efile))
+ return -EBADF;
+
+ CLASS(fd, cfile)(cfd);
+
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
@@ -993,20 +997,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
init_waitqueue_func_entry(&event->wait, memcg_event_wake);
INIT_WORK(&event->remove, memcg_event_remove);
- efile = fdget(efd);
- if (!fd_file(efile)) {
- ret = -EBADF;
- goto out_kfree;
- }
-
event->eventfd = eventfd_ctx_fileget(fd_file(efile));
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
- goto out_put_efile;
+ goto out_kfree;
}
- cfile = fdget(cfd);
- if (!fd_file(cfile)) {
+ if (fd_empty(cfile)) {
ret = -EBADF;
goto out_put_eventfd;
}
@@ -1015,7 +1012,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
/* AV: shouldn't we check that it's been opened for read instead? */
ret = file_permission(fd_file(cfile), MAY_READ);
if (ret < 0)
- goto out_put_cfile;
+ goto out_put_eventfd;
/*
* The control file must be a regular cgroup1 file. As a regular cgroup
@@ -1024,7 +1021,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
cdentry = fd_file(cfile)->f_path.dentry;
if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
ret = -EINVAL;
- goto out_put_cfile;
+ goto out_put_eventfd;
}
/*
@@ -1057,7 +1054,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
event->unregister_event = memsw_cgroup_usage_unregister_event;
} else {
ret = -EINVAL;
- goto out_put_cfile;
+ goto out_put_eventfd;
}
/*
@@ -1069,11 +1066,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
&memory_cgrp_subsys);
ret = -EINVAL;
if (IS_ERR(cfile_css))
- goto out_put_cfile;
- if (cfile_css != css) {
- css_put(cfile_css);
- goto out_put_cfile;
- }
+ goto out_put_eventfd;
+ if (cfile_css != css)
+ goto out_put_css;
ret = event->register_event(memcg, event->eventfd, buf);
if (ret)
@@ -1084,23 +1079,14 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
spin_lock_irq(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
spin_unlock_irq(&memcg->event_list_lock);
-
- fdput(cfile);
- fdput(efile);
-
return nbytes;
out_put_css:
- css_put(css);
-out_put_cfile:
- fdput(cfile);
+ css_put(cfile_css);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
-out_put_efile:
- fdput(efile);
out_kfree:
kfree(event);
-
return ret;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index dfb5eba3c522..2ce6b4b814df 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -473,7 +473,7 @@ static int folio_expected_refs(struct address_space *mapping,
* The number of remaining references must be:
* 1 for anonymous folios without a mapping
* 2 for folios with a mapping
- * 3 for folios with a mapping and PagePrivate/PagePrivate2 set.
+ * 3 for folios with a mapping and the private flag set.
*/
static int __folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int expected_count)
@@ -788,7 +788,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
* @mode: How to migrate the page.
*
* Common logic to directly migrate a single LRU folio suitable for
- * folios that do not use PagePrivate/PagePrivate2.
+ * folios that do not have private data.
*
* Folios are locked upon entry and exit.
*/
diff --git a/mm/mremap.c b/mm/mremap.c
index 4c79ab92eb8f..60473413836b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -657,7 +657,7 @@ again:
* Prevent negative return values when {old,new}_addr was realigned
* but we broke out of the above loop for the first PMD itself.
*/
- if (len + old_addr < old_end)
+ if (old_addr < old_end - len)
return 0;
return len + old_addr - old_end; /* how much done */
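
The mremap hunk rewrites the "nothing was moved" check so it can no longer be fooled by unsigned wraparound: when the range sits near the top of the address space, len + old_addr can wrap past zero and compare as smaller than old_end even though pages were moved. A sketch with illustrative 32-bit addresses:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* made-up layout: a 2 MiB range ending near the top of a 32-bit space */
	uint32_t old_end  = 0xfff00000u;
	uint32_t len      = 0x00200000u;
	uint32_t old_addr = old_end;	/* loop finished: everything moved */

	/* old check: the sum wraps, so this is true and 0 would be returned */
	printf("old check says nothing moved: %d\n",
	       (uint32_t)(len + old_addr) < old_end);

	/* new check: the subtraction stays in range, so progress is reported */
	printf("new check says nothing moved: %d\n",
	       old_addr < (uint32_t)(old_end - len));
	return 0;
}
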
diff --git a/mm/nommu.c b/mm/nommu.c
index e9b5f527ab5b..9cb6e99215e2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -573,7 +573,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
- if (vma_iter_prealloc(&vmi, vma)) {
+ if (vma_iter_prealloc(&vmi, NULL)) {
pr_warn("Allocation of vma tree for process %d failed\n",
current->pid);
return -ENOMEM;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9c3317c3a615..fdb89ce85fff 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -54,7 +54,7 @@
#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
/*
- * Estimate write bandwidth at 200ms intervals.
+ * Estimate write bandwidth or update dirty limit at 200ms intervals.
*/
#define BANDWIDTH_INTERVAL max(HZ/5, 1)
@@ -586,7 +586,7 @@ static void wb_domain_writeout_add(struct wb_domain *dom,
/* First event after period switching was turned off? */
if (unlikely(!dom->period_time)) {
/*
- * We can race with other __bdi_writeout_inc calls here but
+ * We can race with other wb_domain_writeout_add calls here but
* it does not cause any harm since the resulting time when
* timer will fire and what is in writeout_period_time will be
* roughly the same.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 744c1a413fdb..1cb4b8c8886d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1048,6 +1048,7 @@ __always_inline bool free_pages_prepare(struct page *page,
bool skip_kasan_poison = should_skip_kasan_poison(page);
bool init = want_init_on_free();
bool compound = PageCompound(page);
+ struct folio *folio = page_folio(page);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1057,6 +1058,20 @@ __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
+ /*
+ * In rare cases, when truncation or holepunching raced with
+ * munlock after VM_LOCKED was cleared, Mlocked may still be
+ * found set here. This does not indicate a problem, unless
+ * "unevictable_pgs_cleared" appears worryingly large.
+ */
+ if (unlikely(folio_test_mlocked(folio))) {
+ long nr_pages = folio_nr_pages(folio);
+
+ __folio_clear_mlocked(folio);
+ zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
+ }
+
if (unlikely(PageHWPoison(page)) && !order) {
/* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
@@ -4592,7 +4607,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
gfp = alloc_gfp;
/* Find an allowed local zone that meets the low watermark. */
- for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+ z = ac.preferred_zoneref;
+ for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) {
unsigned long mark;
if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
@@ -4838,142 +4854,6 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
-/*
- * Page Fragment:
- * An arbitrary-length arbitrary-offset area of memory which resides
- * within a 0 or higher order page. Multiple fragments within that page
- * are individually refcounted, in the page's reference counter.
- *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments. This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
- */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
- gfp_t gfp_mask)
-{
- struct page *page = NULL;
- gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
- __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
- page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
- PAGE_FRAG_CACHE_MAX_ORDER);
- nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
- if (unlikely(!page))
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
- nc->va = page ? page_address(page) : NULL;
-
- return page;
-}
-
-void page_frag_cache_drain(struct page_frag_cache *nc)
-{
- if (!nc->va)
- return;
-
- __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
- nc->va = NULL;
-}
-EXPORT_SYMBOL(page_frag_cache_drain);
-
-void __page_frag_cache_drain(struct page *page, unsigned int count)
-{
- VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
-
- if (page_ref_sub_and_test(page, count))
- free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(__page_frag_cache_drain);
-
-void *__page_frag_alloc_align(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask,
- unsigned int align_mask)
-{
- unsigned int size = PAGE_SIZE;
- struct page *page;
- int offset;
-
- if (unlikely(!nc->va)) {
-refill:
- page = __page_frag_cache_refill(nc, gfp_mask);
- if (!page)
- return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- /* if size can vary use size else just use PAGE_SIZE */
- size = nc->size;
-#endif
- /* Even if we own the page, we do not use atomic_set().
- * This would break get_page_unless_zero() users.
- */
- page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
-
- /* reset page count bias and offset to start of new frag */
- nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
- nc->offset = size;
- }
-
- offset = nc->offset - fragsz;
- if (unlikely(offset < 0)) {
- page = virt_to_page(nc->va);
-
- if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
- goto refill;
-
- if (unlikely(nc->pfmemalloc)) {
- free_unref_page(page, compound_order(page));
- goto refill;
- }
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- /* if size can vary use size else just use PAGE_SIZE */
- size = nc->size;
-#endif
- /* OK, page count is 0, we can safely set it */
- set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
-
- /* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
- offset = size - fragsz;
- if (unlikely(offset < 0)) {
- /*
- * The caller is trying to allocate a fragment
- * with fragsz > PAGE_SIZE but the cache isn't big
- * enough to satisfy the request, this may
- * happen in low memory conditions.
- * We don't release the cache page because
- * it could make memory pressure worse
- * so we simply return NULL here.
- */
- return NULL;
- }
- }
-
- nc->pagecnt_bias--;
- offset &= align_mask;
- nc->offset = offset;
-
- return nc->va + offset;
-}
-EXPORT_SYMBOL(__page_frag_alloc_align);
-
-/*
- * Frees a page fragment allocated out of either a compound or order 0 page.
- */
-void page_frag_free(void *addr)
-{
- struct page *page = virt_to_head_page(addr);
-
- if (unlikely(put_page_testzero(page)))
- free_unref_page(page, compound_order(page));
-}
-EXPORT_SYMBOL(page_frag_free);
-
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
new file mode 100644
index 000000000000..3f7a203d35c6
--- /dev/null
+++ b/mm/page_frag_cache.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Page fragment allocator
+ *
+ * Page Fragment:
+ * An arbitrary-length arbitrary-offset area of memory which resides within a
+ * 0 or higher order page. Multiple fragments within that page are
+ * individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions provide a simple allocation framework for page
+ * fragments. This is used by the network stack and network device drivers to
+ * provide a backing region of memory for use as either an sk_buff->head, or to
+ * be used in the "frags" portion of skb_shared_info.
+ */
+
+#include <linux/build_bug.h>
+#include <linux/export.h>
+#include <linux/gfp_types.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/page_frag_cache.h>
+#include "internal.h"
+
+static unsigned long encoded_page_create(struct page *page, unsigned int order,
+ bool pfmemalloc)
+{
+ BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);
+ BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_BIT >= PAGE_SIZE);
+
+ return (unsigned long)page_address(page) |
+ (order & PAGE_FRAG_CACHE_ORDER_MASK) |
+ ((unsigned long)pfmemalloc * PAGE_FRAG_CACHE_PFMEMALLOC_BIT);
+}
+
+static unsigned long encoded_page_decode_order(unsigned long encoded_page)
+{
+ return encoded_page & PAGE_FRAG_CACHE_ORDER_MASK;
+}
+
+static void *encoded_page_decode_virt(unsigned long encoded_page)
+{
+ return (void *)(encoded_page & PAGE_MASK);
+}
+
+static struct page *encoded_page_decode_page(unsigned long encoded_page)
+{
+ return virt_to_page((void *)encoded_page);
+}
+
+static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+ gfp_t gfp_mask)
+{
+ unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
+ struct page *page = NULL;
+ gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
+ __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
+ page = __alloc_pages(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER,
+ numa_mem_id(), NULL);
+#endif
+ if (unlikely(!page)) {
+ page = __alloc_pages(gfp, 0, numa_mem_id(), NULL);
+ order = 0;
+ }
+
+ nc->encoded_page = page ?
+ encoded_page_create(page, order, page_is_pfmemalloc(page)) : 0;
+
+ return page;
+}
+
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+ if (!nc->encoded_page)
+ return;
+
+ __page_frag_cache_drain(encoded_page_decode_page(nc->encoded_page),
+ nc->pagecnt_bias);
+ nc->encoded_page = 0;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
+void __page_frag_cache_drain(struct page *page, unsigned int count)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+ if (page_ref_sub_and_test(page, count))
+ free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(__page_frag_cache_drain);
+
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask,
+ unsigned int align_mask)
+{
+ unsigned long encoded_page = nc->encoded_page;
+ unsigned int size, offset;
+ struct page *page;
+
+ if (unlikely(!encoded_page)) {
+refill:
+ page = __page_frag_cache_refill(nc, gfp_mask);
+ if (!page)
+ return NULL;
+
+ encoded_page = nc->encoded_page;
+
+ /* Even if we own the page, we do not use atomic_set().
+ * This would break get_page_unless_zero() users.
+ */
+ page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+ nc->offset = 0;
+ }
+
+ size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
+ offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
+ if (unlikely(offset + fragsz > size)) {
+ if (unlikely(fragsz > PAGE_SIZE)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request, this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
+
+ page = encoded_page_decode_page(encoded_page);
+
+ if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
+ goto refill;
+
+ if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) {
+ free_unref_page(page,
+ encoded_page_decode_order(encoded_page));
+ goto refill;
+ }
+
+ /* OK, page count is 0, we can safely set it */
+ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+ offset = 0;
+ }
+
+ nc->pagecnt_bias--;
+ nc->offset = offset + fragsz;
+
+ return encoded_page_decode_virt(encoded_page) + offset;
+}
+EXPORT_SYMBOL(__page_frag_alloc_align);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void page_frag_free(void *addr)
+{
+ struct page *page = virt_to_head_page(addr);
+
+ if (unlikely(put_page_testzero(page)))
+ free_unref_page(page, compound_order(page));
+}
+EXPORT_SYMBOL(page_frag_free);
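
The new file keeps the cache's page address, allocation order and pfmemalloc flag in a single unsigned long, relying on the fact that a page-aligned address leaves its low bits free. A userspace sketch of the same encode/decode trick; the 4 KiB page size and bit layout below are assumptions for illustration, not the kernel's PAGE_FRAG_CACHE_* constants:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define ORDER_MASK	0x7UL		/* low 3 bits carry the order */
#define PFMEMALLOC_BIT	0x8UL		/* next bit carries the flag */

static unsigned long encode_page(void *va, unsigned int order, bool pfmemalloc)
{
	return (unsigned long)va | (order & ORDER_MASK) |
	       (pfmemalloc ? PFMEMALLOC_BIT : 0);
}

int main(void)
{
	void *va = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
	unsigned long encoded;

	if (!va)
		return 1;

	encoded = encode_page(va, 3, true);
	printf("va %p, order %lu, pfmemalloc %d\n",
	       (void *)(encoded & PAGE_MASK),
	       encoded & ORDER_MASK,
	       (encoded & PFMEMALLOC_BIT) != 0);
	free(va);
	return 0;
}
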
diff --git a/mm/readahead.c b/mm/readahead.c
index 475d2940a1ed..8f1cf599b572 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -678,29 +678,22 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
- ssize_t ret;
- struct fd f;
+ CLASS(fd, f)(fd);
- ret = -EBADF;
- f = fdget(fd);
- if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
- goto out;
+ if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ return -EBADF;
/*
* The readahead() syscall is intended to run only on files
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- ret = -EINVAL;
if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
(!S_ISREG(file_inode(fd_file(f))->i_mode) &&
!S_ISBLK(file_inode(fd_file(f))->i_mode)))
- goto out;
+ return -EINVAL;
- ret = vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
-out:
- fdput(f);
- return ret;
+ return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
}
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
diff --git a/mm/shmem.c b/mm/shmem.c
index 579e58cb3262..ccb9629a0f70 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -40,6 +40,7 @@
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
+#include <linux/unicode.h>
#include "swap.h"
static struct vfsmount *shm_mnt __ro_after_init;
@@ -123,6 +124,10 @@ struct shmem_options {
bool noswap;
unsigned short quota_types;
struct shmem_quota_limits qlimits;
+#if IS_ENABLED(CONFIG_UNICODE)
+ struct unicode_map *encoding;
+ bool strict_encoding;
+#endif
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
@@ -1169,9 +1174,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat);
- inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, 0))
stat->blksize = HPAGE_PMD_SIZE;
@@ -2771,13 +2774,62 @@ static int shmem_file_open(struct inode *inode, struct file *file)
#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+#if IS_ENABLED(CONFIG_UNICODE)
+/*
+ * shmem_inode_casefold_flags - Deal with casefold file attribute flag
+ *
+ * The casefold file attribute needs some special checks. It can only be added to
+ * an empty dir, and can't be removed from a non-empty dir.
+ */
+static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
+ struct dentry *dentry, unsigned int *i_flags)
+{
+ unsigned int old = inode->i_flags;
+ struct super_block *sb = inode->i_sb;
+
+ if (fsflags & FS_CASEFOLD_FL) {
+ if (!(old & S_CASEFOLD)) {
+ if (!sb->s_encoding)
+ return -EOPNOTSUPP;
+
+ if (!S_ISDIR(inode->i_mode))
+ return -ENOTDIR;
+
+ if (dentry && !simple_empty(dentry))
+ return -ENOTEMPTY;
+ }
+
+ *i_flags = *i_flags | S_CASEFOLD;
+ } else if (old & S_CASEFOLD) {
+ if (dentry && !simple_empty(dentry))
+ return -ENOTEMPTY;
+ }
+
+ return 0;
+}
+#else
+static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
+ struct dentry *dentry, unsigned int *i_flags)
+{
+ if (fsflags & FS_CASEFOLD_FL)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+#endif
+
/*
* chattr's fsflags are unrelated to extended attributes,
* but tmpfs has chosen to enable them under the same config option.
*/
-static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
unsigned int i_flags = 0;
+ int ret;
+
+ ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
+ if (ret)
+ return ret;
if (fsflags & FS_NOATIME_FL)
i_flags |= S_NOATIME;
@@ -2788,10 +2840,12 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
/*
* But FS_NODUMP_FL does not require any action in i_flags.
*/
- inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
+ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);
+
+ return 0;
}
#else
-static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
}
#define shmem_initxattrs NULL
@@ -2838,7 +2892,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
info->fsflags = (dir == NULL) ? 0 :
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
if (info->fsflags)
- shmem_set_inode_flags(inode, info->fsflags);
+ shmem_set_inode_flags(inode, info->fsflags, NULL);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
@@ -3610,6 +3664,9 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct inode *inode;
int error;
+ if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
+ return -EINVAL;
+
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -3629,7 +3686,12 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
dir->i_size += BOGO_DIRENT_SIZE;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
- d_instantiate(dentry, inode);
+
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ d_add(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
+
dget(dentry); /* Extra count - pin the dentry in core */
return error;
@@ -3720,7 +3782,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
- d_instantiate(dentry, inode);
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ d_add(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
out:
return ret;
}
@@ -3740,6 +3805,14 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - does all the work */
+
+ /*
+ * For now, VFS can't deal with case-insensitive negative dentries, so
+ * we invalidate them
+ */
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ d_invalidate(dentry);
+
return 0;
}
@@ -3884,7 +3957,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
dir->i_size += BOGO_DIRENT_SIZE;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
- d_instantiate(dentry, inode);
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ d_add(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
dget(dentry);
return 0;
@@ -3949,16 +4025,23 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
+ int ret, flags;
if (fileattr_has_fsx(fa))
return -EOPNOTSUPP;
if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
return -EOPNOTSUPP;
- info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
+ flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
(fa->flags & SHMEM_FL_USER_MODIFIABLE);
- shmem_set_inode_flags(inode, info->fsflags);
+ ret = shmem_set_inode_flags(inode, flags, dentry);
+
+ if (ret)
+ return ret;
+
+ info->fsflags = flags;
+
inode_set_ctime_current(inode);
inode_inc_iversion(inode);
return 0;
@@ -4237,6 +4320,9 @@ enum shmem_param {
Opt_usrquota_inode_hardlimit,
Opt_grpquota_block_hardlimit,
Opt_grpquota_inode_hardlimit,
+ Opt_casefold_version,
+ Opt_casefold,
+ Opt_strict_encoding,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -4268,9 +4354,54 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
#endif
+ fsparam_string("casefold", Opt_casefold_version),
+ fsparam_flag ("casefold", Opt_casefold),
+ fsparam_flag ("strict_encoding", Opt_strict_encoding),
{}
};
+#if IS_ENABLED(CONFIG_UNICODE)
+static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
+ bool latest_version)
+{
+ struct shmem_options *ctx = fc->fs_private;
+ unsigned int version = UTF8_LATEST;
+ struct unicode_map *encoding;
+ char *version_str = param->string + 5;
+
+ if (!latest_version) {
+ if (strncmp(param->string, "utf8-", 5))
+ return invalfc(fc, "Only UTF-8 encodings are supported "
+ "in the format: utf8-<version number>");
+
+ version = utf8_parse_version(version_str);
+ if (version < 0)
+ return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
+ }
+
+ encoding = utf8_load(version);
+
+ if (IS_ERR(encoding)) {
+ return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n",
+ unicode_major(version), unicode_minor(version),
+ unicode_rev(version));
+ }
+
+ pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n",
+ unicode_major(version), unicode_minor(version), unicode_rev(version));
+
+ ctx->encoding = encoding;
+
+ return 0;
+}
+#else
+static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
+ bool latest_version)
+{
+ return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
+}
+#endif
+
static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
struct shmem_options *ctx = fc->fs_private;
@@ -4429,6 +4560,17 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
"Group quota inode hardlimit too large.");
ctx->qlimits.grpquota_ihardlimit = size;
break;
+ case Opt_casefold_version:
+ return shmem_parse_opt_casefold(fc, param, false);
+ case Opt_casefold:
+ return shmem_parse_opt_casefold(fc, param, true);
+ case Opt_strict_encoding:
+#if IS_ENABLED(CONFIG_UNICODE)
+ ctx->strict_encoding = true;
+ break;
+#else
+ return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
+#endif
}
return 0;
@@ -4658,6 +4800,11 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (sb->s_encoding)
+ utf8_unload(sb->s_encoding);
+#endif
+
#ifdef CONFIG_TMPFS_QUOTA
shmem_disable_quotas(sb);
#endif
@@ -4668,6 +4815,14 @@ static void shmem_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS)
+static const struct dentry_operations shmem_ci_dentry_ops = {
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
+ .d_delete = always_delete_dentry,
+};
+#endif
+
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct shmem_options *ctx = fc->fs_private;
@@ -4702,9 +4857,25 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
}
sb->s_export_op = &shmem_export_ops;
sb->s_flags |= SB_NOSEC | SB_I_VERSION;
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (!ctx->encoding && ctx->strict_encoding) {
+ pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
+ error = -EINVAL;
+ goto failed;
+ }
+
+ if (ctx->encoding) {
+ sb->s_encoding = ctx->encoding;
+ sb->s_d_op = &shmem_ci_dentry_ops;
+ if (ctx->strict_encoding)
+ sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
+ }
+#endif
+
#else
sb->s_flags |= SB_NOUSER;
-#endif
+#endif /* CONFIG_TMPFS */
sbinfo->max_blocks = ctx->blocks;
sbinfo->max_inodes = ctx->inodes;
sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
@@ -4978,6 +5149,10 @@ int shmem_init_fs_context(struct fs_context *fc)
ctx->uid = current_fsuid();
ctx->gid = current_fsgid();
+#if IS_ENABLED(CONFIG_UNICODE)
+ ctx->encoding = NULL;
+#endif
+
fc->fs_private = ctx;
fc->ops = &shmem_fs_context_ops;
return 0;
@@ -4991,9 +5166,69 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
+};
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
+{ \
+ .attr = { .name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+#define TMPFS_ATTR_W(_name, _store) \
+ static struct kobj_attribute tmpfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
+
+#define TMPFS_ATTR_RW(_name, _show, _store) \
+ static struct kobj_attribute tmpfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define TMPFS_ATTR_RO(_name, _show) \
+ static struct kobj_attribute tmpfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a,
+ char *buf)
+{
+ return sysfs_emit(buf, "supported\n");
+}
+TMPFS_ATTR_RO(casefold, casefold_show);
+#endif
+
+static struct attribute *tmpfs_attributes[] = {
+#if IS_ENABLED(CONFIG_UNICODE)
+ &tmpfs_attr_casefold.attr,
+#endif
+ NULL
};
+static const struct attribute_group tmpfs_attribute_group = {
+ .attrs = tmpfs_attributes,
+ .name = "features"
+};
+
+static struct kobject *tmpfs_kobj;
+
+static int __init tmpfs_sysfs_init(void)
+{
+ int ret;
+
+ tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj);
+ if (!tmpfs_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group);
+ if (ret)
+ kobject_put(tmpfs_kobj);
+
+ return ret;
+}
+#endif /* CONFIG_SYSFS && CONFIG_TMPFS */
+
void __init shmem_init(void)
{
int error;
@@ -5017,6 +5252,14 @@ void __init shmem_init(void)
goto out1;
}
+#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
+ error = tmpfs_sysfs_init();
+ if (error) {
+ pr_err("Could not init tmpfs sysfs\n");
+ goto out1;
+ }
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
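
The shmem_parse_opt_casefold() hunk accepts either a bare casefold flag (meaning the latest UTF-8 tables) or an explicit casefold=utf8-<version> string. A toy userspace parser sketching the same option shape; it does not use the kernel's utf8_parse_version()/utf8_load():

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static int parse_casefold(const char *arg, bool latest_version)
{
	if (latest_version) {
		printf("casefold: using latest UTF-8 tables\n");
		return 0;
	}
	if (strncmp(arg, "utf8-", 5) != 0) {
		fprintf(stderr, "only utf8-<version> encodings are supported\n");
		return -1;
	}
	printf("casefold: loading UTF-8 tables for version %s\n", arg + 5);
	return 0;
}

int main(void)
{
	parse_casefold(NULL, true);		/* bare "casefold" flag form */
	parse_casefold("utf8-12.1.0", false);	/* "casefold=utf8-12.1.0" form */
	return parse_casefold("latin1", false) ? 1 : 0;
}
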
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 552b92dfdac7..a7174455db9f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -380,8 +380,11 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
unsigned int usersize,
void (*ctor)(void *))
{
+ unsigned long mask = 0;
+ unsigned int idx;
kmem_buckets *b;
- int idx;
+
+ BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG);
/*
* When the separate buckets API is not built in, just return
@@ -403,7 +406,7 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) {
char *short_size, *cache_name;
unsigned int cache_useroffset, cache_usersize;
- unsigned int size;
+ unsigned int size, aligned_idx;
if (!kmalloc_caches[KMALLOC_NORMAL][idx])
continue;
@@ -416,10 +419,6 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
if (WARN_ON(!short_size))
goto fail;
- cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
- if (WARN_ON(!cache_name))
- goto fail;
-
if (useroffset >= size) {
cache_useroffset = 0;
cache_usersize = 0;
@@ -427,18 +426,28 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
cache_useroffset = useroffset;
cache_usersize = min(size - cache_useroffset, usersize);
}
- (*b)[idx] = kmem_cache_create_usercopy(cache_name, size,
+
+ aligned_idx = __kmalloc_index(size, false);
+ if (!(*b)[aligned_idx]) {
+ cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1);
+ if (WARN_ON(!cache_name))
+ goto fail;
+ (*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size,
0, flags, cache_useroffset,
cache_usersize, ctor);
- kfree(cache_name);
- if (WARN_ON(!(*b)[idx]))
- goto fail;
+ kfree(cache_name);
+ if (WARN_ON(!(*b)[aligned_idx]))
+ goto fail;
+ set_bit(aligned_idx, &mask);
+ }
+ if (idx != aligned_idx)
+ (*b)[idx] = (*b)[aligned_idx];
}
return b;
fail:
- for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++)
+ for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]))
kmem_cache_destroy((*b)[idx]);
kmem_cache_free(kmem_buckets_cache, b);
@@ -1323,6 +1332,25 @@ size_t ksize(const void *objp)
}
EXPORT_SYMBOL(ksize);
+#ifdef CONFIG_BPF_SYSCALL
+#include <linux/btf.h>
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc struct kmem_cache *bpf_get_kmem_cache(u64 addr)
+{
+ struct slab *slab;
+
+ if (!virt_addr_valid((void *)(long)addr))
+ return NULL;
+
+ slab = virt_to_slab((void *)(long)addr);
+ return slab ? slab->slab_cache : NULL;
+}
+
+__bpf_kfunc_end_defs();
+#endif /* CONFIG_BPF_SYSCALL */
+
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
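
The kmem_buckets_create() hunk now creates a cache only for the first bucket index that maps to a given size, aliases the rest, and records what it actually created in a bitmask so the failure path destroys only owned entries. A userspace sketch of that create/alias/cleanup bookkeeping; the helpers here are stand-ins, not slab API:

#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 8

int main(void)
{
	void *slot[NSLOTS] = { NULL };
	unsigned long mask = 0;

	for (int i = 0; i < NSLOTS; i++) {
		int aligned = i & ~1;			/* pretend two sizes share a cache */

		if (!slot[aligned]) {
			slot[aligned] = malloc(16);	/* "create" the cache */
			if (!slot[aligned])
				goto out;
			mask |= 1UL << aligned;		/* remember we own this one */
		}
		if (i != aligned)
			slot[i] = slot[aligned];	/* alias, not owned */
	}

	printf("created %d caches for %d slots\n",
	       __builtin_popcountl(mask), NSLOTS);
out:
	/* tear down only the slots the mask says we created */
	for (int i = 0; i < NSLOTS; i++)
		if (mask & (1UL << i))
			free(slot[i]);
	return 0;
}
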
diff --git a/mm/swap.c b/mm/swap.c
index 638a3f001676..10decd9dffa1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,20 +78,6 @@ static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
lruvec_del_folio(*lruvecp, folio);
__folio_clear_lru_flags(folio);
}
-
- /*
- * In rare cases, when truncation or holepunching raced with
- * munlock after VM_LOCKED was cleared, Mlocked may still be
- * found set here. This does not indicate a problem, unless
- * "unevictable_pgs_cleared" appears worryingly large.
- */
- if (unlikely(folio_test_mlocked(folio))) {
- long nr_pages = folio_nr_pages(folio);
-
- __folio_clear_mlocked(folio);
- zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
- count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
- }
}
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 46bd4b1a3c07..b0a9071cfe1d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -664,12 +664,15 @@ static bool cluster_scan_range(struct swap_info_struct *si,
return true;
}
-static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
unsigned int start, unsigned char usage,
unsigned int order)
{
unsigned int nr_pages = 1 << order;
+ if (!(si->flags & SWP_WRITEOK))
+ return false;
+
if (cluster_is_free(ci)) {
if (nr_pages < SWAPFILE_CLUSTER) {
list_move_tail(&ci->list, &si->nonfull_clusters[order]);
@@ -690,6 +693,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
list_move_tail(&ci->list, &si->full_clusters);
ci->flags = CLUSTER_FLAG_FULL;
}
+
+ return true;
}
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
@@ -713,7 +718,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
while (offset <= end) {
if (cluster_scan_range(si, ci, offset, nr_pages)) {
- cluster_alloc_range(si, ci, offset, usage, order);
+ if (!cluster_alloc_range(si, ci, offset, usage, order)) {
+ offset = SWAP_NEXT_INVALID;
+ goto done;
+ }
*foundp = offset;
if (ci->count == SWAPFILE_CLUSTER) {
offset = SWAP_NEXT_INVALID;
@@ -805,7 +813,11 @@ new_cluster:
if (!list_empty(&si->free_clusters)) {
ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
- VM_BUG_ON(!found);
+ /*
+ * Either we didn't touch the cluster due to swapoff,
+ * or the allocation must have succeeded.
+ */
+ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
goto done;
}
@@ -929,7 +941,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
si->highest_bit = 0;
del_from_avail_list(si);
- if (vm_swap_full())
+ if (si->cluster_info && vm_swap_full())
schedule_work(&si->reclaim_work);
}
}
@@ -1041,6 +1053,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
VM_BUG_ON(!si->cluster_info);
+ si->flags += SWP_SCANNING;
+
while (n_ret < nr) {
unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
@@ -1049,6 +1063,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
slots[n_ret++] = swp_entry(si->type, offset);
}
+ si->flags -= SWP_SCANNING;
+
return n_ret;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index e5151703ba04..7c304d2f0052 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -151,7 +151,6 @@ static void truncate_cleanup_folio(struct folio *folio)
* Hence dirty accounting check is placed after invalidation.
*/
folio_cancel_dirty(folio);
- folio_clear_mappedtodisk(folio);
}
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
@@ -786,6 +785,21 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
*/
if (folio_mkclean(folio))
folio_mark_dirty(folio);
+
+ /*
+ * The post-eof range of the folio must be zeroed before it is exposed
+ * to the file. Writeback normally does this, but since i_size has been
+ * increased we handle it here.
+ */
+ if (folio_test_dirty(folio)) {
+ unsigned int offset, end;
+
+ offset = from - folio_pos(folio);
+ end = min_t(unsigned int, to - folio_pos(folio),
+ folio_size(folio));
+ folio_zero_segment(folio, offset, end);
+ }
+
folio_unlock(folio);
folio_put(folio);
}
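
The pagecache_isize_extended() hunk zeroes the bytes between the old and new i_size, but only within the folio that straddles the old EOF; the offsets passed to folio_zero_segment() are folio-relative. A worked example with made-up sizes:

#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	uint64_t folio_pos  = 64 * 1024;	/* folio covers [64 KiB, 80 KiB) */
	uint64_t folio_size = 16 * 1024;
	uint64_t from = 70 * 1024;		/* old i_size, inside this folio */
	uint64_t to   = 200 * 1024;		/* new i_size, beyond it */

	uint64_t offset = from - folio_pos;
	uint64_t end = min_u64(to - folio_pos, folio_size);

	/* zero [6 KiB, 16 KiB) within the folio, i.e. up to its last byte */
	printf("zero folio bytes [%llu, %llu)\n",
	       (unsigned long long)offset, (unsigned long long)end);
	return 0;
}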