author    Pedro Demarchi Gomes <pedrodemargomes@gmail.com>    2025-11-05 15:49:10 -0300
committer Andrew Morton <akpm@linux-foundation.org>           2025-11-16 17:28:28 -0800
commit    912aa825957f556a29d781c8f4cb4f4dfd938a9d (patch)
tree      ef7c662fc7128ebcd854bc82e8f52ad21f397f22
parent    ed1f8855dd7b82a0ad87960b1729a3e848dc5589 (diff)
Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk"
Patch series "ksm: perform a range-walk to jump over holes in break_ksm", v4.

When unmerging an address range, the unmerge_ksm_pages() function walks every
page address in the specified range to locate KSM pages.  This becomes highly
inefficient when scanning large virtual memory areas that contain mostly
unmapped regions, causing the process to get blocked for several minutes.

This patch makes break_ksm(), the function called by unmerge_ksm_pages() for
every page in an address range, perform a range walk, allowing it to skip over
entire unmapped holes in a VMA and avoid unnecessary lookups.

As pointed out by David Hildenbrand in [1], unmerge_ksm_pages() is called from:

* ksm_madvise() through madvise(MADV_UNMERGEABLE).  There are not a lot of
  users of that function.

* __ksm_del_vma() through ksm_del_vmas().  Effectively called when disabling
  KSM for a process, either through the sysctl or from s390x gmap code when
  enabling storage keys for a VM.

Consider the following test program, which creates a 32 TiB mapping in the
virtual address space but only populates a single page:

	#include <unistd.h>
	#include <stdio.h>
	#include <sys/mman.h>

	/* 32 TiB */
	const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;

	int main()
	{
		char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
		if (area == MAP_FAILED) {
			perror("mmap() failed\n");
			return -1;
		}

		/* Populate a single page such that we get an anon_vma. */
		*area = 0;

		/* Enable KSM. */
		madvise(area, size, MADV_MERGEABLE);
		madvise(area, size, MADV_UNMERGEABLE);
		return 0;
	}

Without this patch, this program takes 9 minutes to finish, while with this
patch it finishes in less than 5 seconds.

This patch (of 3):

This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes the
function break_ksm_pmd_entry() to use folios.

This reverts break_ksm() to use walk_page_range_vma() instead of
folio_walk_start().  Change break_ksm_pmd_entry() to call is_ksm_zero_pte()
only if we know the folio is present, and also rename the variable ret to
found.  This will make it easier to later modify break_ksm() to perform a
proper range walk.

Link: https://lkml.kernel.org/r/20251105184912.186329-1-pedrodemargomes@gmail.com
Link: https://lkml.kernel.org/r/20251105184912.186329-2-pedrodemargomes@gmail.com
Link: https://lore.kernel.org/linux-mm/e0886fdf-d198-4130-bd9a-be276c59da37@redhat.com/ [1]
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
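For context (not part of the commit message or the diff below), the per-page
loop that the series optimizes looks roughly like the following in current
kernels; this is a paraphrased sketch, so names and details should be checked
against mm/ksm.c:

	/*
	 * Rough sketch of the caller this series optimizes (paraphrased from
	 * mm/ksm.c; not part of this patch).  break_ksm() is invoked once per
	 * page address, which is why a huge, mostly-unmapped VMA takes minutes
	 * to unmerge before the range-walk conversion.
	 */
	static int unmerge_ksm_pages(struct vm_area_struct *vma,
				     unsigned long start, unsigned long end,
				     bool lock_vma)
	{
		unsigned long addr;
		int err = 0;

		for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
			if (ksm_test_exit(vma->vm_mm))
				break;
			if (signal_pending(current))
				err = -ERESTARTSYS;
			else
				err = break_ksm(vma, addr, lock_vma);
		}
		return err;
	}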
-rw-r--r--  mm/ksm.c  64
1 file changed, 48 insertions, 16 deletions
diff --git a/mm/ksm.c b/mm/ksm.c
index 4f672f4f2140..9f74baf01e46 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -607,6 +607,48 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+ struct mm_walk *walk)
+{
+ struct folio *folio = NULL;
+ spinlock_t *ptl;
+ pte_t *pte;
+ pte_t ptent;
+ int found;
+
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ if (!pte)
+ return 0;
+ ptent = ptep_get(pte);
+ if (pte_present(ptent)) {
+ folio = vm_normal_folio(walk->vma, addr, ptent);
+ } else if (!pte_none(ptent)) {
+ swp_entry_t entry = pte_to_swp_entry(ptent);
+
+ /*
+ * As KSM pages remain KSM pages until freed, no need to wait
+ * here for migration to end.
+ */
+ if (is_migration_entry(entry))
+ folio = pfn_swap_entry_folio(entry);
+ }
+ /* return 1 if the page is a normal ksm page or KSM-placed zero page */
+ found = (folio && folio_test_ksm(folio)) ||
+ (pte_present(ptent) && is_ksm_zero_pte(ptent));
+ pte_unmap_unlock(pte, ptl);
+ return found;
+}
+
+static const struct mm_walk_ops break_ksm_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+ .pmd_entry = break_ksm_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
/*
* We use break_ksm to break COW on a ksm page by triggering unsharing,
* such that the ksm page will get replaced by an exclusive anonymous page.
@@ -623,26 +665,16 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
vm_fault_t ret = 0;
-
- if (lock_vma)
- vma_start_write(vma);
+ const struct mm_walk_ops *ops = lock_vma ?
+ &break_ksm_lock_vma_ops : &break_ksm_ops;
do {
- bool ksm_page = false;
- struct folio_walk fw;
- struct folio *folio;
+ int ksm_page;
cond_resched();
- folio = folio_walk_start(&fw, vma, addr,
- FW_MIGRATION | FW_ZEROPAGE);
- if (folio) {
- /* Small folio implies FW_LEVEL_PTE. */
- if (!folio_test_large(folio) &&
- (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte)))
- ksm_page = true;
- folio_walk_end(&fw, vma);
- }
-
+ ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
+ if (WARN_ON_ONCE(ksm_page < 0))
+ return ksm_page;
if (!ksm_page)
return 0;
ret = handle_mm_fault(vma, addr,