summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-08-03 16:23:09 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-08-03 16:23:09 -0700
commite991acf1bce7a428794514cbbe216973c9c0a3c8 (patch)
tree0f2bdee8143718218252d8f5fd5373057e2ab8d9 /kernel
parent3c4a063b1f8ab71352df1421d9668521acb63cd9 (diff)
parent085dece6cc88b5c6fc6f2eca0403bfd2c5fbc7cb (diff)
Merge tag 'mm-nonmm-stable-2025-08-03-12-47' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull non-MM updates from Andrew Morton: "Significant patch series in this pull request: - "squashfs: Remove page->mapping references" (Matthew Wilcox) gets us closer to being able to remove page->mapping - "relayfs: misc changes" (Jason Xing) does some maintenance and minor feature addition work in relayfs - "kdump: crashkernel reservation from CMA" (Jiri Bohac) switches us from static preallocation of the kdump crashkernel's working memory over to dynamic allocation. So the difficulty of a-priori estimation of the second kernel's needs is removed and the first kernel obtains extra memory - "generalize panic_print's dump function to be used by other kernel parts" (Feng Tang) implements some consolidation and rationalization of the various ways in which a failing kernel splats information at the operator * tag 'mm-nonmm-stable-2025-08-03-12-47' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (80 commits) tools/getdelays: add backward compatibility for taskstats version kho: add test for kexec handover delaytop: enhance error logging and add PSI feature description samples: Kconfig: fix spelling mistake "instancess" -> "instances" fat: fix too many log in fat_chain_add() scripts/spelling.txt: add notifer||notifier to spelling.txt xen/xenbus: fix typo "notifer" net: mvneta: fix typo "notifer" drm/xe: fix typo "notifer" cxl: mce: fix typo "notifer" KVM: x86: fix typo "notifer" MAINTAINERS: add maintainers for delaytop ucount: use atomic_long_try_cmpxchg() in atomic_long_inc_below() ucount: fix atomic_long_inc_below() argument type kexec: enable CMA based contiguous allocation stackdepot: make max number of pools boot-time configurable lib/xxhash: remove unused functions init/Kconfig: restore CONFIG_BROKEN help text lib/raid6: update recov_rvv.c zero page usage docs: update docs after introducing delaytop ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/crash_core.c15
-rw-r--r--kernel/crash_reserve.c68
-rw-r--r--kernel/events/uprobes.c4
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/fork.c95
-rw-r--r--kernel/hung_task.c29
-rw-r--r--kernel/kcov.c2
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kexec_core.c100
-rw-r--r--kernel/kexec_file.c51
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/kthread.c11
-rw-r--r--kernel/locking/rwsem.c31
-rw-r--r--kernel/panic.c71
-rw-r--r--kernel/relay.c69
-rw-r--r--kernel/trace/blktrace.c22
-rw-r--r--kernel/ucount.c16
17 files changed, 424 insertions, 171 deletions
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 335b8425dd4b..a4ef79591eb2 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <linux/btf.h>
#include <linux/objtool.h>
+#include <linux/delay.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -33,6 +34,11 @@
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
+/* time to wait for possible DMA to finish before starting the kdump kernel
+ * when a CMA reservation is used
+ */
+#define CMA_DMA_TIMEOUT_SEC 10
+
#ifdef CONFIG_CRASH_DUMP
int kimage_crash_copy_vmcoreinfo(struct kimage *image)
@@ -97,6 +103,14 @@ int kexec_crash_loaded(void)
}
EXPORT_SYMBOL_GPL(kexec_crash_loaded);
+static void crash_cma_clear_pending_dma(void)
+{
+ if (!crashk_cma_cnt)
+ return;
+
+ mdelay(CMA_DMA_TIMEOUT_SEC * 1000);
+}
+
/*
* No panic_cpu check version of crash_kexec(). This function is called
* only when panic_cpu holds the current CPU number; this is the only CPU
@@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)
crash_setup_regs(&fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&fixed_regs);
+ crash_cma_clear_pending_dma();
machine_kexec(kexec_crash_image);
}
kexec_unlock();
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index acb6bf42e30d..87bf4d41eabb 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -14,6 +14,8 @@
#include <linux/cpuhotplug.h>
#include <linux/memblock.h>
#include <linux/kmemleak.h>
+#include <linux/cma.h>
+#include <linux/crash_reserve.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -172,17 +174,19 @@ static int __init parse_crashkernel_simple(char *cmdline,
#define SUFFIX_HIGH 0
#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
+#define SUFFIX_CMA 2
+#define SUFFIX_NULL 3
static __initdata char *suffix_tbl[] = {
[SUFFIX_HIGH] = ",high",
[SUFFIX_LOW] = ",low",
+ [SUFFIX_CMA] = ",cma",
[SUFFIX_NULL] = NULL,
};
/*
* That function parses "suffix" crashkernel command lines like
*
- * crashkernel=size,[high|low]
+ * crashkernel=size,[high|low|cma]
*
* It returns 0 on success and -EINVAL on failure.
*/
@@ -298,9 +302,11 @@ int __init parse_crashkernel(char *cmdline,
unsigned long long *crash_size,
unsigned long long *crash_base,
unsigned long long *low_size,
+ unsigned long long *cma_size,
bool *high)
{
int ret;
+ unsigned long long __always_unused cma_base;
/* crashkernel=X[@offset] */
ret = __parse_crashkernel(cmdline, system_ram, crash_size,
@@ -331,6 +337,14 @@ int __init parse_crashkernel(char *cmdline,
*high = true;
}
+
+ /*
+ * optional CMA reservation
+ * cma_base is ignored
+ */
+ if (cma_size)
+ __parse_crashkernel(cmdline, 0, cma_size,
+ &cma_base, suffix_tbl[SUFFIX_CMA]);
#endif
if (!*crash_size)
ret = -EINVAL;
@@ -457,6 +471,56 @@ retry:
#endif
}
+struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX];
+#ifdef CRASHKERNEL_CMA
+int crashk_cma_cnt;
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ unsigned long long request_size = roundup(cma_size, PAGE_SIZE);
+ unsigned long long reserved_size = 0;
+
+ if (!cma_size)
+ return;
+
+ while (cma_size > reserved_size &&
+ crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) {
+
+ struct cma *res;
+
+ if (cma_declare_contiguous(0, request_size, 0, 0, 0, false,
+ "crashkernel", &res)) {
+ /* reservation failed, try half-sized blocks */
+ if (request_size <= PAGE_SIZE)
+ break;
+
+ request_size = roundup(request_size / 2, PAGE_SIZE);
+ continue;
+ }
+
+ crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res);
+ crashk_cma_ranges[crashk_cma_cnt].end =
+ crashk_cma_ranges[crashk_cma_cnt].start +
+ cma_get_size(res) - 1;
+ ++crashk_cma_cnt;
+ reserved_size += request_size;
+ }
+
+ if (cma_size > reserved_size)
+ pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n",
+ cma_size >> 20, reserved_size >> 20, crashk_cma_cnt);
+ else
+ pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n",
+ reserved_size >> 20, crashk_cma_cnt);
+}
+
+#else /* CRASHKERNEL_CMA */
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+ if (cma_size)
+ pr_warn("crashkernel CMA reservation not supported\n");
+}
+#endif
+
#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f774367c8e71..7ca1940607bd 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -580,8 +580,8 @@ retry:
out:
/* Revert back reference counter if instruction update failed. */
- if (ret < 0 && is_register && ref_ctr_updated)
- update_ref_ctr(uprobe, mm, -1);
+ if (ret < 0 && ref_ctr_updated)
+ update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
if (ret > 0)
diff --git a/kernel/exit.c b/kernel/exit.c
index 1d8c8ac33c4f..343eb97543d5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -693,12 +693,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
}
/*
- * This does two things:
- *
- * A. Make init inherit all the child processes
- * B. Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ * Make init inherit all the child processes
*/
static void forget_original_parent(struct task_struct *father,
struct list_head *dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index e45354cc7cac..9ce93fd20f82 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -189,33 +189,33 @@ static inline void free_task_struct(struct task_struct *tsk)
kmem_cache_free(task_struct_cachep, tsk);
}
-/*
- * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
- * kmemcache based allocator.
- */
-# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-
-# ifdef CONFIG_VMAP_STACK
+#ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
*/
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+/*
+ * Allocated stacks are cached and later reused by new threads, so memcg
+ * accounting is performed by the code assigning/releasing stacks to tasks.
+ * We need a zeroed memory without __GFP_ACCOUNT.
+ */
+#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
struct vm_stack {
struct rcu_head rcu;
struct vm_struct *stack_vm_area;
};
-static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
{
unsigned int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
struct vm_struct *tmp = NULL;
- if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm))
+ if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
return true;
}
return false;
@@ -224,11 +224,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
static void thread_stack_free_rcu(struct rcu_head *rh)
{
struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+ struct vm_struct *vm_area = vm_stack->stack_vm_area;
if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
return;
- vfree(vm_stack);
+ vfree(vm_area->addr);
}
static void thread_stack_delayed_free(struct task_struct *tsk)
@@ -241,32 +242,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk)
static int free_vm_stack_cache(unsigned int cpu)
{
- struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
+ struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *vm_stack = cached_vm_stacks[i];
+ struct vm_struct *vm_area = cached_vm_stack_areas[i];
- if (!vm_stack)
+ if (!vm_area)
continue;
- vfree(vm_stack->addr);
- cached_vm_stacks[i] = NULL;
+ vfree(vm_area->addr);
+ cached_vm_stack_areas[i] = NULL;
}
return 0;
}
-static int memcg_charge_kernel_stack(struct vm_struct *vm)
+static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
{
int i;
int ret;
int nr_charged = 0;
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+ BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
if (ret)
goto err;
nr_charged++;
@@ -274,55 +275,47 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm)
return 0;
err:
for (i = 0; i < nr_charged; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
return ret;
}
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
- struct vm_struct *vm;
+ struct vm_struct *vm_area;
void *stack;
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *s;
-
- s = this_cpu_xchg(cached_stacks[i], NULL);
-
- if (!s)
+ vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+ if (!vm_area)
continue;
/* Reset stack metadata. */
- kasan_unpoison_range(s->addr, THREAD_SIZE);
+ kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
- stack = kasan_reset_tag(s->addr);
+ stack = kasan_reset_tag(vm_area->addr);
/* Clear stale pointers from reused stack. */
memset(stack, 0, THREAD_SIZE);
- if (memcg_charge_kernel_stack(s)) {
- vfree(s->addr);
+ if (memcg_charge_kernel_stack(vm_area)) {
+ vfree(vm_area->addr);
return -ENOMEM;
}
- tsk->stack_vm_area = s;
+ tsk->stack_vm_area = vm_area;
tsk->stack = stack;
return 0;
}
- /*
- * Allocated stacks are cached and later reused by new threads,
- * so memcg accounting is performed manually on assigning/releasing
- * stacks to tasks. Drop __GFP_ACCOUNT.
- */
stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
- THREADINFO_GFP & ~__GFP_ACCOUNT,
+ GFP_VMAP_STACK,
node, __builtin_return_address(0));
if (!stack)
return -ENOMEM;
- vm = find_vm_area(stack);
- if (memcg_charge_kernel_stack(vm)) {
+ vm_area = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm_area)) {
vfree(stack);
return -ENOMEM;
}
@@ -331,7 +324,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- tsk->stack_vm_area = vm;
+ tsk->stack_vm_area = vm_area;
stack = kasan_reset_tag(stack);
tsk->stack = stack;
return 0;
@@ -346,7 +339,13 @@ static void free_thread_stack(struct task_struct *tsk)
tsk->stack_vm_area = NULL;
}
-# else /* !CONFIG_VMAP_STACK */
+#else /* !CONFIG_VMAP_STACK */
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+#if THREAD_SIZE >= PAGE_SIZE
static void thread_stack_free_rcu(struct rcu_head *rh)
{
@@ -378,8 +377,7 @@ static void free_thread_stack(struct task_struct *tsk)
tsk->stack = NULL;
}
-# endif /* CONFIG_VMAP_STACK */
-# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
+#else /* !(THREAD_SIZE >= PAGE_SIZE) */
static struct kmem_cache *thread_stack_cache;
@@ -418,7 +416,8 @@ void thread_stack_cache_init(void)
BUG_ON(thread_stack_cache == NULL);
}
-# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#endif /* THREAD_SIZE >= PAGE_SIZE */
+#endif /* CONFIG_VMAP_STACK */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -438,11 +437,11 @@ static struct kmem_cache *mm_cachep;
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- struct vm_struct *vm = task_stack_vm_area(tsk);
+ struct vm_struct *vm_area = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
+ mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
void *stack = task_stack_page(tsk);
@@ -458,12 +457,12 @@ void exit_task_stack_account(struct task_struct *tsk)
account_kernel_stack(tsk, -1);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- struct vm_struct *vm;
+ struct vm_struct *vm_area;
int i;
- vm = task_stack_vm_area(tsk);
+ vm_area = task_stack_vm_area(tsk);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
}
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d2432df2b905..8708a1205f82 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -23,6 +23,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/hung_task.h>
+#include <linux/rwsem.h>
#include <trace/events/sched.h>
@@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task)
{
struct task_struct *g, *t;
unsigned long owner, blocker, blocker_type;
+ const char *rwsem_blocked_by, *rwsem_blocked_as;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
@@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task)
switch (blocker_type) {
case BLOCKER_TYPE_MUTEX:
- owner = mutex_get_owner(
- (struct mutex *)hung_task_blocker_to_lock(blocker));
+ owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
break;
case BLOCKER_TYPE_SEM:
- owner = sem_last_holder(
- (struct semaphore *)hung_task_blocker_to_lock(blocker));
+ owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ owner = (unsigned long)rwsem_owner(
+ hung_task_blocker_to_lock(blocker));
+ rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
+ "reader" : "writer";
+ rwsem_blocked_by = is_rwsem_reader_owned(
+ hung_task_blocker_to_lock(blocker)) ?
+ "reader" : "writer";
break;
default:
WARN_ON_ONCE(1);
@@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task)
pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
task->comm, task->pid);
break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
}
return;
}
@@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task)
pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
task->comm, task->pid, t->comm, t->pid);
break;
+ case BLOCKER_TYPE_RWSEM_READER:
+ case BLOCKER_TYPE_RWSEM_WRITER:
+ pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
+ task->comm, task->pid, rwsem_blocked_as, t->comm,
+ t->pid, rwsem_blocked_by);
+ break;
}
sched_show_task(t);
return;
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 187ba1b80bda..1d85597057e1 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg)
/*
* Fault in a lazily-faulted vmalloc area before it can be used by
- * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the
+ * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the
* vmalloc fault handling path is instrumented.
*/
static void kcov_fault_in_area(struct kcov *kcov)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a6b3f96bb50c..28008e3d462e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
goto out;
for (i = 0; i < nr_segments; i++) {
- ret = kimage_load_segment(image, &image->segment[i]);
+ ret = kimage_load_segment(image, i);
if (ret)
goto out;
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 351cd7d76dfa..31203f0bacaf 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -40,6 +40,7 @@
#include <linux/hugetlb.h>
#include <linux/objtool.h>
#include <linux/kmsg_dump.h>
+#include <linux/dma-map-ops.h>
#include <asm/page.h>
#include <asm/sections.h>
@@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry)
kimage_free_pages(page);
}
+static void kimage_free_cma(struct kimage *image)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct page *cma = image->segment_cma[i];
+ u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT;
+
+ if (!cma)
+ continue;
+
+ arch_kexec_pre_free_pages(page_address(cma), nr_pages);
+ dma_release_from_contiguous(NULL, cma, nr_pages);
+ image->segment_cma[i] = NULL;
+ }
+
+}
+
void kimage_free(struct kimage *image)
{
kimage_entry_t *ptr, entry;
@@ -591,6 +610,9 @@ void kimage_free(struct kimage *image)
/* Free the kexec control pages... */
kimage_free_page_list(&image->control_pages);
+ /* Free CMA allocations */
+ kimage_free_cma(image);
+
/*
* Free up any temporary buffers allocated. This might hit if
* error occurred much later after buffer allocation.
@@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image,
return page;
}
-static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
+static int kimage_load_cma_segment(struct kimage *image, int idx)
+{
+ struct kexec_segment *segment = &image->segment[idx];
+ struct page *cma = image->segment_cma[idx];
+ char *ptr = page_address(cma);
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result = 0;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ /* Then copy from source buffer to the CMA one */
+ while (mbytes) {
+ size_t uchunk, mchunk;
+
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+
+ if (uchunk) {
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ ubytes -= uchunk;
+ if (image->file_mode)
+ kbuf += uchunk;
+ else
+ buf += uchunk;
+ }
+
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+
+ ptr += mchunk;
+ maddr += mchunk;
+ mbytes -= mchunk;
+
+ cond_resched();
+ }
+
+ /* Clear any remainder */
+ memset(ptr, 0, mbytes);
+
+out:
+ return result;
+}
+
+static int kimage_load_normal_segment(struct kimage *image, int idx)
{
+ struct kexec_segment *segment = &image->segment[idx];
unsigned long maddr;
size_t ubytes, mbytes;
int result;
@@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image,
mbytes = segment->memsz;
maddr = segment->mem;
+ if (image->segment_cma[idx])
+ return kimage_load_cma_segment(image, idx);
+
result = kimage_set_destination(image, maddr);
if (result < 0)
goto out;
@@ -787,13 +872,13 @@ out:
}
#ifdef CONFIG_CRASH_DUMP
-static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
+static int kimage_load_crash_segment(struct kimage *image, int idx)
{
/* For crash dumps kernels we simply copy the data from
* user space to it's destination.
* We do things a page at a time for the sake of kmap.
*/
+ struct kexec_segment *segment = &image->segment[idx];
unsigned long maddr;
size_t ubytes, mbytes;
int result;
@@ -858,18 +943,17 @@ out:
}
#endif
-int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
+int kimage_load_segment(struct kimage *image, int idx)
{
int result = -ENOMEM;
switch (image->type) {
case KEXEC_TYPE_DEFAULT:
- result = kimage_load_normal_segment(image, segment);
+ result = kimage_load_normal_segment(image, idx);
break;
#ifdef CONFIG_CRASH_DUMP
case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
+ result = kimage_load_crash_segment(image, idx);
break;
#endif
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b835033c65eb..91d46502a817 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,6 +26,7 @@
#include <linux/kernel_read_file.h>
#include <linux/syscalls.h>
#include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
#include "kexec_internal.h"
#ifdef CONFIG_KEXEC_SIG
@@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
ret = 0;
}
+ image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+
if (cmdline_len) {
image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
if (IS_ERR(image->cmdline_buf)) {
@@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
i, ksegment->buf, ksegment->bufsz, ksegment->mem,
ksegment->memsz);
- ret = kimage_load_segment(image, &image->segment[i]);
+ ret = kimage_load_segment(image, i);
if (ret)
goto out;
}
@@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
}
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+ size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;
+ unsigned long mem;
+ struct page *p;
+
+ /* User space disabled CMA allocations, bail out. */
+ if (kbuf->image->no_cma)
+ return -EPERM;
+
+ /* Skip CMA logic for crash kernel */
+ if (kbuf->image->type == KEXEC_TYPE_CRASH)
+ return -EPERM;
+
+ p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true);
+ if (!p)
+ return -ENOMEM;
+
+ pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p));
+
+ mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+ if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+ /* Our region is already in use by a statically defined one. Bail out. */
+ pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+ dma_release_from_contiguous(NULL, p, nr_pages);
+ return -EBUSY;
+ }
+
+ kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+ kbuf->cma = p;
+
+ arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0);
+
+ return 0;
+}
+
/**
* kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
* @kbuf: Parameters for the memory search.
@@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (ret <= 0)
return ret;
+ /*
+ * Try to find a free physically contiguous block of memory first. With that, we
+ * can avoid any copying at kexec time.
+ */
+ if (!kexec_alloc_contig(kbuf))
+ return 0;
+
if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
@@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
/* Ensure minimum alignment needed for segments. */
kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
+ kbuf->cma = NULL;
/* Walk the RAM ranges and allocate a suitable range for the buffer */
ret = arch_kexec_locate_mem_hole(kbuf);
@@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
ksegment->bufsz = kbuf->bufsz;
ksegment->mem = kbuf->mem;
ksegment->memsz = kbuf->memsz;
+ kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma;
kbuf->image->nr_segments++;
return 0;
}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 30a733a55a67..228bb88c018b 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void);
int sanity_check_segment_list(struct kimage *image);
void kimage_free_page_list(struct list_head *list);
void kimage_free(struct kimage *image);
-int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+int kimage_load_segment(struct kimage *image, int idx);
void kimage_terminate(struct kimage *image);
int kimage_is_destination_range(struct kimage *image,
unsigned long start, unsigned long end);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 85fc068f0083..0e98b228a8ef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k)
/*
* Variant of to_kthread() that doesn't assume @p is a kthread.
*
- * Per construction; when:
+ * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
+ * always remain a kthread. For kthreads p->worker_private always
+ * points to a struct kthread. For tasks that are not kthreads
+ * p->worker_private is used to point to other things.
*
- * (p->flags & PF_KTHREAD) && p->worker_private
- *
- * the task is both a kthread and struct kthread is persistent. However
- * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
- * begin_new_exec()).
+ * Return NULL for any task that is not a kthread.
*/
static inline struct kthread *__to_kthread(struct task_struct *p)
{
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 8572dba95af4..24df4d98f7d2 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <linux/hung_task.h>
#include <trace/events/lock.h>
#ifndef CONFIG_PREEMPT_RT
@@ -181,11 +182,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
__rwsem_set_reader_owned(sem, current);
}
-#ifdef CONFIG_DEBUG_RWSEMS
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
/*
* Return just the real task structure pointer of the owner
*/
-static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+struct task_struct *rwsem_owner(struct rw_semaphore *sem)
{
return (struct task_struct *)
(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
@@ -194,7 +195,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
/*
* Return true if the rwsem is owned by a reader.
*/
-static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+bool is_rwsem_reader_owned(struct rw_semaphore *sem)
{
/*
* Check the count to see if it is write-locked.
@@ -207,10 +208,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
}
/*
- * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
- * is a task pointer in owner of a reader-owned rwsem, it will be the
- * real owner or one of the real owners. The only exception is when the
- * unlock is done by up_read_non_owner().
+ * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
+ * it will make sure that the owner field of a reader-owned rwsem either
+ * points to a real reader-owner(s) or gets cleared. The only exception is
+ * when the unlock is done by up_read_non_owner().
*/
static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
{
@@ -1063,10 +1064,13 @@ queue:
wake_up_q(&wake_q);
trace_contention_begin(sem, LCB_F_READ);
+ set_current_state(state);
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
/* wait to be given the lock */
for (;;) {
- set_current_state(state);
if (!smp_load_acquire(&waiter.task)) {
/* Matches rwsem_mark_wake()'s smp_store_release(). */
break;
@@ -1081,8 +1085,12 @@ queue:
}
schedule_preempt_disabled();
lockevent_inc(rwsem_sleep_reader);
+ set_current_state(state);
}
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
trace_contention_end(sem, 0);
@@ -1144,6 +1152,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
set_current_state(state);
trace_contention_begin(sem, LCB_F_WRITE);
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+
for (;;) {
if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1177,6 +1188,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
+
+ if (state == TASK_UNINTERRUPTIBLE)
+ hung_task_clear_blocker();
+
__set_current_state(TASK_RUNNING);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
diff --git a/kernel/panic.c b/kernel/panic.c
index 43817111c979..72fcbb5a071b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,6 +36,7 @@
#include <linux/sysfs.h>
#include <linux/context_tracking.h>
#include <linux/seq_buf.h>
+#include <linux/sys_info.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -63,20 +64,13 @@ int panic_on_warn __read_mostly;
unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
static unsigned int warn_limit __read_mostly;
+static bool panic_console_replay;
bool panic_triggering_all_cpu_backtrace;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
-#define PANIC_PRINT_TASK_INFO 0x00000001
-#define PANIC_PRINT_MEM_INFO 0x00000002
-#define PANIC_PRINT_TIMER_INFO 0x00000004
-#define PANIC_PRINT_LOCK_INFO 0x00000008
-#define PANIC_PRINT_FTRACE_INFO 0x00000010
-#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
-#define PANIC_PRINT_ALL_CPU_BT 0x00000040
-#define PANIC_PRINT_BLOCKED_TASKS 0x00000080
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -128,6 +122,13 @@ static int proc_taint(const struct ctl_table *table, int write,
return err;
}
+static int sysctl_panic_print_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n");
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
+
static const struct ctl_table kern_panic_table[] = {
#ifdef CONFIG_SMP
{
@@ -165,7 +166,7 @@ static const struct ctl_table kern_panic_table[] = {
.data = &panic_print,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = sysctl_panic_print_handler,
},
{
.procname = "panic_on_warn",
@@ -193,6 +194,13 @@ static const struct ctl_table kern_panic_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+ {
+ .procname = "panic_sys_info",
+ .data = &panic_print,
+ .maxlen = sizeof(panic_print),
+ .mode = 0644,
+ .proc_handler = sysctl_sys_info_handler,
+ },
};
static __init int kernel_panic_sysctls_init(void)
@@ -203,6 +211,15 @@ static __init int kernel_panic_sysctls_init(void)
late_initcall(kernel_panic_sysctls_init);
#endif
+/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */
+static int __init setup_panic_sys_info(char *buf)
+{
+ /* There is no risk of race in kernel boot phase */
+ panic_print = sys_info_parse_param(buf);
+ return 1;
+}
+__setup("panic_sys_info=", setup_panic_sys_info);
+
static atomic_t warn_count = ATOMIC_INIT(0);
#ifdef CONFIG_SYSFS
@@ -298,33 +315,6 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
}
EXPORT_SYMBOL(nmi_panic);
-static void panic_print_sys_info(bool console_flush)
-{
- if (console_flush) {
- if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
- console_flush_on_panic(CONSOLE_REPLAY_ALL);
- return;
- }
-
- if (panic_print & PANIC_PRINT_TASK_INFO)
- show_state();
-
- if (panic_print & PANIC_PRINT_MEM_INFO)
- show_mem();
-
- if (panic_print & PANIC_PRINT_TIMER_INFO)
- sysrq_timer_list_show();
-
- if (panic_print & PANIC_PRINT_LOCK_INFO)
- debug_show_all_locks();
-
- if (panic_print & PANIC_PRINT_FTRACE_INFO)
- ftrace_dump(DUMP_ALL);
-
- if (panic_print & PANIC_PRINT_BLOCKED_TASKS)
- show_state_filter(TASK_UNINTERRUPTIBLE);
-}
-
void check_panic_on_warn(const char *origin)
{
unsigned int limit;
@@ -345,7 +335,7 @@ void check_panic_on_warn(const char *origin)
*/
static void panic_other_cpus_shutdown(bool crash_kexec)
{
- if (panic_print & PANIC_PRINT_ALL_CPU_BT) {
+ if (panic_print & SYS_INFO_ALL_CPU_BT) {
/* Temporary allow non-panic CPUs to write their backtraces. */
panic_triggering_all_cpu_backtrace = true;
trigger_all_cpu_backtrace();
@@ -468,7 +458,7 @@ void vpanic(const char *fmt, va_list args)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- panic_print_sys_info(false);
+ sys_info(panic_print);
kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
@@ -497,7 +487,9 @@ void vpanic(const char *fmt, va_list args)
debug_locks_off();
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
- panic_print_sys_info(true);
+ if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) ||
+ panic_console_replay)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
if (!panic_blink)
panic_blink = no_blink;
@@ -949,6 +941,7 @@ core_param(panic_print, panic_print, ulong, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
core_param(panic_on_warn, panic_on_warn, int, 0644);
core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644);
+core_param(panic_console_replay, panic_console_replay, bool, 0644);
static int __init oops_setup(char *s)
{
diff --git a/kernel/relay.c b/kernel/relay.c
index c0c93a04d4ce..8d915fe98198 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -118,7 +118,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
return NULL;
for (i = 0; i < n_pages; i++) {
- buf->page_array[i] = alloc_page(GFP_KERNEL);
+ buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (unlikely(!buf->page_array[i]))
goto depopulate;
set_page_private(buf->page_array[i], (unsigned long)buf);
@@ -127,7 +127,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
if (!mem)
goto depopulate;
- memset(mem, 0, *size);
buf->page_count = n_pages;
return mem;
@@ -250,13 +249,18 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
*/
static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
- void *prev_subbuf, size_t prev_padding)
+ void *prev_subbuf)
{
+ int full = relay_buf_full(buf);
+
+ if (full)
+ buf->stats.full_count++;
+
if (!buf->chan->cb->subbuf_start)
- return !relay_buf_full(buf);
+ return !full;
return buf->chan->cb->subbuf_start(buf, subbuf,
- prev_subbuf, prev_padding);
+ prev_subbuf);
}
/**
@@ -298,11 +302,13 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
buf->finalized = 0;
buf->data = buf->start;
buf->offset = 0;
+ buf->stats.full_count = 0;
+ buf->stats.big_count = 0;
for (i = 0; i < buf->chan->n_subbufs; i++)
buf->padding[i] = 0;
- relay_subbuf_start(buf, buf->data, NULL, 0);
+ relay_subbuf_start(buf, buf->data, NULL);
}
/**
@@ -555,9 +561,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
goto toobig;
if (buf->offset != buf->chan->subbuf_size + 1) {
- buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+ size_t prev_padding;
+
+ prev_padding = buf->chan->subbuf_size - buf->offset;
old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
- buf->padding[old_subbuf] = buf->prev_padding;
+ buf->padding[old_subbuf] = prev_padding;
buf->subbufs_produced++;
if (buf->dentry)
d_inode(buf->dentry)->i_size +=
@@ -582,7 +590,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
- if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) {
+ if (!relay_subbuf_start(buf, new, old)) {
buf->offset = buf->chan->subbuf_size + 1;
return 0;
}
@@ -595,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
return length;
toobig:
- buf->chan->last_toobig = length;
+ buf->stats.big_count++;
return 0;
}
EXPORT_SYMBOL_GPL(relay_switch_subbuf);
@@ -655,11 +663,6 @@ void relay_close(struct rchan *chan)
if ((buf = *per_cpu_ptr(chan->buf, i)))
relay_close_buf(buf);
- if (chan->last_toobig)
- printk(KERN_WARNING "relay: one or more items not logged "
- "[item size (%zd) > sub-buffer size (%zd)]\n",
- chan->last_toobig, chan->subbuf_size);
-
list_del(&chan->list);
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
@@ -694,6 +697,42 @@ void relay_flush(struct rchan *chan)
EXPORT_SYMBOL_GPL(relay_flush);
/**
+ * relay_stats - get channel buffer statistics
+ * @chan: the channel
+ * @flags: select particular information to get
+ *
+ * Returns the count of certain field that caller specifies.
+ */
+size_t relay_stats(struct rchan *chan, int flags)
+{
+ unsigned int i, count = 0;
+ struct rchan_buf *rbuf;
+
+ if (!chan || flags > RELAY_STATS_LAST)
+ return 0;
+
+ if (chan->is_global) {
+ rbuf = *per_cpu_ptr(chan->buf, 0);
+ if (flags & RELAY_STATS_BUF_FULL)
+ count = rbuf->stats.full_count;
+ else if (flags & RELAY_STATS_WRT_BIG)
+ count = rbuf->stats.big_count;
+ } else {
+ for_each_online_cpu(i) {
+ rbuf = *per_cpu_ptr(chan->buf, i);
+ if (rbuf) {
+ if (flags & RELAY_STATS_BUF_FULL)
+ count += rbuf->stats.full_count;
+ else if (flags & RELAY_STATS_WRT_BIG)
+ count += rbuf->stats.big_count;
+ }
+ }
+ }
+
+ return count;
+}
+
+/**
* relay_file_open - open file op for relay files
* @inode: the inode
* @filp: the file
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 47168d2afbf1..6941145b5058 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -415,9 +415,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
size_t count, loff_t *ppos)
{
struct blk_trace *bt = filp->private_data;
+ size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL);
char buf[16];
- snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+ snprintf(buf, sizeof(buf), "%zu\n", dropped);
return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
}
@@ -456,23 +457,6 @@ static const struct file_operations blk_msg_fops = {
.llseek = noop_llseek,
};
-/*
- * Keep track of how many times we encountered a full subbuffer, to aid
- * the user space app in telling how many lost events there were.
- */
-static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
- void *prev_subbuf, size_t prev_padding)
-{
- struct blk_trace *bt;
-
- if (!relay_buf_full(buf))
- return 1;
-
- bt = buf->chan->private_data;
- atomic_inc(&bt->dropped);
- return 0;
-}
-
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
debugfs_remove(dentry);
@@ -491,7 +475,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename,
}
static const struct rchan_callbacks blk_relay_callbacks = {
- .subbuf_start = blk_subbuf_start_callback,
.create_buf_file = blk_create_buf_file_callback,
.remove_buf_file = blk_remove_buf_file_callback,
};
@@ -580,7 +563,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
}
bt->dev = dev;
- atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
ret = -EIO;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 8686e329b8f2..586af49fc03e 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -199,18 +199,16 @@ void put_ucounts(struct ucounts *ucounts)
}
}
-static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
{
- long c, old;
- c = atomic_long_read(v);
- for (;;) {
+ long c = atomic_long_read(v);
+
+ do {
if (unlikely(c >= u))
return false;
- old = atomic_long_cmpxchg(v, c, c+1);
- if (likely(old == c))
- return true;
- c = old;
- }
+ } while (!atomic_long_try_cmpxchg(v, &c, c+1));
+
+ return true;
}
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,