diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/crash_core.c | 15 | ||||
| -rw-r--r-- | kernel/crash_reserve.c | 68 | ||||
| -rw-r--r-- | kernel/events/uprobes.c | 4 | ||||
| -rw-r--r-- | kernel/exit.c | 7 | ||||
| -rw-r--r-- | kernel/fork.c | 95 | ||||
| -rw-r--r-- | kernel/hung_task.c | 29 | ||||
| -rw-r--r-- | kernel/kcov.c | 2 | ||||
| -rw-r--r-- | kernel/kexec.c | 2 | ||||
| -rw-r--r-- | kernel/kexec_core.c | 100 | ||||
| -rw-r--r-- | kernel/kexec_file.c | 51 | ||||
| -rw-r--r-- | kernel/kexec_internal.h | 2 | ||||
| -rw-r--r-- | kernel/kthread.c | 11 | ||||
| -rw-r--r-- | kernel/locking/rwsem.c | 31 | ||||
| -rw-r--r-- | kernel/panic.c | 71 | ||||
| -rw-r--r-- | kernel/relay.c | 69 | ||||
| -rw-r--r-- | kernel/trace/blktrace.c | 22 | ||||
| -rw-r--r-- | kernel/ucount.c | 16 |
17 files changed, 424 insertions, 171 deletions
diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 335b8425dd4b..a4ef79591eb2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -21,6 +21,7 @@ #include <linux/reboot.h> #include <linux/btf.h> #include <linux/objtool.h> +#include <linux/delay.h> #include <asm/page.h> #include <asm/sections.h> @@ -33,6 +34,11 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +/* time to wait for possible DMA to finish before starting the kdump kernel + * when a CMA reservation is used + */ +#define CMA_DMA_TIMEOUT_SEC 10 + #ifdef CONFIG_CRASH_DUMP int kimage_crash_copy_vmcoreinfo(struct kimage *image) @@ -97,6 +103,14 @@ int kexec_crash_loaded(void) } EXPORT_SYMBOL_GPL(kexec_crash_loaded); +static void crash_cma_clear_pending_dma(void) +{ + if (!crashk_cma_cnt) + return; + + mdelay(CMA_DMA_TIMEOUT_SEC * 1000); +} + /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU @@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); + crash_cma_clear_pending_dma(); machine_kexec(kexec_crash_image); } kexec_unlock(); diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index acb6bf42e30d..87bf4d41eabb 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -14,6 +14,8 @@ #include <linux/cpuhotplug.h> #include <linux/memblock.h> #include <linux/kmemleak.h> +#include <linux/cma.h> +#include <linux/crash_reserve.h> #include <asm/page.h> #include <asm/sections.h> @@ -172,17 +174,19 @@ static int __init parse_crashkernel_simple(char *cmdline, #define SUFFIX_HIGH 0 #define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 +#define SUFFIX_CMA 2 +#define SUFFIX_NULL 3 static __initdata char *suffix_tbl[] = { [SUFFIX_HIGH] = ",high", [SUFFIX_LOW] = ",low", + [SUFFIX_CMA] = ",cma", [SUFFIX_NULL] = NULL, }; /* * That function parses "suffix" crashkernel command lines like * - * crashkernel=size,[high|low] + * crashkernel=size,[high|low|cma] * * It returns 0 on success and -EINVAL on failure. */ @@ -298,9 +302,11 @@ int __init parse_crashkernel(char *cmdline, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, + unsigned long long *cma_size, bool *high) { int ret; + unsigned long long __always_unused cma_base; /* crashkernel=X[@offset] */ ret = __parse_crashkernel(cmdline, system_ram, crash_size, @@ -331,6 +337,14 @@ int __init parse_crashkernel(char *cmdline, *high = true; } + + /* + * optional CMA reservation + * cma_base is ignored + */ + if (cma_size) + __parse_crashkernel(cmdline, 0, cma_size, + &cma_base, suffix_tbl[SUFFIX_CMA]); #endif if (!*crash_size) ret = -EINVAL; @@ -457,6 +471,56 @@ retry: #endif } +struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX]; +#ifdef CRASHKERNEL_CMA +int crashk_cma_cnt; +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + unsigned long long request_size = roundup(cma_size, PAGE_SIZE); + unsigned long long reserved_size = 0; + + if (!cma_size) + return; + + while (cma_size > reserved_size && + crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) { + + struct cma *res; + + if (cma_declare_contiguous(0, request_size, 0, 0, 0, false, + "crashkernel", &res)) { + /* reservation failed, try half-sized blocks */ + if (request_size <= PAGE_SIZE) + break; + + request_size = roundup(request_size / 2, PAGE_SIZE); + continue; + } + + crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res); + crashk_cma_ranges[crashk_cma_cnt].end = + crashk_cma_ranges[crashk_cma_cnt].start + + cma_get_size(res) - 1; + ++crashk_cma_cnt; + reserved_size += request_size; + } + + if (cma_size > reserved_size) + pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n", + cma_size >> 20, reserved_size >> 20, crashk_cma_cnt); + else + pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n", + reserved_size >> 20, crashk_cma_cnt); +} + +#else /* CRASHKERNEL_CMA */ +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + if (cma_size) + pr_warn("crashkernel CMA reservation not supported\n"); +} +#endif + #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f774367c8e71..7ca1940607bd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -580,8 +580,8 @@ retry: out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && is_register && ref_ctr_updated) - update_ref_ctr(uprobe, mm, -1); + if (ret < 0 && ref_ctr_updated) + update_ref_ctr(uprobe, mm, is_register ? -1 : 1); /* try collapse pmd for compound page */ if (ret > 0) diff --git a/kernel/exit.c b/kernel/exit.c index 1d8c8ac33c4f..343eb97543d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -693,12 +693,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, } /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) diff --git a/kernel/fork.c b/kernel/fork.c index e45354cc7cac..9ce93fd20f82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -189,33 +189,33 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) - -# ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +/* + * Allocated stacks are cached and later reused by new threads, so memcg + * accounting is performed by the code assigning/releasing stacks to tasks. + * We need a zeroed memory without __GFP_ACCOUNT. + */ +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } return false; @@ -224,11 +224,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_stack); + vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -241,32 +242,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_stack = cached_vm_stacks[i]; + struct vm_struct *vm_area = cached_vm_stack_areas[i]; - if (!vm_stack) + if (!vm_area) continue; - vfree(vm_stack->addr); - cached_vm_stacks[i] = NULL; + vfree(vm_area->addr); + cached_vm_stack_areas[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm) +static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -274,55 +275,47 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) return 0; err: for (i = 0; i < nr_charged; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm; + struct vm_struct *vm_area; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s; - - s = this_cpu_xchg(cached_stacks[i], NULL); - - if (!s) + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (!vm_area) continue; /* Reset stack metadata. */ - kasan_unpoison_range(s->addr, THREAD_SIZE); + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); - stack = kasan_reset_tag(s->addr); + stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(s)) { - vfree(s->addr); + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); return -ENOMEM; } - tsk->stack_vm_area = s; + tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } - /* - * Allocated stacks are cached and later reused by new threads, - * so memcg accounting is performed manually on assigning/releasing - * stacks to tasks. Drop __GFP_ACCOUNT. - */ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, - THREADINFO_GFP & ~__GFP_ACCOUNT, + GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; - vm = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm)) { + vm_area = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } @@ -331,7 +324,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm; + tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -346,7 +339,13 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -# else /* !CONFIG_VMAP_STACK */ +#else /* !CONFIG_VMAP_STACK */ + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +#if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -378,8 +377,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -# endif /* CONFIG_VMAP_STACK */ -# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ +#else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; @@ -418,7 +416,8 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* THREAD_SIZE >= PAGE_SIZE */ +#endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -438,11 +437,11 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm = task_stack_vm_area(tsk); + struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -458,12 +457,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm; + struct vm_struct *vm_area; int i; - vm = task_stack_vm_area(tsk); + vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d2432df2b905..8708a1205f82 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -23,6 +23,7 @@ #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> #include <linux/hung_task.h> +#include <linux/rwsem.h> #include <trace/events/sched.h> @@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; + const char *rwsem_blocked_by, *rwsem_blocked_as; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); @@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task) switch (blocker_type) { case BLOCKER_TYPE_MUTEX: - owner = mutex_get_owner( - (struct mutex *)hung_task_blocker_to_lock(blocker)); + owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_SEM: - owner = sem_last_holder( - (struct semaphore *)hung_task_blocker_to_lock(blocker)); + owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + owner = (unsigned long)rwsem_owner( + hung_task_blocker_to_lock(blocker)); + rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? + "reader" : "writer"; + rwsem_blocked_by = is_rwsem_reader_owned( + hung_task_blocker_to_lock(blocker)) ? + "reader" : "writer"; break; default: WARN_ON_ONCE(1); @@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", task->comm, task->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", + task->comm, task->pid); + break; } return; } @@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", task->comm, task->pid, t->comm, t->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", + task->comm, task->pid, rwsem_blocked_as, t->comm, + t->pid, rwsem_blocked_by); + break; } sched_show_task(t); return; diff --git a/kernel/kcov.c b/kernel/kcov.c index 187ba1b80bda..1d85597057e1 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg) /* * Fault in a lazily-faulted vmalloc area before it can be used by - * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the + * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the * vmalloc fault handling path is instrumented. */ static void kcov_fault_in_area(struct kcov *kcov) diff --git a/kernel/kexec.c b/kernel/kexec.c index a6b3f96bb50c..28008e3d462e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; for (i = 0; i < nr_segments; i++) { - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 351cd7d76dfa..31203f0bacaf 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -40,6 +40,7 @@ #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> +#include <linux/dma-map-ops.h> #include <asm/page.h> #include <asm/sections.h> @@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry) kimage_free_pages(page); } +static void kimage_free_cma(struct kimage *image) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + struct page *cma = image->segment_cma[i]; + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; + + if (!cma) + continue; + + arch_kexec_pre_free_pages(page_address(cma), nr_pages); + dma_release_from_contiguous(NULL, cma, nr_pages); + image->segment_cma[i] = NULL; + } + +} + void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; @@ -591,6 +610,9 @@ void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + /* Free CMA allocations */ + kimage_free_cma(image); + /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. @@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image, return page; } -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_cma_segment(struct kimage *image, int idx) +{ + struct kexec_segment *segment = &image->segment[idx]; + struct page *cma = image->segment_cma[idx]; + char *ptr = page_address(cma); + unsigned long maddr; + size_t ubytes, mbytes; + int result = 0; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + /* Then copy from source buffer to the CMA one */ + while (mbytes) { + size_t uchunk, mchunk; + + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + if (result) { + result = -EFAULT; + goto out; + } + + ptr += mchunk; + maddr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } + + /* Clear any remainder */ + memset(ptr, 0, mbytes); + +out: + return result; +} + +static int kimage_load_normal_segment(struct kimage *image, int idx) { + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image, mbytes = segment->memsz; maddr = segment->mem; + if (image->segment_cma[idx]) + return kimage_load_cma_segment(image, idx); + result = kimage_set_destination(image, maddr); if (result < 0) goto out; @@ -787,13 +872,13 @@ out: } #ifdef CONFIG_CRASH_DUMP -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. */ + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -858,18 +943,17 @@ out: } #endif -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) +int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); + result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_crash_segment(image, idx); break; #endif } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b835033c65eb..91d46502a817 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,6 +26,7 @@ #include <linux/kernel_read_file.h> #include <linux/syscalls.h> #include <linux/vmalloc.h> +#include <linux/dma-map-ops.h> #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = 0; } + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); if (IS_ERR(image->cmdline_buf)) { @@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, i, ksegment->buf, ksegment->bufsz, ksegment->mem, ksegment->memsz); - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } @@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +static int kexec_alloc_contig(struct kexec_buf *kbuf) +{ + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; + unsigned long mem; + struct page *p; + + /* User space disabled CMA allocations, bail out. */ + if (kbuf->image->no_cma) + return -EPERM; + + /* Skip CMA logic for crash kernel */ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return -EPERM; + + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); + if (!p) + return -ENOMEM; + + pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p)); + + mem = page_to_boot_pfn(p) << PAGE_SHIFT; + + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { + /* Our region is already in use by a statically defined one. Bail out. */ + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); + dma_release_from_contiguous(NULL, p, nr_pages); + return -EBUSY; + } + + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT; + kbuf->cma = p; + + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); + + return 0; +} + /** * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel * @kbuf: Parameters for the memory search. @@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (ret <= 0) return ret; + /* + * Try to find a free physically contiguous block of memory first. With that, we + * can avoid any copying at kexec time. + */ + if (!kexec_alloc_contig(kbuf)) + return 0; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Ensure minimum alignment needed for segments. */ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); + kbuf->cma = NULL; /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_locate_mem_hole(kbuf); @@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; kbuf->image->nr_segments++; return 0; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 30a733a55a67..228bb88c018b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); void kimage_free(struct kimage *image); -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +int kimage_load_segment(struct kimage *image, int idx); void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); diff --git a/kernel/kthread.c b/kernel/kthread.c index 85fc068f0083..0e98b228a8ef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k) /* * Variant of to_kthread() that doesn't assume @p is a kthread. * - * Per construction; when: + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. * - * (p->flags & PF_KTHREAD) && p->worker_private - * - * the task is both a kthread and struct kthread is persistent. However - * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and - * begin_new_exec()). + * Return NULL for any task that is not a kthread. */ static inline struct kthread *__to_kthread(struct task_struct *p) { diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 8572dba95af4..24df4d98f7d2 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> +#include <linux/hung_task.h> #include <trace/events/lock.h> #ifndef CONFIG_PREEMPT_RT @@ -181,11 +182,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } -#ifdef CONFIG_DEBUG_RWSEMS +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +struct task_struct *rwsem_owner(struct rw_semaphore *sem) { return (struct task_struct *) (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); @@ -194,7 +195,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) /* * Return true if the rwsem is owned by a reader. */ -static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +bool is_rwsem_reader_owned(struct rw_semaphore *sem) { /* * Check the count to see if it is write-locked. @@ -207,10 +208,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) } /* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). + * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured, + * it will make sure that the owner field of a reader-owned rwsem either + * points to a real reader-owner(s) or gets cleared. The only exception is + * when the unlock is done by up_read_non_owner(). */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { @@ -1063,10 +1064,13 @@ queue: wake_up_q(&wake_q); trace_contention_begin(sem, LCB_F_READ); + set_current_state(state); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER); /* wait to be given the lock */ for (;;) { - set_current_state(state); if (!smp_load_acquire(&waiter.task)) { /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; @@ -1081,8 +1085,12 @@ queue: } schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); + set_current_state(state); } + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); trace_contention_end(sem, 0); @@ -1144,6 +1152,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) set_current_state(state); trace_contention_begin(sem, LCB_F_WRITE); + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1177,6 +1188,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) trylock_again: raw_spin_lock_irq(&sem->wait_lock); } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); diff --git a/kernel/panic.c b/kernel/panic.c index 43817111c979..72fcbb5a071b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,7 @@ #include <linux/sysfs.h> #include <linux/context_tracking.h> #include <linux/seq_buf.h> +#include <linux/sys_info.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -63,20 +64,13 @@ int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; +static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); -#define PANIC_PRINT_TASK_INFO 0x00000001 -#define PANIC_PRINT_MEM_INFO 0x00000002 -#define PANIC_PRINT_TIMER_INFO 0x00000004 -#define PANIC_PRINT_LOCK_INFO 0x00000008 -#define PANIC_PRINT_FTRACE_INFO 0x00000010 -#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 -#define PANIC_PRINT_ALL_CPU_BT 0x00000040 -#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -128,6 +122,13 @@ static int proc_taint(const struct ctl_table *table, int write, return err; } +static int sysctl_panic_print_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} + static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { @@ -165,7 +166,7 @@ static const struct ctl_table kern_panic_table[] = { .data = &panic_print, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = sysctl_panic_print_handler, }, { .procname = "panic_on_warn", @@ -193,6 +194,13 @@ static const struct ctl_table kern_panic_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "panic_sys_info", + .data = &panic_print, + .maxlen = sizeof(panic_print), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static __init int kernel_panic_sysctls_init(void) @@ -203,6 +211,15 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */ +static int __init setup_panic_sys_info(char *buf) +{ + /* There is no risk of race in kernel boot phase */ + panic_print = sys_info_parse_param(buf); + return 1; +} +__setup("panic_sys_info=", setup_panic_sys_info); + static atomic_t warn_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS @@ -298,33 +315,6 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); -static void panic_print_sys_info(bool console_flush) -{ - if (console_flush) { - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) - console_flush_on_panic(CONSOLE_REPLAY_ALL); - return; - } - - if (panic_print & PANIC_PRINT_TASK_INFO) - show_state(); - - if (panic_print & PANIC_PRINT_MEM_INFO) - show_mem(); - - if (panic_print & PANIC_PRINT_TIMER_INFO) - sysrq_timer_list_show(); - - if (panic_print & PANIC_PRINT_LOCK_INFO) - debug_show_all_locks(); - - if (panic_print & PANIC_PRINT_FTRACE_INFO) - ftrace_dump(DUMP_ALL); - - if (panic_print & PANIC_PRINT_BLOCKED_TASKS) - show_state_filter(TASK_UNINTERRUPTIBLE); -} - void check_panic_on_warn(const char *origin) { unsigned int limit; @@ -345,7 +335,7 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) { + if (panic_print & SYS_INFO_ALL_CPU_BT) { /* Temporary allow non-panic CPUs to write their backtraces. */ panic_triggering_all_cpu_backtrace = true; trigger_all_cpu_backtrace(); @@ -468,7 +458,7 @@ void vpanic(const char *fmt, va_list args) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - panic_print_sys_info(false); + sys_info(panic_print); kmsg_dump_desc(KMSG_DUMP_PANIC, buf); @@ -497,7 +487,9 @@ void vpanic(const char *fmt, va_list args) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_print_sys_info(true); + if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) || + panic_console_replay) + console_flush_on_panic(CONSOLE_REPLAY_ALL); if (!panic_blink) panic_blink = no_blink; @@ -949,6 +941,7 @@ core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); +core_param(panic_console_replay, panic_console_replay, bool, 0644); static int __init oops_setup(char *s) { diff --git a/kernel/relay.c b/kernel/relay.c index c0c93a04d4ce..8d915fe98198 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -118,7 +118,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) return NULL; for (i = 0; i < n_pages; i++) { - buf->page_array[i] = alloc_page(GFP_KERNEL); + buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); if (unlikely(!buf->page_array[i])) goto depopulate; set_page_private(buf->page_array[i], (unsigned long)buf); @@ -127,7 +127,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) if (!mem) goto depopulate; - memset(mem, 0, *size); buf->page_count = n_pages; return mem; @@ -250,13 +249,18 @@ EXPORT_SYMBOL_GPL(relay_buf_full); */ static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { + int full = relay_buf_full(buf); + + if (full) + buf->stats.full_count++; + if (!buf->chan->cb->subbuf_start) - return !relay_buf_full(buf); + return !full; return buf->chan->cb->subbuf_start(buf, subbuf, - prev_subbuf, prev_padding); + prev_subbuf); } /** @@ -298,11 +302,13 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->finalized = 0; buf->data = buf->start; buf->offset = 0; + buf->stats.full_count = 0; + buf->stats.big_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - relay_subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL); } /** @@ -555,9 +561,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) goto toobig; if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; + size_t prev_padding; + + prev_padding = buf->chan->subbuf_size - buf->offset; old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; + buf->padding[old_subbuf] = prev_padding; buf->subbufs_produced++; if (buf->dentry) d_inode(buf->dentry)->i_size += @@ -582,7 +590,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } @@ -595,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) return length; toobig: - buf->chan->last_toobig = length; + buf->stats.big_count++; return 0; } EXPORT_SYMBOL_GPL(relay_switch_subbuf); @@ -655,11 +663,6 @@ void relay_close(struct rchan *chan) if ((buf = *per_cpu_ptr(chan->buf, i))) relay_close_buf(buf); - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%zd) > sub-buffer size (%zd)]\n", - chan->last_toobig, chan->subbuf_size); - list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); @@ -694,6 +697,42 @@ void relay_flush(struct rchan *chan) EXPORT_SYMBOL_GPL(relay_flush); /** + * relay_stats - get channel buffer statistics + * @chan: the channel + * @flags: select particular information to get + * + * Returns the count of certain field that caller specifies. + */ +size_t relay_stats(struct rchan *chan, int flags) +{ + unsigned int i, count = 0; + struct rchan_buf *rbuf; + + if (!chan || flags > RELAY_STATS_LAST) + return 0; + + if (chan->is_global) { + rbuf = *per_cpu_ptr(chan->buf, 0); + if (flags & RELAY_STATS_BUF_FULL) + count = rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count = rbuf->stats.big_count; + } else { + for_each_online_cpu(i) { + rbuf = *per_cpu_ptr(chan->buf, i); + if (rbuf) { + if (flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count += rbuf->stats.big_count; + } + } + } + + return count; +} + +/** * relay_file_open - open file op for relay files * @inode: the inode * @filp: the file diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 47168d2afbf1..6941145b5058 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -415,9 +415,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; + size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL); char buf[16]; - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + snprintf(buf, sizeof(buf), "%zu\n", dropped); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } @@ -456,23 +457,6 @@ static const struct file_operations blk_msg_fops = { .llseek = noop_llseek, }; -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); @@ -491,7 +475,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename, } static const struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; @@ -580,7 +563,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } bt->dev = dev; - atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; diff --git a/kernel/ucount.c b/kernel/ucount.c index 8686e329b8f2..586af49fc03e 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -199,18 +199,16 @@ void put_ucounts(struct ucounts *ucounts) } } -static inline bool atomic_long_inc_below(atomic_long_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, long u) { - long c, old; - c = atomic_long_read(v); - for (;;) { + long c = atomic_long_read(v); + + do { if (unlikely(c >= u)) return false; - old = atomic_long_cmpxchg(v, c, c+1); - if (likely(old == c)) - return true; - c = old; - } + } while (!atomic_long_try_cmpxchg(v, &c, c+1)); + + return true; } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, |