summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/panthor/panthor_sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/panthor/panthor_sched.c')
-rw-r--r--drivers/gpu/drm/panthor/panthor_sched.c390
1 files changed, 338 insertions, 52 deletions
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index 9929e22f4d8d..ef4bec7ff9c7 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -93,6 +93,9 @@
#define MIN_CSGS 3
#define MAX_CSG_PRIO 0xf
+#define NUM_INSTRS_PER_CACHE_LINE (64 / sizeof(u64))
+#define MAX_INSTRS_PER_JOB 24
+
struct panthor_group;
/**
@@ -137,8 +140,6 @@ enum panthor_csg_priority {
* non-real-time groups. When such a group becomes executable,
* it will evict the group with the lowest non-rt priority if
* there's no free group slot available.
- *
- * Currently not exposed to userspace.
*/
PANTHOR_CSG_PRIORITY_RT,
@@ -476,6 +477,18 @@ struct panthor_queue {
*/
struct list_head in_flight_jobs;
} fence_ctx;
+
+ /** @profiling: Job profiling data slots and access information. */
+ struct {
+ /** @slots: Kernel BO holding the slots. */
+ struct panthor_kernel_bo *slots;
+
+ /** @slot_count: Number of jobs ringbuffer can hold at once. */
+ u32 slot_count;
+
+ /** @seqno: Index of the next available profiling information slot. */
+ u32 seqno;
+ } profiling;
};
/**
@@ -605,6 +618,18 @@ struct panthor_group {
*/
struct panthor_kernel_bo *syncobjs;
+ /** @fdinfo: Per-file total cycle and timestamp values reference. */
+ struct {
+ /** @data: Total sampled values for jobs in queues from this group. */
+ struct panthor_gpu_usage data;
+
+ /**
+ * @lock: Mutex to govern concurrent access from drm file's fdinfo callback
+ * and job post-completion processing function
+ */
+ struct mutex lock;
+ } fdinfo;
+
/** @state: Group state. */
enum panthor_group_state state;
@@ -662,6 +687,18 @@ struct panthor_group {
struct list_head wait_node;
};
+struct panthor_job_profiling_data {
+ struct {
+ u64 before;
+ u64 after;
+ } cycles;
+
+ struct {
+ u64 before;
+ u64 after;
+ } time;
+};
+
/**
* group_queue_work() - Queue a group work
* @group: Group to queue the work for.
@@ -775,6 +812,15 @@ struct panthor_job {
/** @done_fence: Fence signaled when the job is finished or cancelled. */
struct dma_fence *done_fence;
+
+ /** @profiling: Job profiling information. */
+ struct {
+ /** @mask: Current device job profiling enablement bitmask. */
+ u32 mask;
+
+ /** @slot: Job index in the profiling slots BO. */
+ u32 slot;
+ } profiling;
};
static void
@@ -839,6 +885,7 @@ static void group_free_queue(struct panthor_group *group, struct panthor_queue *
panthor_kernel_bo_destroy(queue->ringbuf);
panthor_kernel_bo_destroy(queue->iface.mem);
+ panthor_kernel_bo_destroy(queue->profiling.slots);
/* Release the last_fence we were holding, if any. */
dma_fence_put(queue->fence_ctx.last_fence);
@@ -853,6 +900,8 @@ static void group_release_work(struct work_struct *work)
release_work);
u32 i;
+ mutex_destroy(&group->fdinfo.lock);
+
for (i = 0; i < group->queue_count; i++)
group_free_queue(group, group->queues[i]);
@@ -1989,8 +2038,6 @@ tick_ctx_init(struct panthor_scheduler *sched,
}
}
-#define NUM_INSTRS_PER_SLOT 16
-
static void
group_term_post_processing(struct panthor_group *group)
{
@@ -2546,7 +2593,7 @@ static void queue_start(struct panthor_queue *queue)
list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
job->base.s_fence->parent = dma_fence_get(job->done_fence);
- drm_sched_start(&queue->scheduler);
+ drm_sched_start(&queue->scheduler, 0);
}
static void panthor_group_stop(struct panthor_group *group)
@@ -2790,6 +2837,41 @@ void panthor_sched_post_reset(struct panthor_device *ptdev, bool reset_failed)
}
}
+static void update_fdinfo_stats(struct panthor_job *job)
+{
+ struct panthor_group *group = job->group;
+ struct panthor_queue *queue = group->queues[job->queue_idx];
+ struct panthor_gpu_usage *fdinfo = &group->fdinfo.data;
+ struct panthor_job_profiling_data *slots = queue->profiling.slots->kmap;
+ struct panthor_job_profiling_data *data = &slots[job->profiling.slot];
+
+ mutex_lock(&group->fdinfo.lock);
+ if (job->profiling.mask & PANTHOR_DEVICE_PROFILING_CYCLES)
+ fdinfo->cycles += data->cycles.after - data->cycles.before;
+ if (job->profiling.mask & PANTHOR_DEVICE_PROFILING_TIMESTAMP)
+ fdinfo->time += data->time.after - data->time.before;
+ mutex_unlock(&group->fdinfo.lock);
+}
+
+void panthor_fdinfo_gather_group_samples(struct panthor_file *pfile)
+{
+ struct panthor_group_pool *gpool = pfile->groups;
+ struct panthor_group *group;
+ unsigned long i;
+
+ if (IS_ERR_OR_NULL(gpool))
+ return;
+
+ xa_for_each(&gpool->xa, i, group) {
+ mutex_lock(&group->fdinfo.lock);
+ pfile->stats.cycles += group->fdinfo.data.cycles;
+ pfile->stats.time += group->fdinfo.data.time;
+ group->fdinfo.data.cycles = 0;
+ group->fdinfo.data.time = 0;
+ mutex_unlock(&group->fdinfo.lock);
+ }
+}
+
static void group_sync_upd_work(struct work_struct *work)
{
struct panthor_group *group =
@@ -2822,6 +2904,8 @@ static void group_sync_upd_work(struct work_struct *work)
dma_fence_end_signalling(cookie);
list_for_each_entry_safe(job, job_tmp, &done_jobs, node) {
+ if (job->profiling.mask)
+ update_fdinfo_stats(job);
list_del_init(&job->node);
panthor_job_put(&job->base);
}
@@ -2829,65 +2913,198 @@ static void group_sync_upd_work(struct work_struct *work)
group_put(group);
}
-static struct dma_fence *
-queue_run_job(struct drm_sched_job *sched_job)
+struct panthor_job_ringbuf_instrs {
+ u64 buffer[MAX_INSTRS_PER_JOB];
+ u32 count;
+};
+
+struct panthor_job_instr {
+ u32 profile_mask;
+ u64 instr;
+};
+
+#define JOB_INSTR(__prof, __instr) \
+ { \
+ .profile_mask = __prof, \
+ .instr = __instr, \
+ }
+
+static void
+copy_instrs_to_ringbuf(struct panthor_queue *queue,
+ struct panthor_job *job,
+ struct panthor_job_ringbuf_instrs *instrs)
+{
+ u64 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
+ u64 start = job->ringbuf.start & (ringbuf_size - 1);
+ u64 size, written;
+
+ /*
+ * We need to write a whole slot, including any trailing zeroes
+ * that may come at the end of it. Also, because instrs.buffer has
+ * been zero-initialised, there's no need to pad it with 0's
+ */
+ instrs->count = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
+ size = instrs->count * sizeof(u64);
+ WARN_ON(size > ringbuf_size);
+ written = min(ringbuf_size - start, size);
+
+ memcpy(queue->ringbuf->kmap + start, instrs->buffer, written);
+
+ if (written < size)
+ memcpy(queue->ringbuf->kmap,
+ &instrs->buffer[written / sizeof(u64)],
+ size - written);
+}
+
+struct panthor_job_cs_params {
+ u32 profile_mask;
+ u64 addr_reg; u64 val_reg;
+ u64 cycle_reg; u64 time_reg;
+ u64 sync_addr; u64 times_addr;
+ u64 cs_start; u64 cs_size;
+ u32 last_flush; u32 waitall_mask;
+};
+
+static void
+get_job_cs_params(struct panthor_job *job, struct panthor_job_cs_params *params)
{
- struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
struct panthor_group *group = job->group;
struct panthor_queue *queue = group->queues[job->queue_idx];
struct panthor_device *ptdev = group->ptdev;
struct panthor_scheduler *sched = ptdev->scheduler;
- u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf);
- u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1);
- u64 addr_reg = ptdev->csif_info.cs_reg_count -
- ptdev->csif_info.unpreserved_cs_reg_count;
- u64 val_reg = addr_reg + 2;
- u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
- job->queue_idx * sizeof(struct panthor_syncobj_64b);
- u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
- struct dma_fence *done_fence;
- int ret;
- u64 call_instrs[NUM_INSTRS_PER_SLOT] = {
- /* MOV32 rX+2, cs.latest_flush */
- (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush,
+ params->addr_reg = ptdev->csif_info.cs_reg_count -
+ ptdev->csif_info.unpreserved_cs_reg_count;
+ params->val_reg = params->addr_reg + 2;
+ params->cycle_reg = params->addr_reg;
+ params->time_reg = params->val_reg;
- /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
- (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233,
+ params->sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) +
+ job->queue_idx * sizeof(struct panthor_syncobj_64b);
+ params->times_addr = panthor_kernel_bo_gpuva(queue->profiling.slots) +
+ (job->profiling.slot * sizeof(struct panthor_job_profiling_data));
+ params->waitall_mask = GENMASK(sched->sb_slot_count - 1, 0);
- /* MOV48 rX:rX+1, cs.start */
- (1ull << 56) | (addr_reg << 48) | job->call_info.start,
+ params->cs_start = job->call_info.start;
+ params->cs_size = job->call_info.size;
+ params->last_flush = job->call_info.latest_flush;
- /* MOV32 rX+2, cs.size */
- (2ull << 56) | (val_reg << 48) | job->call_info.size,
+ params->profile_mask = job->profiling.mask;
+}
- /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
- (3ull << 56) | (1 << 16),
+#define JOB_INSTR_ALWAYS(instr) \
+ JOB_INSTR(PANTHOR_DEVICE_PROFILING_DISABLED, (instr))
+#define JOB_INSTR_TIMESTAMP(instr) \
+ JOB_INSTR(PANTHOR_DEVICE_PROFILING_TIMESTAMP, (instr))
+#define JOB_INSTR_CYCLES(instr) \
+ JOB_INSTR(PANTHOR_DEVICE_PROFILING_CYCLES, (instr))
+static void
+prepare_job_instrs(const struct panthor_job_cs_params *params,
+ struct panthor_job_ringbuf_instrs *instrs)
+{
+ const struct panthor_job_instr instr_seq[] = {
+ /* MOV32 rX+2, cs.latest_flush */
+ JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->last_flush),
+ /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */
+ JOB_INSTR_ALWAYS((36ull << 56) | (0ull << 48) | (params->val_reg << 40) |
+ (0 << 16) | 0x233),
+ /* MOV48 rX:rX+1, cycles_offset */
+ JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
+ (params->times_addr +
+ offsetof(struct panthor_job_profiling_data, cycles.before))),
+ /* STORE_STATE cycles */
+ JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
+ /* MOV48 rX:rX+1, time_offset */
+ JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) |
+ (params->times_addr +
+ offsetof(struct panthor_job_profiling_data, time.before))),
+ /* STORE_STATE timer */
+ JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
+ /* MOV48 rX:rX+1, cs.start */
+ JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->cs_start),
+ /* MOV32 rX+2, cs.size */
+ JOB_INSTR_ALWAYS((2ull << 56) | (params->val_reg << 48) | params->cs_size),
+ /* WAIT(0) => waits for FLUSH_CACHE2 instruction */
+ JOB_INSTR_ALWAYS((3ull << 56) | (1 << 16)),
/* CALL rX:rX+1, rX+2 */
- (32ull << 56) | (addr_reg << 40) | (val_reg << 32),
-
+ JOB_INSTR_ALWAYS((32ull << 56) | (params->addr_reg << 40) |
+ (params->val_reg << 32)),
+ /* MOV48 rX:rX+1, cycles_offset */
+ JOB_INSTR_CYCLES((1ull << 56) | (params->cycle_reg << 48) |
+ (params->times_addr +
+ offsetof(struct panthor_job_profiling_data, cycles.after))),
+ /* STORE_STATE cycles */
+ JOB_INSTR_CYCLES((40ull << 56) | (params->cycle_reg << 40) | (1ll << 32)),
+ /* MOV48 rX:rX+1, time_offset */
+ JOB_INSTR_TIMESTAMP((1ull << 56) | (params->time_reg << 48) |
+ (params->times_addr +
+ offsetof(struct panthor_job_profiling_data, time.after))),
+ /* STORE_STATE timer */
+ JOB_INSTR_TIMESTAMP((40ull << 56) | (params->time_reg << 40) | (0ll << 32)),
/* MOV48 rX:rX+1, sync_addr */
- (1ull << 56) | (addr_reg << 48) | sync_addr,
-
+ JOB_INSTR_ALWAYS((1ull << 56) | (params->addr_reg << 48) | params->sync_addr),
/* MOV48 rX+2, #1 */
- (1ull << 56) | (val_reg << 48) | 1,
-
+ JOB_INSTR_ALWAYS((1ull << 56) | (params->val_reg << 48) | 1),
/* WAIT(all) */
- (3ull << 56) | (waitall_mask << 16),
-
+ JOB_INSTR_ALWAYS((3ull << 56) | (params->waitall_mask << 16)),
/* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/
- (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1,
+ JOB_INSTR_ALWAYS((51ull << 56) | (0ull << 48) | (params->addr_reg << 40) |
+ (params->val_reg << 32) | (0 << 16) | 1),
+ /* ERROR_BARRIER, so we can recover from faults at job boundaries. */
+ JOB_INSTR_ALWAYS((47ull << 56)),
+ };
+ u32 pad;
- /* ERROR_BARRIER, so we can recover from faults at job
- * boundaries.
- */
- (47ull << 56),
+ instrs->count = 0;
+
+ /* NEED to be cacheline aligned to please the prefetcher. */
+ static_assert(sizeof(instrs->buffer) % 64 == 0,
+ "panthor_job_ringbuf_instrs::buffer is not aligned on a cacheline");
+
+ /* Make sure we have enough storage to store the whole sequence. */
+ static_assert(ALIGN(ARRAY_SIZE(instr_seq), NUM_INSTRS_PER_CACHE_LINE) ==
+ ARRAY_SIZE(instrs->buffer),
+ "instr_seq vs panthor_job_ringbuf_instrs::buffer size mismatch");
+
+ for (u32 i = 0; i < ARRAY_SIZE(instr_seq); i++) {
+ /* If the profile mask of this instruction is not enabled, skip it. */
+ if (instr_seq[i].profile_mask &&
+ !(instr_seq[i].profile_mask & params->profile_mask))
+ continue;
+
+ instrs->buffer[instrs->count++] = instr_seq[i].instr;
+ }
+
+ pad = ALIGN(instrs->count, NUM_INSTRS_PER_CACHE_LINE);
+ memset(&instrs->buffer[instrs->count], 0,
+ (pad - instrs->count) * sizeof(instrs->buffer[0]));
+ instrs->count = pad;
+}
+
+static u32 calc_job_credits(u32 profile_mask)
+{
+ struct panthor_job_ringbuf_instrs instrs;
+ struct panthor_job_cs_params params = {
+ .profile_mask = profile_mask,
};
- /* Need to be cacheline aligned to please the prefetcher. */
- static_assert(sizeof(call_instrs) % 64 == 0,
- "call_instrs is not aligned on a cacheline");
+ prepare_job_instrs(&params, &instrs);
+ return instrs.count;
+}
+
+static struct dma_fence *
+queue_run_job(struct drm_sched_job *sched_job)
+{
+ struct panthor_job *job = container_of(sched_job, struct panthor_job, base);
+ struct panthor_group *group = job->group;
+ struct panthor_queue *queue = group->queues[job->queue_idx];
+ struct panthor_device *ptdev = group->ptdev;
+ struct panthor_scheduler *sched = ptdev->scheduler;
+ struct panthor_job_ringbuf_instrs instrs;
+ struct panthor_job_cs_params cs_params;
+ struct dma_fence *done_fence;
+ int ret;
/* Stream size is zero, nothing to do except making sure all previously
* submitted jobs are done before we signal the
@@ -2914,17 +3131,23 @@ queue_run_job(struct drm_sched_job *sched_job)
queue->fence_ctx.id,
atomic64_inc_return(&queue->fence_ctx.seqno));
- memcpy(queue->ringbuf->kmap + ringbuf_insert,
- call_instrs, sizeof(call_instrs));
+ job->profiling.slot = queue->profiling.seqno++;
+ if (queue->profiling.seqno == queue->profiling.slot_count)
+ queue->profiling.seqno = 0;
+
+ job->ringbuf.start = queue->iface.input->insert;
+
+ get_job_cs_params(job, &cs_params);
+ prepare_job_instrs(&cs_params, &instrs);
+ copy_instrs_to_ringbuf(queue, job, &instrs);
+
+ job->ringbuf.end = job->ringbuf.start + (instrs.count * sizeof(u64));
panthor_job_get(&job->base);
spin_lock(&queue->fence_ctx.lock);
list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs);
spin_unlock(&queue->fence_ctx.lock);
- job->ringbuf.start = queue->iface.input->insert;
- job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs);
-
/* Make sure the ring buffer is updated before the INSERT
* register.
*/
@@ -3017,6 +3240,33 @@ static const struct drm_sched_backend_ops panthor_queue_sched_ops = {
.free_job = queue_free_job,
};
+static u32 calc_profiling_ringbuf_num_slots(struct panthor_device *ptdev,
+ u32 cs_ringbuf_size)
+{
+ u32 min_profiled_job_instrs = U32_MAX;
+ u32 last_flag = fls(PANTHOR_DEVICE_PROFILING_ALL);
+
+ /*
+ * We want to calculate the minimum size of a profiled job's CS,
+ * because since they need additional instructions for the sampling
+ * of performance metrics, they might take up further slots in
+ * the queue's ringbuffer. This means we might not need as many job
+ * slots for keeping track of their profiling information. What we
+ * need is the maximum number of slots we should allocate to this end,
+ * which matches the maximum number of profiled jobs we can place
+ * simultaneously in the queue's ring buffer.
+ * That has to be calculated separately for every single job profiling
+ * flag, but not in the case job profiling is disabled, since unprofiled
+ * jobs don't need to keep track of this at all.
+ */
+ for (u32 i = 0; i < last_flag; i++) {
+ min_profiled_job_instrs =
+ min(min_profiled_job_instrs, calc_job_credits(BIT(i)));
+ }
+
+ return DIV_ROUND_UP(cs_ringbuf_size, min_profiled_job_instrs * sizeof(u64));
+}
+
static struct panthor_queue *
group_create_queue(struct panthor_group *group,
const struct drm_panthor_queue_create *args)
@@ -3070,9 +3320,35 @@ group_create_queue(struct panthor_group *group,
goto err_free_queue;
}
+ queue->profiling.slot_count =
+ calc_profiling_ringbuf_num_slots(group->ptdev, args->ringbuf_size);
+
+ queue->profiling.slots =
+ panthor_kernel_bo_create(group->ptdev, group->vm,
+ queue->profiling.slot_count *
+ sizeof(struct panthor_job_profiling_data),
+ DRM_PANTHOR_BO_NO_MMAP,
+ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC |
+ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED,
+ PANTHOR_VM_KERNEL_AUTO_VA);
+
+ if (IS_ERR(queue->profiling.slots)) {
+ ret = PTR_ERR(queue->profiling.slots);
+ goto err_free_queue;
+ }
+
+ ret = panthor_kernel_bo_vmap(queue->profiling.slots);
+ if (ret)
+ goto err_free_queue;
+
+ /*
+ * Credit limit argument tells us the total number of instructions
+ * across all CS slots in the ringbuffer, with some jobs requiring
+ * twice as many as others, depending on their profiling status.
+ */
ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops,
group->ptdev->scheduler->wq, 1,
- args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)),
+ args->ringbuf_size / sizeof(u64),
0, msecs_to_jiffies(JOB_TIMEOUT_MS),
group->ptdev->reset.wq,
NULL, "panthor-queue", group->ptdev->base.dev);
@@ -3213,6 +3489,8 @@ int panthor_group_create(struct panthor_file *pfile,
}
mutex_unlock(&sched->reset.lock);
+ mutex_init(&group->fdinfo.lock);
+
return gid;
err_put_group:
@@ -3380,6 +3658,7 @@ panthor_job_create(struct panthor_file *pfile,
{
struct panthor_group_pool *gpool = pfile->groups;
struct panthor_job *job;
+ u32 credits;
int ret;
if (qsubmit->pad)
@@ -3438,9 +3717,16 @@ panthor_job_create(struct panthor_file *pfile,
}
}
+ job->profiling.mask = pfile->ptdev->profile_mask;
+ credits = calc_job_credits(job->profiling.mask);
+ if (credits == 0) {
+ ret = -EINVAL;
+ goto err_put_job;
+ }
+
ret = drm_sched_job_init(&job->base,
&job->group->queues[job->queue_idx]->entity,
- 1, job->group);
+ credits, job->group);
if (ret)
goto err_put_job;