Diffstat (limited to 'drivers/gpu/drm/v3d/v3d_sched.c')
-rw-r--r--	drivers/gpu/drm/v3d/v3d_sched.c	397
1 file changed, 375 insertions, 22 deletions
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 038e1ae589c7..54015ad765c7 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -18,12 +18,17 @@
  * semaphores to interlock between them.
  */

+#include <linux/sched/clock.h>
 #include <linux/kthread.h>

+#include <drm/drm_syncobj.h>
+
 #include "v3d_drv.h"
 #include "v3d_regs.h"
 #include "v3d_trace.h"

+#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
+
 static struct v3d_job *
 to_v3d_job(struct drm_sched_job *sched_job)
 {
@@ -54,6 +59,12 @@ to_csd_job(struct drm_sched_job *sched_job)
 	return container_of(sched_job, struct v3d_csd_job, base.base);
 }

+static struct v3d_cpu_job *
+to_cpu_job(struct drm_sched_job *sched_job)
+{
+	return container_of(sched_job, struct v3d_cpu_job, base.base);
+}
+
 static void
 v3d_sched_job_free(struct drm_sched_job *sched_job)
 {
@@ -63,6 +74,28 @@ v3d_sched_job_free(struct drm_sched_job *sched_job)
 }

 static void
+v3d_cpu_job_free(struct drm_sched_job *sched_job)
+{
+	struct v3d_cpu_job *job = to_cpu_job(sched_job);
+	struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
+	struct v3d_performance_query_info *performance_query = &job->performance_query;
+
+	if (timestamp_query->queries) {
+		for (int i = 0; i < timestamp_query->count; i++)
+			drm_syncobj_put(timestamp_query->queries[i].syncobj);
+		kvfree(timestamp_query->queries);
+	}
+
+	if (performance_query->queries) {
+		for (int i = 0; i < performance_query->count; i++)
+			drm_syncobj_put(performance_query->queries[i].syncobj);
+		kvfree(performance_query->queries);
+	}
+
+	v3d_job_cleanup(&job->base);
+}
+
+static void
 v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
 {
 	if (job->perfmon != v3d->active_perfmon)
@@ -76,6 +109,7 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
 {
 	struct v3d_bin_job *job = to_bin_job(sched_job);
 	struct v3d_dev *v3d = job->base.v3d;
+	struct v3d_file_priv *file = job->base.file->driver_priv;
 	struct drm_device *dev = &v3d->drm;
 	struct dma_fence *fence;
 	unsigned long irqflags;
@@ -107,6 +141,9 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
 	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
 			    job->start, job->end);

+	file->start_ns[V3D_BIN] = local_clock();
+	v3d->queue[V3D_BIN].start_ns = file->start_ns[V3D_BIN];
+
 	v3d_switch_perfmon(v3d, &job->base);

 	/* Set the current and end address of the control list.
@@ -131,6 +168,7 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
 {
 	struct v3d_render_job *job = to_render_job(sched_job);
 	struct v3d_dev *v3d = job->base.v3d;
+	struct v3d_file_priv *file = job->base.file->driver_priv;
 	struct drm_device *dev = &v3d->drm;
 	struct dma_fence *fence;

@@ -158,6 +196,9 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
 	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
 			    job->start, job->end);

+	file->start_ns[V3D_RENDER] = local_clock();
+	v3d->queue[V3D_RENDER].start_ns = file->start_ns[V3D_RENDER];
+
 	v3d_switch_perfmon(v3d, &job->base);

 	/* XXX: Set the QCFG */
@@ -176,6 +217,7 @@ v3d_tfu_job_run(struct drm_sched_job *sched_job)
 {
 	struct v3d_tfu_job *job = to_tfu_job(sched_job);
 	struct v3d_dev *v3d = job->base.v3d;
+	struct v3d_file_priv *file = job->base.file->driver_priv;
 	struct drm_device *dev = &v3d->drm;
 	struct dma_fence *fence;

@@ -190,20 +232,25 @@ v3d_tfu_job_run(struct drm_sched_job *sched_job)

 	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);

-	V3D_WRITE(V3D_TFU_IIA, job->args.iia);
-	V3D_WRITE(V3D_TFU_IIS, job->args.iis);
-	V3D_WRITE(V3D_TFU_ICA, job->args.ica);
-	V3D_WRITE(V3D_TFU_IUA, job->args.iua);
-	V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
-	V3D_WRITE(V3D_TFU_IOS, job->args.ios);
-	V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
-	if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
-		V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
-		V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
-		V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
+	file->start_ns[V3D_TFU] = local_clock();
+	v3d->queue[V3D_TFU].start_ns = file->start_ns[V3D_TFU];
+
+	V3D_WRITE(V3D_TFU_IIA(v3d->ver), job->args.iia);
+	V3D_WRITE(V3D_TFU_IIS(v3d->ver), job->args.iis);
+	V3D_WRITE(V3D_TFU_ICA(v3d->ver), job->args.ica);
+	V3D_WRITE(V3D_TFU_IUA(v3d->ver), job->args.iua);
+	V3D_WRITE(V3D_TFU_IOA(v3d->ver), job->args.ioa);
+	if (v3d->ver >= 71)
+		V3D_WRITE(V3D_V7_TFU_IOC, job->args.v71.ioc);
+	V3D_WRITE(V3D_TFU_IOS(v3d->ver), job->args.ios);
+	V3D_WRITE(V3D_TFU_COEF0(v3d->ver), job->args.coef[0]);
+	if (v3d->ver >= 71 || (job->args.coef[0] & V3D_TFU_COEF0_USECOEF)) {
+		V3D_WRITE(V3D_TFU_COEF1(v3d->ver), job->args.coef[1]);
+		V3D_WRITE(V3D_TFU_COEF2(v3d->ver), job->args.coef[2]);
+		V3D_WRITE(V3D_TFU_COEF3(v3d->ver), job->args.coef[3]);
 	}
 	/* ICFG kicks off the job. */
-	V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);
+	V3D_WRITE(V3D_TFU_ICFG(v3d->ver), job->args.icfg | V3D_TFU_ICFG_IOC);

 	return fence;
 }
@@ -213,9 +260,10 @@ v3d_csd_job_run(struct drm_sched_job *sched_job)
 {
 	struct v3d_csd_job *job = to_csd_job(sched_job);
 	struct v3d_dev *v3d = job->base.v3d;
+	struct v3d_file_priv *file = job->base.file->driver_priv;
 	struct drm_device *dev = &v3d->drm;
 	struct dma_fence *fence;
-	int i;
+	int i, csd_cfg0_reg, csd_cfg_reg_count;

 	v3d->csd_job = job;

@@ -231,24 +279,314 @@ v3d_csd_job_run(struct drm_sched_job *sched_job)

 	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);

+	file->start_ns[V3D_CSD] = local_clock();
+	v3d->queue[V3D_CSD].start_ns = file->start_ns[V3D_CSD];
+
 	v3d_switch_perfmon(v3d, &job->base);

-	for (i = 1; i <= 6; i++)
-		V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
+	csd_cfg0_reg = V3D_CSD_QUEUED_CFG0(v3d->ver);
+	csd_cfg_reg_count = v3d->ver < 71 ? 6 : 7;
+	for (i = 1; i <= csd_cfg_reg_count; i++)
+		V3D_CORE_WRITE(0, csd_cfg0_reg + 4 * i, job->args.cfg[i]);

 	/* CFG0 write kicks off the job. */
-	V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]);
+	V3D_CORE_WRITE(0, csd_cfg0_reg, job->args.cfg[0]);

 	return fence;
 }

+static void
+v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job)
+{
+	struct v3d_indirect_csd_info *indirect_csd = &job->indirect_csd;
+	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
+	struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect);
+	struct drm_v3d_submit_csd *args = &indirect_csd->job->args;
+	u32 *wg_counts;
+
+	v3d_get_bo_vaddr(bo);
+	v3d_get_bo_vaddr(indirect);
+
+	wg_counts = (uint32_t *)(bo->vaddr + indirect_csd->offset);
+
+	if (wg_counts[0] == 0 || wg_counts[1] == 0 || wg_counts[2] == 0)
+		return;
+
+	args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+	args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+	args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+	args->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
+		       (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
+
+	for (int i = 0; i < 3; i++) {
+		/* 0xffffffff indicates that the uniform rewrite is not needed */
+		if (indirect_csd->wg_uniform_offsets[i] != 0xffffffff) {
+			u32 uniform_idx = indirect_csd->wg_uniform_offsets[i];
+			((uint32_t *)indirect->vaddr)[uniform_idx] = wg_counts[i];
+		}
+	}
+
+	v3d_put_bo_vaddr(indirect);
+	v3d_put_bo_vaddr(bo);
+}
+
+static void
+v3d_timestamp_query(struct v3d_cpu_job *job)
+{
+	struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
+	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
+	u8 *value_addr;
+
+	v3d_get_bo_vaddr(bo);
+
+	for (int i = 0; i < timestamp_query->count; i++) {
+		value_addr = ((u8 *)bo->vaddr) + timestamp_query->queries[i].offset;
+		*((u64 *)value_addr) = i == 0 ? ktime_get_ns() : 0ull;
+
+		drm_syncobj_replace_fence(timestamp_query->queries[i].syncobj,
+					  job->base.done_fence);
+	}
+
+	v3d_put_bo_vaddr(bo);
+}
+
+static void
+v3d_reset_timestamp_queries(struct v3d_cpu_job *job)
+{
+	struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
+	struct v3d_timestamp_query *queries = timestamp_query->queries;
+	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
+	u8 *value_addr;
+
+	v3d_get_bo_vaddr(bo);
+
+	for (int i = 0; i < timestamp_query->count; i++) {
+		value_addr = ((u8 *)bo->vaddr) + queries[i].offset;
+		*((u64 *)value_addr) = 0;
+
+		drm_syncobj_replace_fence(queries[i].syncobj, NULL);
+	}
+
+	v3d_put_bo_vaddr(bo);
+}
+
+static void
+write_to_buffer(void *dst, u32 idx, bool do_64bit, u64 value)
+{
+	if (do_64bit) {
+		u64 *dst64 = (u64 *)dst;
+
+		dst64[idx] = value;
+	} else {
+		u32 *dst32 = (u32 *)dst;
+
+		dst32[idx] = (u32)value;
+	}
+}
+
+static void
+v3d_copy_query_results(struct v3d_cpu_job *job)
+{
+	struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
+	struct v3d_timestamp_query *queries = timestamp_query->queries;
+	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
+	struct v3d_bo *timestamp = to_v3d_bo(job->base.bo[1]);
+	struct v3d_copy_query_results_info *copy = &job->copy;
+	struct dma_fence *fence;
+	u8 *query_addr;
+	bool available, write_result;
+	u8 *data;
+	int i;
+
+	v3d_get_bo_vaddr(bo);
+	v3d_get_bo_vaddr(timestamp);
+
+	data = ((u8 *)bo->vaddr) + copy->offset;
+
+	for (i = 0; i < timestamp_query->count; i++) {
+		fence = drm_syncobj_fence_get(queries[i].syncobj);
+		available = fence ? dma_fence_is_signaled(fence) : false;
+
+		write_result = available || copy->do_partial;
+		if (write_result) {
+			query_addr = ((u8 *)timestamp->vaddr) + queries[i].offset;
+			write_to_buffer(data, 0, copy->do_64bit, *((u64 *)query_addr));
+		}
+
+		if (copy->availability_bit)
+			write_to_buffer(data, 1, copy->do_64bit, available ? 1u : 0u);
+
+		data += copy->stride;
+
+		dma_fence_put(fence);
+	}
+
+	v3d_put_bo_vaddr(timestamp);
+	v3d_put_bo_vaddr(bo);
+}
+
+static void
+v3d_reset_performance_queries(struct v3d_cpu_job *job)
+{
+	struct v3d_performance_query_info *performance_query = &job->performance_query;
+	struct v3d_file_priv *v3d_priv = job->base.file->driver_priv;
+	struct v3d_dev *v3d = job->base.v3d;
+	struct v3d_perfmon *perfmon;
+
+	for (int i = 0; i < performance_query->count; i++) {
+		for (int j = 0; j < performance_query->nperfmons; j++) {
+			perfmon = v3d_perfmon_find(v3d_priv,
+						   performance_query->queries[i].kperfmon_ids[j]);
+			if (!perfmon) {
+				DRM_DEBUG("Failed to find perfmon.");
+				continue;
+			}
+
+			v3d_perfmon_stop(v3d, perfmon, false);
+
+			memset(perfmon->values, 0, perfmon->ncounters * sizeof(u64));
+
+			v3d_perfmon_put(perfmon);
+		}
+
+		drm_syncobj_replace_fence(performance_query->queries[i].syncobj, NULL);
+	}
+}
+
+static void
+v3d_write_performance_query_result(struct v3d_cpu_job *job, void *data, u32 query)
+{
+	struct v3d_performance_query_info *performance_query = &job->performance_query;
+	struct v3d_copy_query_results_info *copy = &job->copy;
+	struct v3d_file_priv *v3d_priv = job->base.file->driver_priv;
+	struct v3d_dev *v3d = job->base.v3d;
+	struct v3d_perfmon *perfmon;
+	u64 counter_values[V3D_PERFCNT_NUM];
+
+	for (int i = 0; i < performance_query->nperfmons; i++) {
+		perfmon = v3d_perfmon_find(v3d_priv,
+					   performance_query->queries[query].kperfmon_ids[i]);
+		if (!perfmon) {
+			DRM_DEBUG("Failed to find perfmon.");
+			continue;
+		}
+
+		v3d_perfmon_stop(v3d, perfmon, true);
+
+		memcpy(&counter_values[i * DRM_V3D_MAX_PERF_COUNTERS], perfmon->values,
+		       perfmon->ncounters * sizeof(u64));
+
+		v3d_perfmon_put(perfmon);
+	}
+
+	for (int i = 0; i < performance_query->ncounters; i++)
+		write_to_buffer(data, i, copy->do_64bit, counter_values[i]);
+}
+
+static void
+v3d_copy_performance_query(struct v3d_cpu_job *job)
+{
+	struct v3d_performance_query_info *performance_query = &job->performance_query;
+	struct v3d_copy_query_results_info *copy = &job->copy;
+	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
+	struct dma_fence *fence;
+	bool available, write_result;
+	u8 *data;
+
+	v3d_get_bo_vaddr(bo);
+
+	data = ((u8 *)bo->vaddr) + copy->offset;
+
+	for (int i = 0; i < performance_query->count; i++) {
+		fence = drm_syncobj_fence_get(performance_query->queries[i].syncobj);
+		available = fence ? dma_fence_is_signaled(fence) : false;
+
+		write_result = available || copy->do_partial;
+		if (write_result)
+			v3d_write_performance_query_result(job, data, i);
+
+		if (copy->availability_bit)
+			write_to_buffer(data, performance_query->ncounters,
+					copy->do_64bit, available ? 1u : 0u);
+
+		data += copy->stride;
+
+		dma_fence_put(fence);
+	}
+
+	v3d_put_bo_vaddr(bo);
+}
+
+static const v3d_cpu_job_fn cpu_job_function[] = {
+	[V3D_CPU_JOB_TYPE_INDIRECT_CSD] = v3d_rewrite_csd_job_wg_counts_from_indirect,
+	[V3D_CPU_JOB_TYPE_TIMESTAMP_QUERY] = v3d_timestamp_query,
+	[V3D_CPU_JOB_TYPE_RESET_TIMESTAMP_QUERY] = v3d_reset_timestamp_queries,
+	[V3D_CPU_JOB_TYPE_COPY_TIMESTAMP_QUERY] = v3d_copy_query_results,
+	[V3D_CPU_JOB_TYPE_RESET_PERFORMANCE_QUERY] = v3d_reset_performance_queries,
+	[V3D_CPU_JOB_TYPE_COPY_PERFORMANCE_QUERY] = v3d_copy_performance_query,
+};
+
+static struct dma_fence *
+v3d_cpu_job_run(struct drm_sched_job *sched_job)
+{
+	struct v3d_cpu_job *job = to_cpu_job(sched_job);
+	struct v3d_dev *v3d = job->base.v3d;
+	struct v3d_file_priv *file = job->base.file->driver_priv;
+	u64 runtime;
+
+	v3d->cpu_job = job;
+
+	if (job->job_type >= ARRAY_SIZE(cpu_job_function)) {
+		DRM_DEBUG_DRIVER("Unknown CPU job: %d\n", job->job_type);
+		return NULL;
+	}
+
+	file->start_ns[V3D_CPU] = local_clock();
+	v3d->queue[V3D_CPU].start_ns = file->start_ns[V3D_CPU];
+
+	trace_v3d_cpu_job_begin(&v3d->drm, job->job_type);
+
+	cpu_job_function[job->job_type](job);
+
+	trace_v3d_cpu_job_end(&v3d->drm, job->job_type);
+
+	runtime = local_clock() - file->start_ns[V3D_CPU];
+
+	file->enabled_ns[V3D_CPU] += runtime;
+	v3d->queue[V3D_CPU].enabled_ns += runtime;
+
+	file->jobs_sent[V3D_CPU]++;
+	v3d->queue[V3D_CPU].jobs_sent++;
+
+	file->start_ns[V3D_CPU] = 0;
+	v3d->queue[V3D_CPU].start_ns = 0;
+
+	return NULL;
+}
+
 static struct dma_fence *
 v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
 {
 	struct v3d_job *job = to_v3d_job(sched_job);
 	struct v3d_dev *v3d = job->v3d;
+	struct v3d_file_priv *file = job->file->driver_priv;
+	u64 runtime;
+
+	file->start_ns[V3D_CACHE_CLEAN] = local_clock();
+	v3d->queue[V3D_CACHE_CLEAN].start_ns = file->start_ns[V3D_CACHE_CLEAN];

 	v3d_clean_caches(v3d);

+	runtime = local_clock() - file->start_ns[V3D_CACHE_CLEAN];
+
+	file->enabled_ns[V3D_CACHE_CLEAN] += runtime;
+	v3d->queue[V3D_CACHE_CLEAN].enabled_ns += runtime;
+
+	file->jobs_sent[V3D_CACHE_CLEAN]++;
+	v3d->queue[V3D_CACHE_CLEAN].jobs_sent++;
+
+	file->start_ns[V3D_CACHE_CLEAN] = 0;
+	v3d->queue[V3D_CACHE_CLEAN].start_ns = 0;
+
 	return NULL;
 }
@@ -336,7 +674,7 @@ v3d_csd_job_timedout(struct drm_sched_job *sched_job)
 {
 	struct v3d_csd_job *job = to_csd_job(sched_job);
 	struct v3d_dev *v3d = job->base.v3d;
-	u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4);
+	u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4(v3d->ver));

 	/* If we've made progress, skip reset and let the timer get
 	 * rearmed.
@@ -379,6 +717,12 @@ static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
 	.free_job = v3d_sched_job_free
 };

+static const struct drm_sched_backend_ops v3d_cpu_sched_ops = {
+	.run_job = v3d_cpu_job_run,
+	.timedout_job = v3d_generic_job_timedout,
+	.free_job = v3d_cpu_job_free
+};
+
 int
 v3d_sched_init(struct v3d_dev *v3d)
 {
@@ -388,7 +732,7 @@ v3d_sched_init(struct v3d_dev *v3d)
 	int ret;

 	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
-			     &v3d_bin_sched_ops,
+			     &v3d_bin_sched_ops, NULL,
 			     DRM_SCHED_PRIORITY_COUNT,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms), NULL,
@@ -397,7 +741,7 @@ v3d_sched_init(struct v3d_dev *v3d)
 		return ret;

 	ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
-			     &v3d_render_sched_ops,
+			     &v3d_render_sched_ops, NULL,
 			     DRM_SCHED_PRIORITY_COUNT,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms), NULL,
@@ -406,7 +750,7 @@ v3d_sched_init(struct v3d_dev *v3d)
 		goto fail;

 	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
-			     &v3d_tfu_sched_ops,
+			     &v3d_tfu_sched_ops, NULL,
 			     DRM_SCHED_PRIORITY_COUNT,
 			     hw_jobs_limit, job_hang_limit,
 			     msecs_to_jiffies(hang_limit_ms), NULL,
@@ -416,7 +760,7 @@ v3d_sched_init(struct v3d_dev *v3d)

 	if (v3d_has_csd(v3d)) {
 		ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
-				     &v3d_csd_sched_ops,
+				     &v3d_csd_sched_ops, NULL,
 				     DRM_SCHED_PRIORITY_COUNT,
 				     hw_jobs_limit, job_hang_limit,
 				     msecs_to_jiffies(hang_limit_ms), NULL,
@@ -425,7 +769,7 @@ v3d_sched_init(struct v3d_dev *v3d)
 			goto fail;

 		ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
-				     &v3d_cache_clean_sched_ops,
+				     &v3d_cache_clean_sched_ops, NULL,
 				     DRM_SCHED_PRIORITY_COUNT,
 				     hw_jobs_limit, job_hang_limit,
 				     msecs_to_jiffies(hang_limit_ms), NULL,
@@ -434,6 +778,15 @@ v3d_sched_init(struct v3d_dev *v3d)
 			goto fail;
 	}

+	ret = drm_sched_init(&v3d->queue[V3D_CPU].sched,
+			     &v3d_cpu_sched_ops, NULL,
+			     DRM_SCHED_PRIORITY_COUNT,
+			     1, job_hang_limit,
+			     msecs_to_jiffies(hang_limit_ms), NULL,
+			     NULL, "v3d_cpu", v3d->drm.dev);
+	if (ret)
+		goto fail;
+
 	return 0;

 fail:
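
A note on the accounting pattern: every run() callback above records local_clock() into both the per-file (v3d_file_priv) and per-device (v3d->queue[]) stats before kicking the job. Reduced to its essentials, the pattern looks like the sketch below; the struct is a simplified stand-in for illustration, not the driver's actual definition.

#include <linux/sched/clock.h>	/* local_clock() */
#include <linux/types.h>

/* Simplified stand-in for the per-file and per-queue stats fields. */
struct queue_stats {
	u64 start_ns;	/* nonzero while a job is on the queue */
	u64 enabled_ns;	/* total busy time accumulated so far */
	u64 jobs_sent;	/* number of jobs run to completion */
};

static void stats_job_begin(struct queue_stats *s)
{
	s->start_ns = local_clock();
}

static void stats_job_end(struct queue_stats *s)
{
	s->enabled_ns += local_clock() - s->start_ns;
	s->jobs_sent++;
	s->start_ns = 0;	/* mark the queue idle again */
}

For the CPU and cache-clean queues both halves of the pattern run inside run_job(), because those jobs complete synchronously; for the hardware queues only the begin half appears in this file, and the matching end-of-job accounting presumably lives in the interrupt path, outside this diff.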
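The workgroup-count math in v3d_rewrite_csd_job_wg_counts_from_indirect() is compact: cfg[0..2] carry the per-dimension workgroup count in the upper 16 bits (hence V3D_CSD_CFG012_WG_COUNT_SHIFT), and cfg[4] carries the total number of 16-invocation batches minus one. The helper below is an illustrative rewrite with made-up example inputs, not the driver's code.

#include <linux/kernel.h>	/* DIV_ROUND_UP() */

#define WG_COUNT_SHIFT 16	/* mirrors V3D_CSD_CFG012_WG_COUNT_SHIFT */

/* Recompute the CSD dispatch registers from workgroup counts, as the
 * indirect-CSD rewrite above does.  Example inputs (made up):
 * wg_size = 64 invocations per workgroup, wg_counts = {4, 2, 1}.
 */
static void fill_csd_cfg(u32 cfg[5], const u32 wg_counts[3], u32 wg_size)
{
	/* cfg[0..2]: per-dimension workgroup count in bits [31:16] */
	cfg[0] = wg_counts[0] << WG_COUNT_SHIFT;
	cfg[1] = wg_counts[1] << WG_COUNT_SHIFT;
	cfg[2] = wg_counts[2] << WG_COUNT_SHIFT;

	/* cfg[4]: total 16-invocation batches, minus one.  For the
	 * example: DIV_ROUND_UP(64, 16) * (4 * 2 * 1) - 1 = 31.
	 */
	cfg[4] = DIV_ROUND_UP(wg_size, 16) *
		 (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
}

Note that the patch leaves cfg[3] untouched and returns early when any dimension read from the indirect buffer is zero, since a zero-sized dispatch has nothing to run.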
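v3d_cpu_job_run() dispatches through a designated-initializer function table indexed by job type, guarded by an ARRAY_SIZE() bounds check. The same idiom in isolation is sketched below; the enum, handlers, and the extra NULL-slot check are illustrative additions, not the driver's code.

#include <linux/kernel.h>	/* ARRAY_SIZE() */

enum cpu_job_type { JOB_A, JOB_B, JOB_TYPE_COUNT };

typedef void (*cpu_job_fn)(void *job);

static void run_job_a(void *job) { /* ... */ }
static void run_job_b(void *job) { /* ... */ }

/* Function table indexed by job type; unlisted types are NULL. */
static const cpu_job_fn job_table[] = {
	[JOB_A] = run_job_a,
	[JOB_B] = run_job_b,
};

static void dispatch(enum cpu_job_type type, void *job)
{
	/* Reject types past the table, and (unlike the patch, which
	 * has no sparse slots) guard against NULL entries too.
	 */
	if (type >= ARRAY_SIZE(job_table) || !job_table[type])
		return;

	job_table[type](job);
}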
