Diffstat (limited to 'drivers/gpu/drm/i915/i915_gpu_error.c')
-rw-r--r--  drivers/gpu/drm/i915/i915_gpu_error.c | 1184
1 file changed, 647 insertions, 537 deletions
| diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index fda0977d2059..730129ca4c17 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -41,6 +41,7 @@  #include "gem/i915_gem_context.h"  #include "gem/i915_gem_lmem.h" +#include "gt/intel_gt_pm.h"  #include "i915_drv.h"  #include "i915_gpu_error.h" @@ -232,14 +233,13 @@ static void pool_free(struct pagevec *pv, void *addr)  #ifdef CONFIG_DRM_I915_COMPRESS_ERROR -struct compress { +struct i915_vma_compress {  	struct pagevec pool;  	struct z_stream_s zstream;  	void *tmp; -	bool wc;  }; -static bool compress_init(struct compress *c) +static bool compress_init(struct i915_vma_compress *c)  {  	struct z_stream_s *zstream = &c->zstream; @@ -261,7 +261,7 @@ static bool compress_init(struct compress *c)  	return true;  } -static bool compress_start(struct compress *c) +static bool compress_start(struct i915_vma_compress *c)  {  	struct z_stream_s *zstream = &c->zstream;  	void *workspace = zstream->workspace; @@ -272,8 +272,8 @@ static bool compress_start(struct compress *c)  	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;  } -static void *compress_next_page(struct compress *c, -				struct drm_i915_error_object *dst) +static void *compress_next_page(struct i915_vma_compress *c, +				struct i915_vma_coredump *dst)  {  	void *page; @@ -287,14 +287,15 @@ static void *compress_next_page(struct compress *c,  	return dst->pages[dst->page_count++] = page;  } -static int compress_page(struct compress *c, +static int compress_page(struct i915_vma_compress *c,  			 void *src, -			 struct drm_i915_error_object *dst) +			 struct i915_vma_coredump *dst, +			 bool wc)  {  	struct z_stream_s *zstream = &c->zstream;  	zstream->next_in = src; -	if (c->wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) +	if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))  		zstream->next_in = c->tmp;  	zstream->avail_in = PAGE_SIZE; @@ -318,8 +319,8 @@ static int compress_page(struct compress *c,  	return 0;  } -static int compress_flush(struct compress *c, -			  struct drm_i915_error_object *dst) +static int compress_flush(struct i915_vma_compress *c, +			  struct i915_vma_coredump *dst)  {  	struct z_stream_s *zstream = &c->zstream; @@ -347,12 +348,12 @@ end:  	return 0;  } -static void compress_finish(struct compress *c) +static void compress_finish(struct i915_vma_compress *c)  {  	zlib_deflateEnd(&c->zstream);  } -static void compress_fini(struct compress *c) +static void compress_fini(struct i915_vma_compress *c)  {  	kfree(c->zstream.workspace);  	if (c->tmp) @@ -367,24 +368,24 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)  #else -struct compress { +struct i915_vma_compress {  	struct pagevec pool; -	bool wc;  }; -static bool compress_init(struct compress *c) +static bool compress_init(struct i915_vma_compress *c)  {  	return pool_init(&c->pool, ALLOW_FAIL) == 0;  } -static bool compress_start(struct compress *c) +static bool compress_start(struct i915_vma_compress *c)  {  	return true;  } -static int compress_page(struct compress *c, +static int compress_page(struct i915_vma_compress *c,  			 void *src, -			 struct drm_i915_error_object *dst) +			 struct i915_vma_coredump *dst, +			 bool wc)  {  	void *ptr; @@ -392,24 +393,24 @@ static int compress_page(struct compress *c,  	if (!ptr)  		return -ENOMEM; -	if (!(c->wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) +	if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) 
 		memcpy(ptr, src, PAGE_SIZE);  	dst->pages[dst->page_count++] = ptr;  	return 0;  } -static int compress_flush(struct compress *c, -			  struct drm_i915_error_object *dst) +static int compress_flush(struct i915_vma_compress *c, +			  struct i915_vma_coredump *dst)  {  	return 0;  } -static void compress_finish(struct compress *c) +static void compress_finish(struct i915_vma_compress *c)  {  } -static void compress_fini(struct compress *c) +static void compress_fini(struct i915_vma_compress *c)  {  	pool_fini(&c->pool);  } @@ -422,7 +423,7 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)  #endif  static void error_print_instdone(struct drm_i915_error_state_buf *m, -				 const struct drm_i915_error_engine *ee) +				 const struct intel_engine_coredump *ee)  {  	const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu;  	int slice; @@ -453,40 +454,56 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,  static void error_print_request(struct drm_i915_error_state_buf *m,  				const char *prefix, -				const struct drm_i915_error_request *erq, -				const unsigned long epoch) +				const struct i915_request_coredump *erq)  {  	if (!erq->seqno)  		return; -	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", +	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, start %08x, head %08x, tail %08x\n",  		   prefix, erq->pid, erq->context, erq->seqno,  		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,  			    &erq->flags) ? "!" : "",  		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,  			    &erq->flags) ? "+" : "",  		   erq->sched_attr.priority, -		   jiffies_to_msecs(erq->jiffies - epoch),  		   erq->start, erq->head, erq->tail);  }  static void error_print_context(struct drm_i915_error_state_buf *m,  				const char *header, -				const struct drm_i915_error_context *ctx) +				const struct i915_gem_context_coredump *ctx)  {  	err_printf(m, "%s%s[%d] prio %d, guilty %d active %d\n",  		   header, ctx->comm, ctx->pid, ctx->sched_attr.priority,  		   ctx->guilty, ctx->active);  } +static struct i915_vma_coredump * +__find_vma(struct i915_vma_coredump *vma, const char *name) +{ +	while (vma) { +		if (strcmp(vma->name, name) == 0) +			return vma; +		vma = vma->next; +	} + +	return NULL; +} + +static struct i915_vma_coredump * +find_batch(const struct intel_engine_coredump *ee) +{ +	return __find_vma(ee->vma, "batch"); +} +  static void error_print_engine(struct drm_i915_error_state_buf *m, -			       const struct drm_i915_error_engine *ee, -			       const unsigned long epoch) +			       const struct intel_engine_coredump *ee)  { +	struct i915_vma_coredump *batch;  	int n;  	err_printf(m, "%s command stream:\n", ee->engine->name); -	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle)); +	err_printf(m, "  CCID:  0x%08x\n", ee->ccid);  	err_printf(m, "  START: 0x%08x\n", ee->start);  	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);  	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n", @@ -501,9 +518,10 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,  	error_print_instdone(m, ee); -	if (ee->batchbuffer) { -		u64 start = ee->batchbuffer->gtt_offset; -		u64 end = start + ee->batchbuffer->gtt_size; +	batch = find_batch(ee); +	if (batch) { +		u64 start = batch->gtt_offset; +		u64 end = start + batch->gtt_size;  		err_printf(m, "  batch: [0x%08x_%08x, 0x%08x_%08x]\n",  			   upper_32_bits(start), lower_32_bits(start), @@ -541,7 +559,7 @@ static void error_print_engine(struct 
drm_i915_error_state_buf *m,  	for (n = 0; n < ee->num_ports; n++) {  		err_printf(m, "  ELSP[%d]:", n); -		error_print_request(m, " ", &ee->execlist[n], epoch); +		error_print_request(m, " ", &ee->execlist[n]);  	}  	error_print_context(m, "  Active context: ", &ee->context); @@ -556,38 +574,35 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)  	va_end(args);  } -static void print_error_obj(struct drm_i915_error_state_buf *m, +static void print_error_vma(struct drm_i915_error_state_buf *m,  			    const struct intel_engine_cs *engine, -			    const char *name, -			    const struct drm_i915_error_object *obj) +			    const struct i915_vma_coredump *vma)  {  	char out[ASCII85_BUFSZ];  	int page; -	if (!obj) +	if (!vma)  		return; -	if (name) { -		err_printf(m, "%s --- %s = 0x%08x %08x\n", -			   engine ? engine->name : "global", name, -			   upper_32_bits(obj->gtt_offset), -			   lower_32_bits(obj->gtt_offset)); -	} +	err_printf(m, "%s --- %s = 0x%08x %08x\n", +		   engine ? engine->name : "global", vma->name, +		   upper_32_bits(vma->gtt_offset), +		   lower_32_bits(vma->gtt_offset)); -	if (obj->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K) -		err_printf(m, "gtt_page_sizes = 0x%08x\n", obj->gtt_page_sizes); +	if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K) +		err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes);  	err_compression_marker(m); -	for (page = 0; page < obj->page_count; page++) { +	for (page = 0; page < vma->page_count; page++) {  		int i, len;  		len = PAGE_SIZE; -		if (page == obj->page_count - 1) -			len -= obj->unused; +		if (page == vma->page_count - 1) +			len -= vma->unused;  		len = ascii85_encode_len(len);  		for (i = 0; i < len; i++) -			err_puts(m, ascii85_encode(obj->pages[page][i], out)); +			err_puts(m, ascii85_encode(vma->pages[page][i], out));  	}  	err_puts(m, "\n");  } @@ -626,18 +641,13 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m,  }  static void err_print_uc(struct drm_i915_error_state_buf *m, -			 const struct i915_error_uc *error_uc) +			 const struct intel_uc_coredump *error_uc)  {  	struct drm_printer p = i915_error_printer(m); -	const struct i915_gpu_state *error = -		container_of(error_uc, typeof(*error), uc); - -	if (!error->device_info.has_gt_uc) -		return;  	intel_uc_fw_dump(&error_uc->guc_fw, &p);  	intel_uc_fw_dump(&error_uc->huc_fw, &p); -	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log); +	print_error_vma(m, NULL, error_uc->guc_log);  }  static void err_free_sgl(struct scatterlist *sgl) @@ -657,12 +667,78 @@ static void err_free_sgl(struct scatterlist *sgl)  	}  } +static void err_print_gt(struct drm_i915_error_state_buf *m, +			 struct intel_gt_coredump *gt) +{ +	const struct intel_engine_coredump *ee; +	int i, j; + +	err_printf(m, "GT awake: %s\n", yesno(gt->awake)); +	err_printf(m, "EIR: 0x%08x\n", gt->eir); +	err_printf(m, "IER: 0x%08x\n", gt->ier); +	for (i = 0; i < gt->ngtier; i++) +		err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]); +	err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er); +	err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake); +	err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr); + +	for (i = 0; i < gt->nfence; i++) +		err_printf(m, "  fence[%d] = %08llx\n", i, gt->fence[i]); + +	if (IS_GEN_RANGE(m->i915, 6, 11)) { +		err_printf(m, "ERROR: 0x%08x\n", gt->error); +		err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg); +	} + +	if (INTEL_GEN(m->i915) >= 8) +		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", +			   gt->fault_data1, gt->fault_data0); + +	if 
(IS_GEN(m->i915, 7)) +		err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int); + +	if (IS_GEN_RANGE(m->i915, 8, 11)) +		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache); + +	if (IS_GEN(m->i915, 12)) +		err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err); + +	if (INTEL_GEN(m->i915) >= 12) { +		int i; + +		for (i = 0; i < GEN12_SFC_DONE_MAX; i++) +			err_printf(m, "  SFC_DONE[%d]: 0x%08x\n", i, +				   gt->sfc_done[i]); + +		err_printf(m, "  GAM_DONE: 0x%08x\n", gt->gam_done); +	} + +	for (ee = gt->engine; ee; ee = ee->next) { +		const struct i915_vma_coredump *vma; + +		error_print_engine(m, ee); + +		for (vma = ee->vma; vma; vma = vma->next) +			print_error_vma(m, ee->engine, vma); + +		if (ee->num_requests) { +			err_printf(m, "%s --- %d requests\n", +				   ee->engine->name, +				   ee->num_requests); +			for (j = 0; j < ee->num_requests; j++) +				error_print_request(m, " ", &ee->requests[j]); +		} +	} + +	if (gt->uc) +		err_print_uc(m, gt->uc); +} +  static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, -			       struct i915_gpu_state *error) +			       struct i915_gpu_coredump *error)  { -	const struct drm_i915_error_engine *ee; +	const struct intel_engine_coredump *ee;  	struct timespec64 ts; -	int i, j;  	if (*error->error_msg)  		err_printf(m, "%s\n", error->error_msg); @@ -682,7 +758,7 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,  	err_printf(m, "Capture: %lu jiffies; %d ms ago\n",  		   error->capture, jiffies_to_msecs(jiffies - error->capture)); -	for (ee = error->engine; ee; ee = ee->next) +	for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next)  		err_printf(m, "Active process (on ring %s): %s [%d]\n",  			   ee->engine->name,  			   ee->context.comm, @@ -708,90 +784,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,  			   CSR_VERSION_MINOR(csr->version));  	} -	err_printf(m, "GT awake: %s\n", yesno(error->awake));  	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));  	err_printf(m, "PM suspended: %s\n", yesno(error->suspended)); -	err_printf(m, "EIR: 0x%08x\n", error->eir); -	err_printf(m, "IER: 0x%08x\n", error->ier); -	for (i = 0; i < error->ngtier; i++) -		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); -	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); -	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); -	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); -	err_printf(m, "CCID: 0x%08x\n", error->ccid); - -	for (i = 0; i < error->nfence; i++) -		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]); - -	if (IS_GEN_RANGE(m->i915, 6, 11)) { -		err_printf(m, "ERROR: 0x%08x\n", error->error); -		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); -	} - -	if (INTEL_GEN(m->i915) >= 8) -		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", -			   error->fault_data1, error->fault_data0); - -	if (IS_GEN(m->i915, 7)) -		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); - -	if (IS_GEN_RANGE(m->i915, 8, 11)) -		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", error->gtt_cache); - -	if (IS_GEN(m->i915, 12)) -		err_printf(m, "AUX_ERR_DBG: 0x%08x\n", error->aux_err); - -	if (INTEL_GEN(m->i915) >= 12) { -		int i; - -		for (i = 0; i < GEN12_SFC_DONE_MAX; i++) -			err_printf(m, "  SFC_DONE[%d]: 0x%08x\n", i, -				   error->sfc_done[i]); - -		err_printf(m, "  GAM_DONE: 0x%08x\n", error->gam_done); -	} - -	for (ee = error->engine; ee; ee = ee->next) -		error_print_engine(m, ee, error->capture); - -	for (ee = error->engine; ee; ee = ee->next) { -		const struct drm_i915_error_object *obj; - -		
obj = ee->batchbuffer; -		if (obj) { -			err_puts(m, ee->engine->name); -			if (ee->context.pid) -				err_printf(m, " (submitted by %s [%d])", -					   ee->context.comm, -					   ee->context.pid); -			err_printf(m, " --- gtt_offset = 0x%08x %08x\n", -				   upper_32_bits(obj->gtt_offset), -				   lower_32_bits(obj->gtt_offset)); -			print_error_obj(m, ee->engine, NULL, obj); -		} - -		for (j = 0; j < ee->user_bo_count; j++) -			print_error_obj(m, ee->engine, "user", ee->user_bo[j]); - -		if (ee->num_requests) { -			err_printf(m, "%s --- %d requests\n", -				   ee->engine->name, -				   ee->num_requests); -			for (j = 0; j < ee->num_requests; j++) -				error_print_request(m, " ", -						    &ee->requests[j], -						    error->capture); -		} -		print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer); -		print_error_obj(m, ee->engine, "HW Status", ee->hws_page); -		print_error_obj(m, ee->engine, "HW context", ee->ctx); -		print_error_obj(m, ee->engine, "WA context", ee->wa_ctx); -		print_error_obj(m, ee->engine, -				"WA batchbuffer", ee->wa_batchbuffer); -		print_error_obj(m, ee->engine, -				"NULL context", ee->default_state); -	} +	if (error->gt) +		err_print_gt(m, error->gt);  	if (error->overlay)  		intel_overlay_print_error_state(m, error->overlay); @@ -802,10 +799,9 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,  	err_print_capabilities(m, &error->device_info, &error->runtime_info,  			       &error->driver_caps);  	err_print_params(m, &error->params); -	err_print_uc(m, &error->uc);  } -static int err_print_to_sgl(struct i915_gpu_state *error) +static int err_print_to_sgl(struct i915_gpu_coredump *error)  {  	struct drm_i915_error_state_buf m; @@ -842,8 +838,8 @@ static int err_print_to_sgl(struct i915_gpu_state *error)  	return 0;  } -ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, -				      char *buf, loff_t off, size_t rem) +ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error, +					 char *buf, loff_t off, size_t rem)  {  	struct scatterlist *sg;  	size_t count; @@ -906,85 +902,89 @@ ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,  	return count;  } -static void i915_error_object_free(struct drm_i915_error_object *obj) +static void i915_vma_coredump_free(struct i915_vma_coredump *vma)  { -	int page; - -	if (obj == NULL) -		return; +	while (vma) { +		struct i915_vma_coredump *next = vma->next; +		int page; -	for (page = 0; page < obj->page_count; page++) -		free_page((unsigned long)obj->pages[page]); +		for (page = 0; page < vma->page_count; page++) +			free_page((unsigned long)vma->pages[page]); -	kfree(obj); +		kfree(vma); +		vma = next; +	}  } - -static void cleanup_params(struct i915_gpu_state *error) +static void cleanup_params(struct i915_gpu_coredump *error)  {  	i915_params_free(&error->params);  } -static void cleanup_uc_state(struct i915_gpu_state *error) +static void cleanup_uc(struct intel_uc_coredump *uc)  { -	struct i915_error_uc *error_uc = &error->uc; +	kfree(uc->guc_fw.path); +	kfree(uc->huc_fw.path); +	i915_vma_coredump_free(uc->guc_log); -	kfree(error_uc->guc_fw.path); -	kfree(error_uc->huc_fw.path); -	i915_error_object_free(error_uc->guc_log); +	kfree(uc);  } -void __i915_gpu_state_free(struct kref *error_ref) +static void cleanup_gt(struct intel_gt_coredump *gt)  { -	struct i915_gpu_state *error = -		container_of(error_ref, typeof(*error), ref); -	long i; +	while (gt->engine) { +		struct intel_engine_coredump *ee = gt->engine; + +		gt->engine = ee->next; -	while 
(error->engine) { -		struct drm_i915_error_engine *ee = error->engine; +		i915_vma_coredump_free(ee->vma); +		kfree(ee->requests); +		kfree(ee); +	} -		error->engine = ee->next; +	if (gt->uc) +		cleanup_uc(gt->uc); -		for (i = 0; i < ee->user_bo_count; i++) -			i915_error_object_free(ee->user_bo[i]); -		kfree(ee->user_bo); +	kfree(gt); +} -		i915_error_object_free(ee->batchbuffer); -		i915_error_object_free(ee->wa_batchbuffer); -		i915_error_object_free(ee->ringbuffer); -		i915_error_object_free(ee->hws_page); -		i915_error_object_free(ee->ctx); -		i915_error_object_free(ee->wa_ctx); +void __i915_gpu_coredump_free(struct kref *error_ref) +{ +	struct i915_gpu_coredump *error = +		container_of(error_ref, typeof(*error), ref); -		kfree(ee->requests); -		kfree(ee); +	while (error->gt) { +		struct intel_gt_coredump *gt = error->gt; + +		error->gt = gt->next; +		cleanup_gt(gt);  	}  	kfree(error->overlay);  	kfree(error->display);  	cleanup_params(error); -	cleanup_uc_state(error);  	err_free_sgl(error->sgl);  	kfree(error);  } -static struct drm_i915_error_object * -i915_error_object_create(struct drm_i915_private *i915, -			 struct i915_vma *vma, -			 struct compress *compress) +static struct i915_vma_coredump * +i915_vma_coredump_create(const struct intel_gt *gt, +			 const struct i915_vma *vma, +			 const char *name, +			 struct i915_vma_compress *compress)  { -	struct i915_ggtt *ggtt = &i915->ggtt; +	struct i915_ggtt *ggtt = gt->ggtt;  	const u64 slot = ggtt->error_capture.start; -	struct drm_i915_error_object *dst; +	struct i915_vma_coredump *dst;  	unsigned long num_pages;  	struct sgt_iter iter;  	int ret;  	might_sleep(); -	if (!vma || !vma->pages) +	if (!vma || !vma->pages || !compress)  		return NULL;  	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT; @@ -998,6 +998,9 @@ i915_error_object_create(struct drm_i915_private *i915,  		return NULL;  	} +	strcpy(dst->name, name); +	dst->next = NULL; +  	dst->gtt_offset = vma->node.start;  	dst->gtt_size = vma->node.size;  	dst->gtt_page_sizes = vma->page_sizes.gtt; @@ -1005,9 +1008,6 @@ i915_error_object_create(struct drm_i915_private *i915,  	dst->page_count = 0;  	dst->unused = 0; -	compress->wc = i915_gem_object_is_lmem(vma->obj) || -		       drm_mm_node_allocated(&ggtt->error_capture); -  	ret = -EINVAL;  	if (drm_mm_node_allocated(&ggtt->error_capture)) {  		void __iomem *s; @@ -1016,9 +1016,12 @@ i915_error_object_create(struct drm_i915_private *i915,  		for_each_sgt_daddr(dma, iter, vma->pages) {  			ggtt->vm.insert_page(&ggtt->vm, dma, slot,  					     I915_CACHE_NONE, 0); +			mb();  			s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE); -			ret = compress_page(compress, (void  __force *)s, dst); +			ret = compress_page(compress, +					    (void  __force *)s, dst, +					    true);  			io_mapping_unmap(s);  			if (ret)  				break; @@ -1031,7 +1034,9 @@ i915_error_object_create(struct drm_i915_private *i915,  			void __iomem *s;  			s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE); -			ret = compress_page(compress, (void __force *)s, dst); +			ret = compress_page(compress, +					    (void __force *)s, dst, +					    true);  			io_mapping_unmap(s);  			if (ret)  				break; @@ -1045,7 +1050,7 @@ i915_error_object_create(struct drm_i915_private *i915,  			drm_clflush_pages(&page, 1);  			s = kmap(page); -			ret = compress_page(compress, s, dst); +			ret = compress_page(compress, s, dst, false);  			kunmap(page);  			drm_clflush_pages(&page, 1); @@ -1066,77 +1071,56 @@ i915_error_object_create(struct 
drm_i915_private *i915,  	return dst;  } -/* - * Generate a semi-unique error code. The code is not meant to have meaning, The - * code's only purpose is to try to prevent false duplicated bug reports by - * grossly estimating a GPU error state. - * - * TODO Ideally, hashing the batchbuffer would be a very nice way to determine - * the hang if we could strip the GTT offset information from it. - * - * It's only a small step better than a random number in its current form. - */ -static u32 i915_error_generate_code(struct i915_gpu_state *error) +static void gt_record_fences(struct intel_gt_coredump *gt)  { -	const struct drm_i915_error_engine *ee = error->engine; - -	/* -	 * IPEHR would be an ideal way to detect errors, as it's the gross -	 * measure of "the command that hung." However, has some very common -	 * synchronization commands which almost always appear in the case -	 * strictly a client bug. Use instdone to differentiate those some. -	 */ -	return ee ? ee->ipehr ^ ee->instdone.instdone : 0; -} - -static void gem_record_fences(struct i915_gpu_state *error) -{ -	struct drm_i915_private *dev_priv = error->i915; -	struct intel_uncore *uncore = &dev_priv->uncore; +	struct i915_ggtt *ggtt = gt->_gt->ggtt; +	struct intel_uncore *uncore = gt->_gt->uncore;  	int i; -	if (INTEL_GEN(dev_priv) >= 6) { -		for (i = 0; i < dev_priv->ggtt.num_fences; i++) -			error->fence[i] = +	if (INTEL_GEN(uncore->i915) >= 6) { +		for (i = 0; i < ggtt->num_fences; i++) +			gt->fence[i] =  				intel_uncore_read64(uncore,  						    FENCE_REG_GEN6_LO(i)); -	} else if (INTEL_GEN(dev_priv) >= 4) { -		for (i = 0; i < dev_priv->ggtt.num_fences; i++) -			error->fence[i] = +	} else if (INTEL_GEN(uncore->i915) >= 4) { +		for (i = 0; i < ggtt->num_fences; i++) +			gt->fence[i] =  				intel_uncore_read64(uncore,  						    FENCE_REG_965_LO(i));  	} else { -		for (i = 0; i < dev_priv->ggtt.num_fences; i++) -			error->fence[i] = +		for (i = 0; i < ggtt->num_fences; i++) +			gt->fence[i] =  				intel_uncore_read(uncore, FENCE_REG(i));  	} -	error->nfence = i; +	gt->nfence = i;  } -static void error_record_engine_registers(struct i915_gpu_state *error, -					  struct intel_engine_cs *engine, -					  struct drm_i915_error_engine *ee) +static void engine_record_registers(struct intel_engine_coredump *ee)  { -	struct drm_i915_private *dev_priv = engine->i915; +	const struct intel_engine_cs *engine = ee->engine; +	struct drm_i915_private *i915 = engine->i915; -	if (INTEL_GEN(dev_priv) >= 6) { +	if (INTEL_GEN(i915) >= 6) {  		ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); -		if (INTEL_GEN(dev_priv) >= 12) -			ee->fault_reg = I915_READ(GEN12_RING_FAULT_REG); -		else if (INTEL_GEN(dev_priv) >= 8) -			ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG); +		if (INTEL_GEN(i915) >= 12) +			ee->fault_reg = intel_uncore_read(engine->uncore, +							  GEN12_RING_FAULT_REG); +		else if (INTEL_GEN(i915) >= 8) +			ee->fault_reg = intel_uncore_read(engine->uncore, +							  GEN8_RING_FAULT_REG);  		else  			ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);  	} -	if (INTEL_GEN(dev_priv) >= 4) { +	if (INTEL_GEN(i915) >= 4) {  		ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);  		ee->ipeir = ENGINE_READ(engine, RING_IPEIR);  		ee->ipehr = ENGINE_READ(engine, RING_IPEHR);  		ee->instps = ENGINE_READ(engine, RING_INSTPS);  		ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); -		if (INTEL_GEN(dev_priv) >= 8) { +		ee->ccid = ENGINE_READ(engine, CCID); +		if (INTEL_GEN(i915) >= 8) {  			ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;  			
ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;  		} @@ -1155,13 +1139,13 @@ static void error_record_engine_registers(struct i915_gpu_state *error,  	ee->head = ENGINE_READ(engine, RING_HEAD);  	ee->tail = ENGINE_READ(engine, RING_TAIL);  	ee->ctl = ENGINE_READ(engine, RING_CTL); -	if (INTEL_GEN(dev_priv) > 2) +	if (INTEL_GEN(i915) > 2)  		ee->mode = ENGINE_READ(engine, RING_MI_MODE); -	if (!HWS_NEEDS_PHYSICAL(dev_priv)) { +	if (!HWS_NEEDS_PHYSICAL(i915)) {  		i915_reg_t mmio; -		if (IS_GEN(dev_priv, 7)) { +		if (IS_GEN(i915, 7)) {  			switch (engine->id) {  			default:  				MISSING_CASE(engine->id); @@ -1186,40 +1170,40 @@ static void error_record_engine_registers(struct i915_gpu_state *error,  			mmio = RING_HWS_PGA(engine->mmio_base);  		} -		ee->hws = I915_READ(mmio); +		ee->hws = intel_uncore_read(engine->uncore, mmio);  	} -	ee->idle = intel_engine_is_idle(engine); -	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, -						  engine); +	ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine); -	if (HAS_PPGTT(dev_priv)) { +	if (HAS_PPGTT(i915)) {  		int i;  		ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7); -		if (IS_GEN(dev_priv, 6)) { +		if (IS_GEN(i915, 6)) {  			ee->vm_info.pp_dir_base =  				ENGINE_READ(engine, RING_PP_DIR_BASE_READ); -		} else if (IS_GEN(dev_priv, 7)) { +		} else if (IS_GEN(i915, 7)) {  			ee->vm_info.pp_dir_base =  				ENGINE_READ(engine, RING_PP_DIR_BASE); -		} else if (INTEL_GEN(dev_priv) >= 8) { +		} else if (INTEL_GEN(i915) >= 8) {  			u32 base = engine->mmio_base;  			for (i = 0; i < 4; i++) {  				ee->vm_info.pdp[i] = -					I915_READ(GEN8_RING_PDP_UDW(base, i)); +					intel_uncore_read(engine->uncore, +							  GEN8_RING_PDP_UDW(base, i));  				ee->vm_info.pdp[i] <<= 32;  				ee->vm_info.pdp[i] |= -					I915_READ(GEN8_RING_PDP_LDW(base, i)); +					intel_uncore_read(engine->uncore, +							  GEN8_RING_PDP_LDW(base, i));  			}  		}  	}  }  static void record_request(const struct i915_request *request, -			   struct drm_i915_error_request *erq) +			   struct i915_request_coredump *erq)  {  	const struct i915_gem_context *ctx; @@ -1227,7 +1211,6 @@ static void record_request(const struct i915_request *request,  	erq->context = request->fence.context;  	erq->seqno = request->fence.seqno;  	erq->sched_attr = request->sched.attr; -	erq->jiffies = request->emitted_jiffies;  	erq->start = i915_ggtt_offset(request->ring->vma);  	erq->head = request->head;  	erq->tail = request->tail; @@ -1240,9 +1223,9 @@ static void record_request(const struct i915_request *request,  	rcu_read_unlock();  } -static void engine_record_requests(struct intel_engine_cs *engine, +static void engine_record_requests(const struct intel_engine_cs *engine,  				   struct i915_request *first, -				   struct drm_i915_error_engine *ee) +				   struct intel_engine_coredump *ee)  {  	struct i915_request *request;  	int count; @@ -1288,11 +1271,10 @@ static void engine_record_requests(struct intel_engine_cs *engine,  	ee->num_requests = count;  } -static void error_record_engine_execlists(const struct intel_engine_cs *engine, -					  struct drm_i915_error_engine *ee) +static void engine_record_execlists(struct intel_engine_coredump *ee)  { -	const struct intel_engine_execlists * const execlists = &engine->execlists; -	struct i915_request * const *port = execlists->active; +	const struct intel_engine_execlists * const el = &ee->engine->execlists; +	struct i915_request * const *port = el->active;  	unsigned int n = 0;  	while (*port) @@ -1301,7 
+1283,7 @@ static void error_record_engine_execlists(const struct intel_engine_cs *engine,  	ee->num_ports = n;  } -static bool record_context(struct drm_i915_error_context *e, +static bool record_context(struct i915_gem_context_coredump *e,  			   const struct i915_request *rq)  {  	struct i915_gem_context *ctx; @@ -1334,23 +1316,24 @@ static bool record_context(struct drm_i915_error_context *e,  	return capture;  } -struct capture_vma { -	struct capture_vma *next; -	void **slot; +struct intel_engine_capture_vma { +	struct intel_engine_capture_vma *next; +	struct i915_vma *vma; +	char name[16];  }; -static struct capture_vma * -capture_vma(struct capture_vma *next, +static struct intel_engine_capture_vma * +capture_vma(struct intel_engine_capture_vma *next,  	    struct i915_vma *vma, -	    struct drm_i915_error_object **out) +	    const char *name, +	    gfp_t gfp)  { -	struct capture_vma *c; +	struct intel_engine_capture_vma *c; -	*out = NULL;  	if (!vma)  		return next; -	c = kmalloc(sizeof(*c), ATOMIC_MAYFAIL); +	c = kmalloc(sizeof(*c), gfp);  	if (!c)  		return next; @@ -1359,54 +1342,31 @@ capture_vma(struct capture_vma *next,  		return next;  	} -	c->slot = (void **)out; -	*c->slot = i915_vma_get(vma); +	strcpy(c->name, name); +	c->vma = i915_vma_get(vma);  	c->next = next;  	return c;  } -static struct capture_vma * -request_record_user_bo(struct i915_request *request, -		       struct drm_i915_error_engine *ee, -		       struct capture_vma *capture) +static struct intel_engine_capture_vma * +capture_user(struct intel_engine_capture_vma *capture, +	     const struct i915_request *rq, +	     gfp_t gfp)  {  	struct i915_capture_list *c; -	struct drm_i915_error_object **bo; -	long count, max; - -	max = 0; -	for (c = request->capture_list; c; c = c->next) -		max++; -	if (!max) -		return capture; - -	bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL); -	if (!bo) { -		/* If we can't capture everything, try to capture something. 
*/ -		max = min_t(long, max, PAGE_SIZE / sizeof(*bo)); -		bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL); -	} -	if (!bo) -		return capture; -	count = 0; -	for (c = request->capture_list; c; c = c->next) { -		capture = capture_vma(capture, c->vma, &bo[count]); -		if (++count == max) -			break; -	} - -	ee->user_bo = bo; -	ee->user_bo_count = count; +	for (c = rq->capture_list; c; c = c->next) +		capture = capture_vma(capture, c->vma, "user", gfp);  	return capture;  } -static struct drm_i915_error_object * -capture_object(struct drm_i915_private *dev_priv, +static struct i915_vma_coredump * +capture_object(const struct intel_gt *gt,  	       struct drm_i915_gem_object *obj, -	       struct compress *compress) +	       const char *name, +	       struct i915_vma_compress *compress)  {  	if (obj && i915_gem_object_has_pages(obj)) {  		struct i915_vma fake = { @@ -1416,127 +1376,183 @@ capture_object(struct drm_i915_private *dev_priv,  			.obj = obj,  		}; -		return i915_error_object_create(dev_priv, &fake, compress); +		return i915_vma_coredump_create(gt, &fake, name, compress);  	} else {  		return NULL;  	}  } -static void -gem_record_rings(struct i915_gpu_state *error, struct compress *compress) +static void add_vma(struct intel_engine_coredump *ee, +		    struct i915_vma_coredump *vma)  { -	struct drm_i915_private *i915 = error->i915; -	struct intel_engine_cs *engine; -	struct drm_i915_error_engine *ee; +	if (vma) { +		vma->next = ee->vma; +		ee->vma = vma; +	} +} + +struct intel_engine_coredump * +intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp) +{ +	struct intel_engine_coredump *ee; -	ee = kzalloc(sizeof(*ee), GFP_KERNEL); +	ee = kzalloc(sizeof(*ee), gfp);  	if (!ee) -		return; +		return NULL; -	for_each_uabi_engine(engine, i915) { -		struct capture_vma *capture = NULL; -		struct i915_request *request; -		unsigned long flags; +	ee->engine = engine; -		/* Refill our page pool before entering atomic section */ -		pool_refill(&compress->pool, ALLOW_FAIL); +	engine_record_registers(ee); +	engine_record_execlists(ee); -		spin_lock_irqsave(&engine->active.lock, flags); -		request = intel_engine_find_active_request(engine); -		if (!request) { -			spin_unlock_irqrestore(&engine->active.lock, flags); -			continue; -		} +	return ee; +} -		error->simulated |= record_context(&ee->context, request); +struct intel_engine_capture_vma * +intel_engine_coredump_add_request(struct intel_engine_coredump *ee, +				  struct i915_request *rq, +				  gfp_t gfp) +{ +	struct intel_engine_capture_vma *vma = NULL; -		/* -		 * We need to copy these to an anonymous buffer -		 * as the simplest method to avoid being overwritten -		 * by userspace. -		 */ -		capture = capture_vma(capture, -				      request->batch, -				      &ee->batchbuffer); +	ee->simulated |= record_context(&ee->context, rq); +	if (ee->simulated) +		return NULL; -		if (HAS_BROKEN_CS_TLB(i915)) -			capture = capture_vma(capture, -					      engine->gt->scratch, -					      &ee->wa_batchbuffer); +	/* +	 * We need to copy these to an anonymous buffer +	 * as the simplest method to avoid being overwritten +	 * by userspace. 
+	 */ +	vma = capture_vma(vma, rq->batch, "batch", gfp); +	vma = capture_user(vma, rq, gfp); +	vma = capture_vma(vma, rq->ring->vma, "ring", gfp); +	vma = capture_vma(vma, rq->context->state, "HW context", gfp); +	if (HAS_BROKEN_CS_TLB(rq->i915)) +		vma = capture_vma(vma, ee->engine->gt->scratch, "WA batch", gfp); -		capture = request_record_user_bo(request, ee, capture); +	ee->cpu_ring_head = rq->ring->head; +	ee->cpu_ring_tail = rq->ring->tail; -		capture = capture_vma(capture, -				      request->context->state, -				      &ee->ctx); +	ee->rq_head = rq->head; +	ee->rq_post = rq->postfix; +	ee->rq_tail = rq->tail; -		capture = capture_vma(capture, -				      request->ring->vma, -				      &ee->ringbuffer); +	return vma; +} -		ee->cpu_ring_head = request->ring->head; -		ee->cpu_ring_tail = request->ring->tail; +void +intel_engine_coredump_add_vma(struct intel_engine_coredump *ee, +			      struct intel_engine_capture_vma *capture, +			      struct i915_vma_compress *compress) +{ +	const struct intel_engine_cs *engine = ee->engine; -		ee->rq_head = request->head; -		ee->rq_post = request->postfix; -		ee->rq_tail = request->tail; +	while (capture) { +		struct intel_engine_capture_vma *this = capture; +		struct i915_vma *vma = this->vma; -		engine_record_requests(engine, request, ee); -		spin_unlock_irqrestore(&engine->active.lock, flags); +		add_vma(ee, +			i915_vma_coredump_create(engine->gt, +						 vma, this->name, +						 compress)); -		error_record_engine_registers(error, engine, ee); -		error_record_engine_execlists(engine, ee); +		i915_active_release(&vma->active); +		i915_vma_put(vma); -		while (capture) { -			struct capture_vma *this = capture; -			struct i915_vma *vma = *this->slot; +		capture = this->next; +		kfree(this); +	} -			*this->slot = -				i915_error_object_create(i915, vma, compress); +	add_vma(ee, +		i915_vma_coredump_create(engine->gt, +					 engine->status_page.vma, +					 "HW Status", +					 compress)); -			i915_active_release(&vma->active); -			i915_vma_put(vma); +	add_vma(ee, +		i915_vma_coredump_create(engine->gt, +					 engine->wa_ctx.vma, +					 "WA context", +					 compress)); -			capture = this->next; -			kfree(this); -		} +	add_vma(ee, +		capture_object(engine->gt, +			       engine->default_state, +			       "NULL context", +			       compress)); +} + +static struct intel_engine_coredump * +capture_engine(struct intel_engine_cs *engine, +	       struct i915_vma_compress *compress) +{ +	struct intel_engine_capture_vma *capture; +	struct intel_engine_coredump *ee; +	struct i915_request *rq; +	unsigned long flags; -		ee->hws_page = -			i915_error_object_create(i915, -						 engine->status_page.vma, -						 compress); +	ee = intel_engine_coredump_alloc(engine, GFP_KERNEL); +	if (!ee) +		return NULL; -		ee->wa_ctx = -			i915_error_object_create(i915, -						 engine->wa_ctx.vma, -						 compress); +	spin_lock_irqsave(&engine->active.lock, flags); -		ee->default_state = -			capture_object(i915, engine->default_state, compress); +	rq = intel_engine_find_active_request(engine); +	if (!rq) { +		spin_unlock_irqrestore(&engine->active.lock, flags); +		kfree(ee); +		return NULL; +	} -		ee->engine = engine; +	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); +	engine_record_requests(engine, rq, ee); -		ee->next = error->engine; -		error->engine = ee; +	spin_unlock_irqrestore(&engine->active.lock, flags); -		ee = kzalloc(sizeof(*ee), GFP_KERNEL); -		if (!ee) -			return; -	} +	intel_engine_coredump_add_vma(ee, capture, compress); -	kfree(ee); +	return 
ee;  }  static void -capture_uc_state(struct i915_gpu_state *error, struct compress *compress) +gt_record_engines(struct intel_gt_coredump *gt, +		  struct i915_vma_compress *compress)  { -	struct drm_i915_private *i915 = error->i915; -	struct i915_error_uc *error_uc = &error->uc; -	struct intel_uc *uc = &i915->gt.uc; +	struct intel_engine_cs *engine; +	enum intel_engine_id id; -	/* Capturing uC state won't be useful if there is no GuC */ -	if (!error->device_info.has_gt_uc) -		return; +	for_each_engine(engine, gt->_gt, id) { +		struct intel_engine_coredump *ee; + +		/* Refill our page pool before entering atomic section */ +		pool_refill(&compress->pool, ALLOW_FAIL); + +		ee = capture_engine(engine, compress); +		if (!ee) +			continue; + +		gt->simulated |= ee->simulated; +		if (ee->simulated) { +			kfree(ee); +			continue; +		} + +		ee->next = gt->engine; +		gt->engine = ee; +	} +} + +static struct intel_uc_coredump * +gt_record_uc(struct intel_gt_coredump *gt, +	     struct i915_vma_compress *compress) +{ +	const struct intel_uc *uc = >->_gt->uc; +	struct intel_uc_coredump *error_uc; + +	error_uc = kzalloc(sizeof(*error_uc), ALLOW_FAIL); +	if (!error_uc) +		return NULL;  	memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));  	memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw)); @@ -1547,19 +1563,42 @@ capture_uc_state(struct i915_gpu_state *error, struct compress *compress)  	 */  	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);  	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL); -	error_uc->guc_log = i915_error_object_create(i915, -						     uc->guc.log.vma, -						     compress); +	error_uc->guc_log = +		i915_vma_coredump_create(gt->_gt, +					 uc->guc.log.vma, "GuC log buffer", +					 compress); + +	return error_uc; +} + +static void gt_capture_prepare(struct intel_gt_coredump *gt) +{ +	struct i915_ggtt *ggtt = gt->_gt->ggtt; + +	mutex_lock(&ggtt->error_mutex); +} + +static void gt_capture_finish(struct intel_gt_coredump *gt) +{ +	struct i915_ggtt *ggtt = gt->_gt->ggtt; + +	if (drm_mm_node_allocated(&ggtt->error_capture)) +		ggtt->vm.clear_range(&ggtt->vm, +				     ggtt->error_capture.start, +				     PAGE_SIZE); + +	mutex_unlock(&ggtt->error_mutex);  }  /* Capture all registers which don't fit into another category. */ -static void capture_reg_state(struct i915_gpu_state *error) +static void gt_record_regs(struct intel_gt_coredump *gt)  { -	struct drm_i915_private *i915 = error->i915; -	struct intel_uncore *uncore = &i915->uncore; +	struct intel_uncore *uncore = gt->_gt->uncore; +	struct drm_i915_private *i915 = uncore->i915;  	int i; -	/* General organization +	/* +	 * General organization  	 * 1. Registers specific to a single generation  	 * 2. Registers which belong to multiple generations  	 * 3. Feature specific registers. 
@@ -1569,138 +1608,162 @@ static void capture_reg_state(struct i915_gpu_state *error)  	/* 1: Registers specific to a single generation */  	if (IS_VALLEYVIEW(i915)) { -		error->gtier[0] = intel_uncore_read(uncore, GTIER); -		error->ier = intel_uncore_read(uncore, VLV_IER); -		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV); +		gt->gtier[0] = intel_uncore_read(uncore, GTIER); +		gt->ier = intel_uncore_read(uncore, VLV_IER); +		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);  	}  	if (IS_GEN(i915, 7)) -		error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT); +		gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);  	if (INTEL_GEN(i915) >= 12) { -		error->fault_data0 = intel_uncore_read(uncore, -						       GEN12_FAULT_TLB_DATA0); -		error->fault_data1 = intel_uncore_read(uncore, -						       GEN12_FAULT_TLB_DATA1); +		gt->fault_data0 = intel_uncore_read(uncore, +						    GEN12_FAULT_TLB_DATA0); +		gt->fault_data1 = intel_uncore_read(uncore, +						    GEN12_FAULT_TLB_DATA1);  	} else if (INTEL_GEN(i915) >= 8) { -		error->fault_data0 = intel_uncore_read(uncore, -						       GEN8_FAULT_TLB_DATA0); -		error->fault_data1 = intel_uncore_read(uncore, -						       GEN8_FAULT_TLB_DATA1); +		gt->fault_data0 = intel_uncore_read(uncore, +						    GEN8_FAULT_TLB_DATA0); +		gt->fault_data1 = intel_uncore_read(uncore, +						    GEN8_FAULT_TLB_DATA1);  	}  	if (IS_GEN(i915, 6)) { -		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE); -		error->gab_ctl = intel_uncore_read(uncore, GAB_CTL); -		error->gfx_mode = intel_uncore_read(uncore, GFX_MODE); +		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE); +		gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL); +		gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE);  	}  	/* 2: Registers which belong to multiple generations */  	if (INTEL_GEN(i915) >= 7) -		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT); +		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);  	if (INTEL_GEN(i915) >= 6) { -		error->derrmr = intel_uncore_read(uncore, DERRMR); +		gt->derrmr = intel_uncore_read(uncore, DERRMR);  		if (INTEL_GEN(i915) < 12) { -			error->error = intel_uncore_read(uncore, ERROR_GEN6); -			error->done_reg = intel_uncore_read(uncore, DONE_REG); +			gt->error = intel_uncore_read(uncore, ERROR_GEN6); +			gt->done_reg = intel_uncore_read(uncore, DONE_REG);  		}  	} -	if (INTEL_GEN(i915) >= 5) -		error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE)); -  	/* 3: Feature specific registers */  	if (IS_GEN_RANGE(i915, 6, 7)) { -		error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK); -		error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS); +		gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK); +		gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);  	}  	if (IS_GEN_RANGE(i915, 8, 11)) -		error->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN); +		gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);  	if (IS_GEN(i915, 12)) -		error->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG); +		gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);  	if (INTEL_GEN(i915) >= 12) {  		for (i = 0; i < GEN12_SFC_DONE_MAX; i++) { -			error->sfc_done[i] = +			gt->sfc_done[i] =  				intel_uncore_read(uncore, GEN12_SFC_DONE(i));  		} -		error->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE); +		gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);  	}  	/* 4: Everything else */  	if (INTEL_GEN(i915) >= 11) { -		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); -		
error->gtier[0] = +		gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); +		gt->gtier[0] =  			intel_uncore_read(uncore,  					  GEN11_RENDER_COPY_INTR_ENABLE); -		error->gtier[1] = +		gt->gtier[1] =  			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE); -		error->gtier[2] = +		gt->gtier[2] =  			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE); -		error->gtier[3] = +		gt->gtier[3] =  			intel_uncore_read(uncore,  					  GEN11_GPM_WGBOXPERF_INTR_ENABLE); -		error->gtier[4] = +		gt->gtier[4] =  			intel_uncore_read(uncore,  					  GEN11_CRYPTO_RSVD_INTR_ENABLE); -		error->gtier[5] = +		gt->gtier[5] =  			intel_uncore_read(uncore,  					  GEN11_GUNIT_CSME_INTR_ENABLE); -		error->ngtier = 6; +		gt->ngtier = 6;  	} else if (INTEL_GEN(i915) >= 8) { -		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); +		gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);  		for (i = 0; i < 4; i++) -			error->gtier[i] = intel_uncore_read(uncore, -							    GEN8_GT_IER(i)); -		error->ngtier = 4; +			gt->gtier[i] = +				intel_uncore_read(uncore, GEN8_GT_IER(i)); +		gt->ngtier = 4;  	} else if (HAS_PCH_SPLIT(i915)) { -		error->ier = intel_uncore_read(uncore, DEIER); -		error->gtier[0] = intel_uncore_read(uncore, GTIER); -		error->ngtier = 1; +		gt->ier = intel_uncore_read(uncore, DEIER); +		gt->gtier[0] = intel_uncore_read(uncore, GTIER); +		gt->ngtier = 1;  	} else if (IS_GEN(i915, 2)) { -		error->ier = intel_uncore_read16(uncore, GEN2_IER); +		gt->ier = intel_uncore_read16(uncore, GEN2_IER);  	} else if (!IS_VALLEYVIEW(i915)) { -		error->ier = intel_uncore_read(uncore, GEN2_IER); +		gt->ier = intel_uncore_read(uncore, GEN2_IER);  	} -	error->eir = intel_uncore_read(uncore, EIR); -	error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); +	gt->eir = intel_uncore_read(uncore, EIR); +	gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); +} + +/* + * Generate a semi-unique error code. The code is not meant to have meaning, The + * code's only purpose is to try to prevent false duplicated bug reports by + * grossly estimating a GPU error state. + * + * TODO Ideally, hashing the batchbuffer would be a very nice way to determine + * the hang if we could strip the GTT offset information from it. + * + * It's only a small step better than a random number in its current form. + */ +static u32 generate_ecode(const struct intel_engine_coredump *ee) +{ +	/* +	 * IPEHR would be an ideal way to detect errors, as it's the gross +	 * measure of "the command that hung." However, has some very common +	 * synchronization commands which almost always appear in the case +	 * strictly a client bug. Use instdone to differentiate those some. +	 */ +	return ee ? 
ee->ipehr ^ ee->instdone.instdone : 0;  } -static const char * -error_msg(struct i915_gpu_state *error, -	  intel_engine_mask_t engines, const char *msg) +static const char *error_msg(struct i915_gpu_coredump *error)  { +	struct intel_engine_coredump *first = NULL; +	struct intel_gt_coredump *gt; +	intel_engine_mask_t engines;  	int len; +	engines = 0; +	for (gt = error->gt; gt; gt = gt->next) { +		struct intel_engine_coredump *cs; + +		if (gt->engine && !first) +			first = gt->engine; + +		for (cs = gt->engine; cs; cs = cs->next) +			engines |= cs->engine->mask; +	} +  	len = scnprintf(error->error_msg, sizeof(error->error_msg), -			"GPU HANG: ecode %d:%x:0x%08x", +			"GPU HANG: ecode %d:%x:%08x",  			INTEL_GEN(error->i915), engines, -			i915_error_generate_code(error)); -	if (error->engine) { +			generate_ecode(first)); +	if (first) {  		/* Just show the first executing process, more is confusing */  		len += scnprintf(error->error_msg + len,  				 sizeof(error->error_msg) - len,  				 ", in %s [%d]", -				 error->engine->context.comm, -				 error->engine->context.pid); +				 first->context.comm, first->context.pid);  	} -	if (msg) -		len += scnprintf(error->error_msg + len, -				 sizeof(error->error_msg) - len, -				 ", %s", msg);  	return error->error_msg;  } -static void capture_gen_state(struct i915_gpu_state *error) +static void capture_gen(struct i915_gpu_coredump *error)  {  	struct drm_i915_private *i915 = error->i915; -	error->awake = i915->gt.awake;  	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);  	error->suspended = i915->runtime_pm.suspended; @@ -1711,6 +1774,7 @@ static void capture_gen_state(struct i915_gpu_state *error)  	error->reset_count = i915_reset_count(&i915->gpu_error);  	error->suspend_count = i915->suspend_count; +	i915_params_copy(&error->params, &i915_modparams);  	memcpy(&error->device_info,  	       INTEL_INFO(i915),  	       sizeof(error->device_info)); @@ -1720,115 +1784,138 @@ static void capture_gen_state(struct i915_gpu_state *error)  	error->driver_caps = i915->caps;  } -static void capture_params(struct i915_gpu_state *error) +struct i915_gpu_coredump * +i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)  { -	i915_params_copy(&error->params, &i915_modparams); +	struct i915_gpu_coredump *error; + +	if (!i915_modparams.error_capture) +		return NULL; + +	error = kzalloc(sizeof(*error), gfp); +	if (!error) +		return NULL; + +	kref_init(&error->ref); +	error->i915 = i915; + +	error->time = ktime_get_real(); +	error->boottime = ktime_get_boottime(); +	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time); +	error->capture = jiffies; + +	capture_gen(error); + +	return error;  } -static void capture_finish(struct i915_gpu_state *error) +#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) + +struct intel_gt_coredump * +intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)  { -	struct i915_ggtt *ggtt = &error->i915->ggtt; +	struct intel_gt_coredump *gc; -	if (drm_mm_node_allocated(&ggtt->error_capture)) { -		const u64 slot = ggtt->error_capture.start; +	gc = kzalloc(sizeof(*gc), gfp); +	if (!gc) +		return NULL; + +	gc->_gt = gt; +	gc->awake = intel_gt_pm_is_awake(gt); + +	gt_record_regs(gc); +	gt_record_fences(gc); + +	return gc; +} -		ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); +struct i915_vma_compress * +i915_vma_capture_prepare(struct intel_gt_coredump *gt) +{ +	struct i915_vma_compress *compress; + +	compress = kmalloc(sizeof(*compress), ALLOW_FAIL); +	if (!compress) +		return NULL; + +	if 
(!compress_init(compress)) { +		kfree(compress); +		return NULL;  	} + +	gt_capture_prepare(gt); + +	return compress;  } -#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) +void i915_vma_capture_finish(struct intel_gt_coredump *gt, +			     struct i915_vma_compress *compress) +{ +	if (!compress) +		return; -struct i915_gpu_state * -i915_capture_gpu_state(struct drm_i915_private *i915) +	gt_capture_finish(gt); + +	compress_fini(compress); +	kfree(compress); +} + +struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)  { -	struct i915_gpu_state *error; -	struct compress compress; +	struct i915_gpu_coredump *error;  	/* Check if GPU capture has been disabled */  	error = READ_ONCE(i915->gpu_error.first_error);  	if (IS_ERR(error))  		return error; -	error = kzalloc(sizeof(*error), ALLOW_FAIL); -	if (!error) { -		i915_disable_error_state(i915, -ENOMEM); +	error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL); +	if (!error)  		return ERR_PTR(-ENOMEM); -	} -	if (!compress_init(&compress)) { -		kfree(error); -		i915_disable_error_state(i915, -ENOMEM); -		return ERR_PTR(-ENOMEM); -	} +	error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL); +	if (error->gt) { +		struct i915_vma_compress *compress; -	kref_init(&error->ref); -	error->i915 = i915; +		compress = i915_vma_capture_prepare(error->gt); +		if (!compress) { +			kfree(error->gt); +			kfree(error); +			return ERR_PTR(-ENOMEM); +		} -	error->time = ktime_get_real(); -	error->boottime = ktime_get_boottime(); -	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time); -	error->capture = jiffies; +		gt_record_engines(error->gt, compress); + +		if (INTEL_INFO(i915)->has_gt_uc) +			error->gt->uc = gt_record_uc(error->gt, compress); -	capture_params(error); -	capture_gen_state(error); -	capture_uc_state(error, &compress); -	capture_reg_state(error); -	gem_record_fences(error); -	gem_record_rings(error, &compress); +		i915_vma_capture_finish(error->gt, compress); + +		error->simulated |= error->gt->simulated; +	}  	error->overlay = intel_overlay_capture_error_state(i915);  	error->display = intel_display_capture_error_state(i915); -	capture_finish(error); -	compress_fini(&compress); -  	return error;  } -/** - * i915_capture_error_state - capture an error record for later analysis - * @i915: i915 device - * @engine_mask: the mask of engines triggering the hang - * @msg: a message to insert into the error capture header - * - * Should be called when an error is detected (either a hang or an error - * interrupt) to capture error state from the time of the error.  Fills - * out a structure which becomes available in debugfs for user level tools - * to pick up. 
- */ -void i915_capture_error_state(struct drm_i915_private *i915, -			      intel_engine_mask_t engine_mask, -			      const char *msg) +void i915_error_state_store(struct i915_gpu_coredump *error)  { +	struct drm_i915_private *i915;  	static bool warned; -	struct i915_gpu_state *error; -	unsigned long flags; -	if (!i915_modparams.error_capture) +	if (IS_ERR_OR_NULL(error))  		return; -	if (READ_ONCE(i915->gpu_error.first_error)) -		return; +	i915 = error->i915; +	dev_info(i915->drm.dev, "%s\n", error_msg(error)); -	error = i915_capture_gpu_state(i915); -	if (IS_ERR(error)) +	if (error->simulated || +	    cmpxchg(&i915->gpu_error.first_error, NULL, error))  		return; -	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg)); - -	if (!error->simulated) { -		spin_lock_irqsave(&i915->gpu_error.lock, flags); -		if (!i915->gpu_error.first_error) { -			i915->gpu_error.first_error = error; -			error = NULL; -		} -		spin_unlock_irqrestore(&i915->gpu_error.lock, flags); -	} - -	if (error) { -		__i915_gpu_state_free(&error->ref); -		return; -	} +	i915_gpu_coredump_get(error);  	if (!xchg(&warned, true) &&  	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { @@ -1841,15 +1928,38 @@ void i915_capture_error_state(struct drm_i915_private *i915,  	}  } -struct i915_gpu_state * +/** + * i915_capture_error_state - capture an error record for later analysis + * @i915: i915 device + * + * Should be called when an error is detected (either a hang or an error + * interrupt) to capture error state from the time of the error.  Fills + * out a structure which becomes available in debugfs for user level tools + * to pick up. + */ +void i915_capture_error_state(struct drm_i915_private *i915) +{ +	struct i915_gpu_coredump *error; + +	error = i915_gpu_coredump(i915); +	if (IS_ERR(error)) { +		cmpxchg(&i915->gpu_error.first_error, NULL, error); +		return; +	} + +	i915_error_state_store(error); +	i915_gpu_coredump_put(error); +} + +struct i915_gpu_coredump *  i915_first_error_state(struct drm_i915_private *i915)  { -	struct i915_gpu_state *error; +	struct i915_gpu_coredump *error;  	spin_lock_irq(&i915->gpu_error.lock);  	error = i915->gpu_error.first_error;  	if (!IS_ERR_OR_NULL(error)) -		i915_gpu_state_get(error); +		i915_gpu_coredump_get(error);  	spin_unlock_irq(&i915->gpu_error.lock);  	return error; @@ -1857,7 +1967,7 @@ i915_first_error_state(struct drm_i915_private *i915)  void i915_reset_error_state(struct drm_i915_private *i915)  { -	struct i915_gpu_state *error; +	struct i915_gpu_coredump *error;  	spin_lock_irq(&i915->gpu_error.lock);  	error = i915->gpu_error.first_error; @@ -1866,7 +1976,7 @@ void i915_reset_error_state(struct drm_i915_private *i915)  	spin_unlock_irq(&i915->gpu_error.lock);  	if (!IS_ERR_OR_NULL(error)) -		i915_gpu_state_put(error); +		i915_gpu_coredump_put(error);  }  void i915_disable_error_state(struct drm_i915_private *i915, int err) | 
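The per-page deflate loop that compress_page() and compress_flush() implement above maps onto plain zlib. Below is a minimal userspace sketch of the same pattern, assuming an illustrative 64-page cap and 4 KiB pages (build with `cc pagezip.c -lz`; all names here are illustrative, not the kernel's): each input page is fed with Z_NO_FLUSH, output accumulates in page-sized chunks via a compress_next_page()-style helper, and the slack left in the final chunk is recorded the way dst->unused is.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

#define PAGE_SZ 4096

struct dump {
	unsigned char *pages[64];	/* illustrative fixed cap */
	int page_count;
	size_t unused;		/* slack in the last page, like dst->unused */
};

/* Counterpart of compress_next_page(): point zlib at a fresh output page. */
static unsigned char *next_page(struct dump *d, z_stream *zs)
{
	unsigned char *page;

	if (d->page_count == 64 || !(page = malloc(PAGE_SZ)))
		return NULL;
	d->pages[d->page_count++] = page;
	zs->next_out = page;
	zs->avail_out = PAGE_SZ;
	return page;
}

/* Counterpart of compress_page(): feed one input page with Z_NO_FLUSH. */
static int compress_one(z_stream *zs, struct dump *d, void *src)
{
	zs->next_in = src;
	zs->avail_in = PAGE_SZ;
	do {
		if (!zs->avail_out && !next_page(d, zs))
			return -1;
		if (deflate(zs, Z_NO_FLUSH) != Z_OK)
			return -1;
	} while (zs->avail_in);
	return 0;
}

int main(void)
{
	unsigned char src[PAGE_SZ];
	struct dump d = { 0 };
	z_stream zs = { 0 };
	int i, ret;

	memset(src, 'x', sizeof(src));	/* stand-in for one captured GPU page */
	if (deflateInit(&zs, Z_DEFAULT_COMPRESSION) != Z_OK ||
	    !next_page(&d, &zs))
		return 1;

	for (i = 0; i < 8; i++)
		if (compress_one(&zs, &d, src))
			return 1;

	/* Counterpart of compress_flush(): drain the stream with Z_FINISH. */
	while ((ret = deflate(&zs, Z_FINISH)) != Z_STREAM_END)
		if (ret != Z_OK || !next_page(&d, &zs))
			return 1;	/* Z_OK means more output space needed */
	d.unused = zs.avail_out;

	printf("8 pages in -> %d page(s) out, %zu byte(s) unused\n",
	       d.page_count, d.unused);

	deflateEnd(&zs);
	while (d.page_count)
		free(d.pages[--d.page_count]);
	return 0;
}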
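print_error_vma() above serializes each captured page as ascii85: every 32-bit word is printed as five characters offset from '!', most significant digit first, with 'z' standing in for an all-zero word (the encoder lives in include/linux/ascii85.h), after err_compression_marker() has tagged the object as zlib-compressed or raw. As a hedged sketch of what a userspace consumer of the exposed error state has to undo, the following decodes such a run back into bytes on a little-endian host; compressed dumps additionally need an inflate pass, and parsing of the surrounding headers is omitted.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Decode one group: 'z' is an all-zero word, otherwise five base-85
 * digits offset from '!', most significant digit first, mirroring
 * ascii85_encode(). Returns characters consumed, 0 on end/invalid. */
static int a85_word(const char *s, uint32_t *out)
{
	uint32_t v = 0;
	int i;

	if (*s == 'z') {
		*out = 0;
		return 1;
	}
	for (i = 0; i < 5; i++) {
		if (s[i] < '!' || s[i] > 'u')
			return 0;
		v = v * 85 + (uint32_t)(s[i] - '!');
	}
	*out = v;
	return 5;
}

static size_t a85_decode(const char *s, unsigned char *dst, size_t max)
{
	size_t n = 0;
	uint32_t w;
	int used;

	while (n + 4 <= max && (used = a85_word(s, &w)) > 0) {
		memcpy(dst + n, &w, sizeof(w));	/* words are in CPU order */
		n += 4;
		s += used;
	}
	return n;
}

int main(void)
{
	/* ascii85 for the little-endian bytes "i915", then one zero word */
	const char blob[] = "2).+*z";
	unsigned char raw[8];
	size_t n = a85_decode(blob, raw, sizeof(raw));

	printf("%zu bytes, first four: %.4s\n", n, (const char *)raw);
	return 0;
}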
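Note also how i915_error_state_store() above swaps the old gpu_error.lock critical section for a single cmpxchg(&i915->gpu_error.first_error, NULL, error): the first coredump wins the slot, later captures are dropped and released, and i915_capture_error_state() parks an ERR_PTR in the same slot when allocation fails so that capture stays disabled. A userspace analogue of that publish-once idiom, using C11 atomics in place of the kernel's cmpxchg() (all names illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct coredump { const char *msg; };

/* The one slot that keeps the first error, like gpu_error.first_error. */
static _Atomic(struct coredump *) first_error;

/* Returns true if we published; false if an earlier hang already won. */
static bool store_first(struct coredump *e)
{
	struct coredump *expected = NULL;

	return atomic_compare_exchange_strong(&first_error, &expected, e);
}

int main(void)
{
	struct coredump a = { "first hang" }, b = { "second hang" };

	printf("a stored: %d\n", store_first(&a));	/* 1: slot was empty */
	printf("b stored: %d\n", store_first(&b));	/* 0: a already won */
	printf("kept: %s\n", atomic_load(&first_error)->msg);
	return 0;
}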
