Diffstat (limited to 'drivers/gpu/drm/scheduler/sched_main.c')
-rw-r--r--	drivers/gpu/drm/scheduler/sched_main.c	219
1 file changed, 129 insertions(+), 90 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index dbb69063b3d5..19fc601c9eeb 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -60,8 +60,6 @@
 
 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
 
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);
-
 /**
  * drm_sched_rq_init - initialize a given run queue struct
  *
@@ -286,8 +284,6 @@ static void drm_sched_job_finish(struct work_struct *work)
 	cancel_delayed_work_sync(&sched->work_tdr);
 
 	spin_lock_irqsave(&sched->job_list_lock, flags);
-	/* remove job from ring_mirror_list */
-	list_del_init(&s_job->node);
 	/* queue TDR for next job */
 	drm_sched_start_timeout(sched);
 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
@@ -295,22 +291,11 @@ static void drm_sched_job_finish(struct work_struct *work)
 	sched->ops->free_job(s_job);
 }
 
-static void drm_sched_job_finish_cb(struct dma_fence *f,
-				    struct dma_fence_cb *cb)
-{
-	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
-						 finish_cb);
-	schedule_work(&job->finish_work);
-}
-
 static void drm_sched_job_begin(struct drm_sched_job *s_job)
 {
 	struct drm_gpu_scheduler *sched = s_job->sched;
 	unsigned long flags;
 
-	dma_fence_add_callback(&s_job->s_fence->finished, &s_job->finish_cb,
-			       drm_sched_job_finish_cb);
-
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	list_add_tail(&s_job->node, &sched->ring_mirror_list);
 	drm_sched_start_timeout(sched);
@@ -335,6 +320,51 @@ static void drm_sched_job_timedout(struct work_struct *work)
 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
 }
 
+ /**
+  * drm_sched_increase_karma - Update sched_entity guilty flag
+  *
+  * @bad: The job guilty of time out
+  *
+  * Increment on every hang caused by the 'bad' job. If this exceeds the hang
+  * limit of the scheduler then the respective sched entity is marked guilty and
+  * jobs from it will not be scheduled further
+  */
+void drm_sched_increase_karma(struct drm_sched_job *bad)
+{
+	int i;
+	struct drm_sched_entity *tmp;
+	struct drm_sched_entity *entity;
+	struct drm_gpu_scheduler *sched = bad->sched;
+
+	/* don't increase @bad's karma if it's from KERNEL RQ,
+	 * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
+	 * corrupt but keep in mind that kernel jobs always considered good.
+	 */
+	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
+		atomic_inc(&bad->karma);
+		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
+		     i++) {
+			struct drm_sched_rq *rq = &sched->sched_rq[i];
+
+			spin_lock(&rq->lock);
+			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+				if (bad->s_fence->scheduled.context ==
+				    entity->fence_context) {
+					if (atomic_read(&bad->karma) >
+					    bad->sched->hang_limit)
+						if (entity->guilty)
+							atomic_set(entity->guilty, 1);
+					break;
+				}
+			}
+			spin_unlock(&rq->lock);
+			if (&entity->list != &rq->entities)
+				break;
+		}
+	}
+}
+EXPORT_SYMBOL(drm_sched_increase_karma);
+
 /**
  * drm_sched_hw_job_reset - stop the scheduler if it contains the bad job
  *
@@ -342,50 +372,42 @@ static void drm_sched_job_timedout(struct work_struct *work)
  * @bad: bad scheduler job
  *
  */
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
+void drm_sched_stop(struct drm_gpu_scheduler *sched)
 {
 	struct drm_sched_job *s_job;
-	struct drm_sched_entity *entity, *tmp;
 	unsigned long flags;
-	int i;
+	struct dma_fence *last_fence =  NULL;
 
+	kthread_park(sched->thread);
+
+	/*
+	 * Verify all the signaled jobs in mirror list are removed from the ring
+	 * by waiting for the latest job to enter the list. This should insure that
+	 * also all the previous jobs that were in flight also already singaled
+	 * and removed from the list.
+	 */
 	spin_lock_irqsave(&sched->job_list_lock, flags);
 	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
 		if (s_job->s_fence->parent &&
 		    dma_fence_remove_callback(s_job->s_fence->parent,
-					      &s_job->s_fence->cb)) {
+					      &s_job->cb)) {
 			dma_fence_put(s_job->s_fence->parent);
 			s_job->s_fence->parent = NULL;
 			atomic_dec(&sched->hw_rq_count);
+		} else {
+			 last_fence = dma_fence_get(&s_job->s_fence->finished);
+			 break;
 		}
 	}
 	spin_unlock_irqrestore(&sched->job_list_lock, flags);
 
-	if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
-		atomic_inc(&bad->karma);
-		/* don't increase @bad's karma if it's from KERNEL RQ,
-		 * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs)
-		 * corrupt but keep in mind that kernel jobs always considered good.
-		 */
-		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; i++ ) {
-			struct drm_sched_rq *rq = &sched->sched_rq[i];
-
-			spin_lock(&rq->lock);
-			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
-				if (bad->s_fence->scheduled.context == entity->fence_context) {
-				    if (atomic_read(&bad->karma) > bad->sched->hang_limit)
-						if (entity->guilty)
-							atomic_set(entity->guilty, 1);
-					break;
-				}
-			}
-			spin_unlock(&rq->lock);
-			if (&entity->list != &rq->entities)
-				break;
-		}
+	if (last_fence) {
+		dma_fence_wait(last_fence, false);
+		dma_fence_put(last_fence);
 	}
 }
-EXPORT_SYMBOL(drm_sched_hw_job_reset);
+
+EXPORT_SYMBOL(drm_sched_stop);
 
 /**
  * drm_sched_job_recovery - recover jobs after a reset
@@ -393,18 +415,58 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset);
  * @sched: scheduler instance
  *
  */
-void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
+void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 {
 	struct drm_sched_job *s_job, *tmp;
-	bool found_guilty = false;
-	unsigned long flags;
 	int r;
 
-	spin_lock_irqsave(&sched->job_list_lock, flags);
+	if (!full_recovery)
+		goto unpark;
+
+	/*
+	 * Locking the list is not required here as the sched thread is parked
+	 * so no new jobs are being pushed in to HW and in drm_sched_stop we
+	 * flushed all the jobs who were still in mirror list but who already
+	 * signaled and removed them self from the list. Also concurrent
+	 * GPU recovers can't run in parallel.
+	 */
+	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
+		struct dma_fence *fence = s_job->s_fence->parent;
+
+		if (fence) {
+			r = dma_fence_add_callback(fence, &s_job->cb,
+						   drm_sched_process_job);
+			if (r == -ENOENT)
+				drm_sched_process_job(fence, &s_job->cb);
+			else if (r)
+				DRM_ERROR("fence add callback failed (%d)\n",
+					  r);
+		} else
+			drm_sched_process_job(NULL, &s_job->cb);
+	}
+
+	drm_sched_start_timeout(sched);
+
+unpark:
+	kthread_unpark(sched->thread);
+}
+EXPORT_SYMBOL(drm_sched_start);
+
+/**
+ * drm_sched_resubmit_jobs - helper to relunch job from mirror ring list
+ *
+ * @sched: scheduler instance
+ *
+ */
+void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
+{
+	struct drm_sched_job *s_job, *tmp;
+	uint64_t guilty_context;
+	bool found_guilty = false;
+
+	/*TODO DO we need spinlock here ? */
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct drm_sched_fence *s_fence = s_job->s_fence;
-		struct dma_fence *fence;
-		uint64_t guilty_context;
 
 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
 			found_guilty = true;
@@ -414,31 +476,11 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
 
-		spin_unlock_irqrestore(&sched->job_list_lock, flags);
-		fence = sched->ops->run_job(s_job);
+		s_job->s_fence->parent = sched->ops->run_job(s_job);
 		atomic_inc(&sched->hw_rq_count);
-
-		if (fence) {
-			s_fence->parent = dma_fence_get(fence);
-			r = dma_fence_add_callback(fence, &s_fence->cb,
-						   drm_sched_process_job);
-			if (r == -ENOENT)
-				drm_sched_process_job(fence, &s_fence->cb);
-			else if (r)
-				DRM_ERROR("fence add callback failed (%d)\n",
-					  r);
-			dma_fence_put(fence);
-		} else {
-			if (s_fence->finished.error < 0)
-				drm_sched_expel_job_unlocked(s_job);
-			drm_sched_process_job(NULL, &s_fence->cb);
-		}
-		spin_lock_irqsave(&sched->job_list_lock, flags);
 	}
-	drm_sched_start_timeout(sched);
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
 }
-EXPORT_SYMBOL(drm_sched_job_recovery);
+EXPORT_SYMBOL(drm_sched_resubmit_jobs);
 
 /**
  * drm_sched_job_init - init a scheduler job
@@ -552,18 +594,27 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
  */
 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
 {
-	struct drm_sched_fence *s_fence =
-		container_of(cb, struct drm_sched_fence, cb);
+	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
+	struct drm_sched_fence *s_fence = s_job->s_fence;
 	struct drm_gpu_scheduler *sched = s_fence->sched;
+	unsigned long flags;
+
+	cancel_delayed_work(&sched->work_tdr);
 
-	dma_fence_get(&s_fence->finished);
 	atomic_dec(&sched->hw_rq_count);
 	atomic_dec(&sched->num_jobs);
+
+	spin_lock_irqsave(&sched->job_list_lock, flags);
+	/* remove job from ring_mirror_list */
+	list_del_init(&s_job->node);
+	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
 	drm_sched_fence_finished(s_fence);
 
 	trace_drm_sched_process_job(s_fence);
-	dma_fence_put(&s_fence->finished);
 	wake_up_interruptible(&sched->wake_up_worker);
+
+	schedule_work(&s_job->finish_work);
 }
 
 /**
@@ -626,34 +677,22 @@ static int drm_sched_main(void *param)
 
 		if (fence) {
 			s_fence->parent = dma_fence_get(fence);
-			r = dma_fence_add_callback(fence, &s_fence->cb,
+			r = dma_fence_add_callback(fence, &sched_job->cb,
 						   drm_sched_process_job);
 			if (r == -ENOENT)
-				drm_sched_process_job(fence, &s_fence->cb);
+				drm_sched_process_job(fence, &sched_job->cb);
 			else if (r)
 				DRM_ERROR("fence add callback failed (%d)\n",
 					  r);
 			dma_fence_put(fence);
-		} else {
-			if (s_fence->finished.error < 0)
-				drm_sched_expel_job_unlocked(sched_job);
-			drm_sched_process_job(NULL, &s_fence->cb);
-		}
+		} else
+			drm_sched_process_job(NULL, &sched_job->cb);
 
 		wake_up(&sched->job_scheduled);
 	}
 	return 0;
 }
 
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job)
-{
-	struct drm_gpu_scheduler *sched = s_job->sched;
-
-	spin_lock(&sched->job_list_lock);
-	list_del_init(&s_job->node);
-	spin_unlock(&sched->job_list_lock);
-}
-
 /**
  * drm_sched_init - Init a gpu scheduler instance
  *
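For context, a minimal sketch of how a driver's timeout (TDR) path might chain the entry points this patch exports, replacing the removed drm_sched_hw_job_reset()/drm_sched_job_recovery() pair. This is an illustration only: my_job_timedout() and the my_gpu_reset() placeholder are hypothetical, a real driver usually loops over all of the device's schedulers, and only the drm_sched_* calls come from the diff above.

/*
 * Hypothetical driver-side sketch (not part of this patch): intended
 * calling order of the refactored scheduler entry points during a GPU
 * reset. my_* names are placeholders.
 */
#include <drm/gpu_scheduler.h>

static void my_job_timedout(struct drm_sched_job *bad_job)
{
	struct drm_gpu_scheduler *sched = bad_job->sched;

	/* Park the scheduler thread and detach HW fence callbacks. */
	drm_sched_stop(sched);

	/* Account the hang against the offending entity (kernel RQ is exempt). */
	drm_sched_increase_karma(bad_job);

	/* Driver-specific hardware reset would go here, e.g. my_gpu_reset(). */

	/* Re-run jobs still on the ring mirror list; guilty ones get -ECANCELED. */
	drm_sched_resubmit_jobs(sched);

	/* Re-install completion callbacks, rearm the TDR timer and unpark. */
	drm_sched_start(sched, true);
}

Splitting stop/resubmit/start this way lets the driver slot its hardware reset between parking and restarting the scheduler, which the old single-shot recovery helpers did not allow.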
