diff options
Diffstat (limited to 'drivers/gpu/drm/scheduler/sched_main.c')
| -rw-r--r-- | drivers/gpu/drm/scheduler/sched_main.c | 147 |
1 files changed, 110 insertions, 37 deletions
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index d82a7ebf6099..a2a953693b45 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work) { struct drm_gpu_scheduler *sched; struct drm_sched_job *job; + enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL; sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work); @@ -331,7 +332,7 @@ static void drm_sched_job_timedout(struct work_struct *work) list_del_init(&job->list); spin_unlock(&sched->job_list_lock); - job->sched->ops->timedout_job(job); + status = job->sched->ops->timedout_job(job); /* * Guilty job did complete and hence needs to be manually removed @@ -345,9 +346,11 @@ static void drm_sched_job_timedout(struct work_struct *work) spin_unlock(&sched->job_list_lock); } - spin_lock(&sched->job_list_lock); - drm_sched_start_timeout(sched); - spin_unlock(&sched->job_list_lock); + if (status != DRM_GPU_SCHED_STAT_ENODEV) { + spin_lock(&sched->job_list_lock); + drm_sched_start_timeout(sched); + spin_unlock(&sched->job_list_lock); + } } /** @@ -361,40 +364,16 @@ static void drm_sched_job_timedout(struct work_struct *work) */ void drm_sched_increase_karma(struct drm_sched_job *bad) { - int i; - struct drm_sched_entity *tmp; - struct drm_sched_entity *entity; - struct drm_gpu_scheduler *sched = bad->sched; - - /* don't increase @bad's karma if it's from KERNEL RQ, - * because sometimes GPU hang would cause kernel jobs (like VM updating jobs) - * corrupt but keep in mind that kernel jobs always considered good. - */ - if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { - atomic_inc(&bad->karma); - for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; - i++) { - struct drm_sched_rq *rq = &sched->sched_rq[i]; - - spin_lock(&rq->lock); - list_for_each_entry_safe(entity, tmp, &rq->entities, list) { - if (bad->s_fence->scheduled.context == - entity->fence_context) { - if (atomic_read(&bad->karma) > - bad->sched->hang_limit) - if (entity->guilty) - atomic_set(entity->guilty, 1); - break; - } - } - spin_unlock(&rq->lock); - if (&entity->list != &rq->entities) - break; - } - } + drm_sched_increase_karma_ext(bad, 1); } EXPORT_SYMBOL(drm_sched_increase_karma); +void drm_sched_reset_karma(struct drm_sched_job *bad) +{ + drm_sched_increase_karma_ext(bad, 0); +} +EXPORT_SYMBOL(drm_sched_reset_karma); + /** * drm_sched_stop - stop the scheduler * @@ -534,14 +513,31 @@ EXPORT_SYMBOL(drm_sched_start); */ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) { + drm_sched_resubmit_jobs_ext(sched, INT_MAX); +} +EXPORT_SYMBOL(drm_sched_resubmit_jobs); + +/** + * drm_sched_resubmit_jobs_ext - helper to relunch certain number of jobs from mirror ring list + * + * @sched: scheduler instance + * @max: job numbers to relaunch + * + */ +void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max) +{ struct drm_sched_job *s_job, *tmp; uint64_t guilty_context; bool found_guilty = false; struct dma_fence *fence; + int i = 0; list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) { struct drm_sched_fence *s_fence = s_job->s_fence; + if (i >= max) + break; + if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) { found_guilty = true; guilty_context = s_job->s_fence->scheduled.context; @@ -552,6 +548,7 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) dma_fence_put(s_job->s_fence->parent); fence = sched->ops->run_job(s_job); + i++; if (IS_ERR_OR_NULL(fence)) { if (IS_ERR(fence)) @@ -563,7 +560,7 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched) } } } -EXPORT_SYMBOL(drm_sched_resubmit_jobs); +EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext); /** * drm_sched_job_init - init a scheduler job @@ -677,7 +674,7 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched) static struct drm_sched_job * drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) { - struct drm_sched_job *job; + struct drm_sched_job *job, *next; /* * Don't destroy jobs while the timeout worker is running OR thread @@ -696,6 +693,13 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched) if (job && dma_fence_is_signaled(&job->s_fence->finished)) { /* remove job from pending_list */ list_del_init(&job->list); + /* make the scheduled timestamp more accurate */ + next = list_first_entry_or_null(&sched->pending_list, + typeof(*next), list); + if (next) + next->s_fence->scheduled.timestamp = + job->s_fence->finished.timestamp; + } else { job = NULL; /* queue timeout for next job */ @@ -894,12 +898,81 @@ EXPORT_SYMBOL(drm_sched_init); */ void drm_sched_fini(struct drm_gpu_scheduler *sched) { + struct drm_sched_entity *s_entity; + int i; + if (sched->thread) kthread_stop(sched->thread); + for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { + struct drm_sched_rq *rq = &sched->sched_rq[i]; + + if (!rq) + continue; + + spin_lock(&rq->lock); + list_for_each_entry(s_entity, &rq->entities, list) + /* + * Prevents reinsertion and marks job_queue as idle, + * it will removed from rq in drm_sched_entity_fini + * eventually + */ + s_entity->stopped = true; + spin_unlock(&rq->lock); + + } + + /* Wakeup everyone stuck in drm_sched_entity_flush for this scheduler */ + wake_up_all(&sched->job_scheduled); + /* Confirm no work left behind accessing device structures */ cancel_delayed_work_sync(&sched->work_tdr); sched->ready = false; } EXPORT_SYMBOL(drm_sched_fini); + +/** + * drm_sched_increase_karma_ext - Update sched_entity guilty flag + * + * @bad: The job guilty of time out + * @type: type for increase/reset karma + * + */ +void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type) +{ + int i; + struct drm_sched_entity *tmp; + struct drm_sched_entity *entity; + struct drm_gpu_scheduler *sched = bad->sched; + + /* don't change @bad's karma if it's from KERNEL RQ, + * because sometimes GPU hang would cause kernel jobs (like VM updating jobs) + * corrupt but keep in mind that kernel jobs always considered good. + */ + if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) { + if (type == 0) + atomic_set(&bad->karma, 0); + else if (type == 1) + atomic_inc(&bad->karma); + + for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; + i++) { + struct drm_sched_rq *rq = &sched->sched_rq[i]; + + spin_lock(&rq->lock); + list_for_each_entry_safe(entity, tmp, &rq->entities, list) { + if (bad->s_fence->scheduled.context == + entity->fence_context) { + if (entity->guilty) + atomic_set(entity->guilty, type); + break; + } + } + spin_unlock(&rq->lock); + if (&entity->list != &rq->entities) + break; + } + } +} +EXPORT_SYMBOL(drm_sched_increase_karma_ext); |
