diff options
author | Philipp Stanner <phasta@kernel.org> | 2025-07-10 14:54:06 +0200 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2025-08-20 18:41:10 +0200 |
commit | c82fa2c15546120517f1034e2550fe46ca7c06b4 (patch) | |
tree | a4be7b587f661a5a76c2b48b3ac6261aafb19fe0 | |
parent | 5480999bbbecf74345e98132070be02e81d5689f (diff) |
drm/sched: Avoid memory leaks with cancel_job() callback
[ Upstream commit bf8bbaefaa6ae0a07971ea57b3208df60e8ad0a4 ]
Since its inception, the GPU scheduler can leak memory if the driver
calls drm_sched_fini() while there are still jobs in flight.
The simplest way to solve this in a backwards compatible manner is by
adding a new callback, drm_sched_backend_ops.cancel_job(), which
instructs the driver to signal the hardware fence associated with the
job. Afterwards, the scheduler can safely use the established free_job()
callback for freeing the job.
Implement the new backend_ops callback cancel_job().
Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Link: https://lore.kernel.org/dri-devel/20250418113211.69956-1-tvrtko.ursulin@igalia.com/
Reviewed-by: MaĆra Canal <mcanal@igalia.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Signed-off-by: Philipp Stanner <phasta@kernel.org>
Link: https://lore.kernel.org/r/20250710125412.128476-4-phasta@kernel.org
Signed-off-by: Sasha Levin <sashal@kernel.org>
-rw-r--r-- | drivers/gpu/drm/scheduler/sched_main.c | 34 | ||||
-rw-r--r-- | include/drm/gpu_scheduler.h | 18 |
2 files changed, 39 insertions, 13 deletions
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 829579c41c6b..3ac5e6acce04 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -1348,6 +1348,18 @@ Out_check_own: } EXPORT_SYMBOL(drm_sched_init); +static void drm_sched_cancel_remaining_jobs(struct drm_gpu_scheduler *sched) +{ + struct drm_sched_job *job, *tmp; + + /* All other accessors are stopped. No locking necessary. */ + list_for_each_entry_safe_reverse(job, tmp, &sched->pending_list, list) { + sched->ops->cancel_job(job); + list_del(&job->list); + sched->ops->free_job(job); + } +} + /** * drm_sched_fini - Destroy a gpu scheduler * @@ -1355,19 +1367,11 @@ EXPORT_SYMBOL(drm_sched_init); * * Tears down and cleans up the scheduler. * - * This stops submission of new jobs to the hardware through - * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job() - * will not be called for all jobs still in drm_gpu_scheduler.pending_list. - * There is no solution for this currently. Thus, it is up to the driver to make - * sure that: - * - * a) drm_sched_fini() is only called after for all submitted jobs - * drm_sched_backend_ops.free_job() has been called or that - * b) the jobs for which drm_sched_backend_ops.free_job() has not been called - * after drm_sched_fini() ran are freed manually. - * - * FIXME: Take care of the above problem and prevent this function from leaking - * the jobs in drm_gpu_scheduler.pending_list under any circumstances. + * This stops submission of new jobs to the hardware through &struct + * drm_sched_backend_ops.run_job. If &struct drm_sched_backend_ops.cancel_job + * is implemented, all jobs will be canceled through it and afterwards cleaned + * up through &struct drm_sched_backend_ops.free_job. If cancel_job is not + * implemented, memory could leak. */ void drm_sched_fini(struct drm_gpu_scheduler *sched) { @@ -1397,6 +1401,10 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched) /* Confirm no work left behind accessing device structures */ cancel_delayed_work_sync(&sched->work_tdr); + /* Avoid memory leaks if supported by the driver. */ + if (sched->ops->cancel_job) + drm_sched_cancel_remaining_jobs(sched); + if (sched->own_submit_wq) destroy_workqueue(sched->submit_wq); sched->ready = false; diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 1a7e377d4cbb..1eed74c7f2f7 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -508,6 +508,24 @@ struct drm_sched_backend_ops { * and it's time to clean it up. */ void (*free_job)(struct drm_sched_job *sched_job); + + /** + * @cancel_job: Used by the scheduler to guarantee remaining jobs' fences + * get signaled in drm_sched_fini(). + * + * Used by the scheduler to cancel all jobs that have not been executed + * with &struct drm_sched_backend_ops.run_job by the time + * drm_sched_fini() gets invoked. + * + * Drivers need to signal the passed job's hardware fence with an + * appropriate error code (e.g., -ECANCELED) in this callback. They + * must not free the job. + * + * The scheduler will only call this callback once it stopped calling + * all other callbacks forever, with the exception of &struct + * drm_sched_backend_ops.free_job. + */ + void (*cancel_job)(struct drm_sched_job *sched_job); }; /** |