From b16c765122f987056e1dc9ef6c214571bb5bd694 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 1 Oct 2018 15:47:53 +0100 Subject: drm/i915: Priority boost for new clients Taken from an idea used for FQ_CODEL, we give the first request of a new request flows a small priority boost. These flows are likely to correspond with short, interactive tasks and so be more latency sensitive than the longer free running queues. As soon as the client has more than one request in the queue, further requests are not boosted and it settles down into ordinary steady state behaviour. Such small kicks dramatically help combat the starvation issue, by allowing each client the opportunity to run even when the system is under heavy throughput load (within the constraints of the user selected priority). v2: Mark the preempted request as the start of a new flow, to prevent a single client being continually gazumped by its peers. Testcase: igt/benchmarks/rrul Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20181001144755.7978-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_request.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_request.c') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index a492385b20892..56140ca054e84 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1127,8 +1127,20 @@ void i915_request_add(struct i915_request *request) */ local_bh_disable(); rcu_read_lock(); /* RCU serialisation for set-wedged protection */ - if (engine->schedule) - engine->schedule(request, &request->gem_context->sched); + if (engine->schedule) { + struct i915_sched_attr attr = request->gem_context->sched; + + /* + * Boost priorities to new clients (new request flows). + * + * Allow interactive/synchronous clients to jump ahead of + * the bulk clients. (FQ_CODEL) + */ + if (!prev || i915_request_completed(prev)) + attr.priority |= I915_PRIORITY_NEWCLIENT; + + engine->schedule(request, &attr); + } rcu_read_unlock(); i915_sw_fence_commit(&request->submit); local_bh_enable(); /* Kick the execlists tasklet if just scheduled */ -- cgit v1.2.3 From e2f3496e93be3238de2e2e6bfc83b3a83c084ce5 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 1 Oct 2018 15:47:54 +0100 Subject: drm/i915: Pull scheduling under standalone lock Currently, the backend scheduling code abuses struct_mutex into order to have a global lock to manipulate a temporary list (without widespread allocation) and to protect against list modifications. This is an extraneous coupling to struct_mutex and further can not extend beyond the local device. Pull all the code that needs to be under the one true lock into i915_scheduler.c, and make it so. Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20181001144755.7978-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/Makefile | 1 + drivers/gpu/drm/i915/i915_request.c | 85 ------- drivers/gpu/drm/i915/i915_request.h | 8 - drivers/gpu/drm/i915/i915_scheduler.c | 377 ++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_scheduler.h | 25 +++ drivers/gpu/drm/i915/intel_display.c | 3 +- drivers/gpu/drm/i915/intel_lrc.c | 268 +---------------------- drivers/gpu/drm/i915/intel_ringbuffer.h | 5 +- 8 files changed, 411 insertions(+), 361 deletions(-) create mode 100644 drivers/gpu/drm/i915/i915_scheduler.c (limited to 'drivers/gpu/drm/i915/i915_request.c') diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index 5794f102f9b8f..ef1480c14e4e1 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -75,6 +75,7 @@ i915-y += i915_cmd_parser.o \ i915_gemfs.o \ i915_query.o \ i915_request.o \ + i915_scheduler.o \ i915_timeline.o \ i915_trace_points.o \ i915_vma.o \ diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 56140ca054e84..d73ad490a2612 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -111,91 +111,6 @@ i915_request_remove_from_client(struct i915_request *request) spin_unlock(&file_priv->mm.lock); } -static struct i915_dependency * -i915_dependency_alloc(struct drm_i915_private *i915) -{ - return kmem_cache_alloc(i915->dependencies, GFP_KERNEL); -} - -static void -i915_dependency_free(struct drm_i915_private *i915, - struct i915_dependency *dep) -{ - kmem_cache_free(i915->dependencies, dep); -} - -static void -__i915_sched_node_add_dependency(struct i915_sched_node *node, - struct i915_sched_node *signal, - struct i915_dependency *dep, - unsigned long flags) -{ - INIT_LIST_HEAD(&dep->dfs_link); - list_add(&dep->wait_link, &signal->waiters_list); - list_add(&dep->signal_link, &node->signalers_list); - dep->signaler = signal; - dep->flags = flags; -} - -static int -i915_sched_node_add_dependency(struct drm_i915_private *i915, - struct i915_sched_node *node, - struct i915_sched_node *signal) -{ - struct i915_dependency *dep; - - dep = i915_dependency_alloc(i915); - if (!dep) - return -ENOMEM; - - __i915_sched_node_add_dependency(node, signal, dep, - I915_DEPENDENCY_ALLOC); - return 0; -} - -static void -i915_sched_node_fini(struct drm_i915_private *i915, - struct i915_sched_node *node) -{ - struct i915_dependency *dep, *tmp; - - GEM_BUG_ON(!list_empty(&node->link)); - - /* - * Everyone we depended upon (the fences we wait to be signaled) - * should retire before us and remove themselves from our list. - * However, retirement is run independently on each timeline and - * so we may be called out-of-order. - */ - list_for_each_entry_safe(dep, tmp, &node->signalers_list, signal_link) { - GEM_BUG_ON(!i915_sched_node_signaled(dep->signaler)); - GEM_BUG_ON(!list_empty(&dep->dfs_link)); - - list_del(&dep->wait_link); - if (dep->flags & I915_DEPENDENCY_ALLOC) - i915_dependency_free(i915, dep); - } - - /* Remove ourselves from everyone who depends upon us */ - list_for_each_entry_safe(dep, tmp, &node->waiters_list, wait_link) { - GEM_BUG_ON(dep->signaler != node); - GEM_BUG_ON(!list_empty(&dep->dfs_link)); - - list_del(&dep->signal_link); - if (dep->flags & I915_DEPENDENCY_ALLOC) - i915_dependency_free(i915, dep); - } -} - -static void -i915_sched_node_init(struct i915_sched_node *node) -{ - INIT_LIST_HEAD(&node->signalers_list); - INIT_LIST_HEAD(&node->waiters_list); - INIT_LIST_HEAD(&node->link); - node->attr.priority = I915_PRIORITY_INVALID; -} - static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno) { struct intel_engine_cs *engine; diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 7fa94b0249683..5f7361e0fca6c 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -332,14 +332,6 @@ static inline bool i915_request_completed(const struct i915_request *rq) return __i915_request_completed(rq, seqno); } -static inline bool i915_sched_node_signaled(const struct i915_sched_node *node) -{ - const struct i915_request *rq = - container_of(node, const struct i915_request, sched); - - return i915_request_completed(rq); -} - void i915_retire_requests(struct drm_i915_private *i915); /* diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c new file mode 100644 index 0000000000000..de9a2ba7c3bca --- /dev/null +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -0,0 +1,377 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2018 Intel Corporation + */ + +#include + +#include "i915_drv.h" +#include "i915_request.h" +#include "i915_scheduler.h" + +static DEFINE_SPINLOCK(schedule_lock); + +static const struct i915_request * +node_to_request(const struct i915_sched_node *node) +{ + return container_of(node, const struct i915_request, sched); +} + +static inline bool node_signaled(const struct i915_sched_node *node) +{ + return i915_request_completed(node_to_request(node)); +} + +void i915_sched_node_init(struct i915_sched_node *node) +{ + INIT_LIST_HEAD(&node->signalers_list); + INIT_LIST_HEAD(&node->waiters_list); + INIT_LIST_HEAD(&node->link); + node->attr.priority = I915_PRIORITY_INVALID; +} + +static struct i915_dependency * +i915_dependency_alloc(struct drm_i915_private *i915) +{ + return kmem_cache_alloc(i915->dependencies, GFP_KERNEL); +} + +static void +i915_dependency_free(struct drm_i915_private *i915, + struct i915_dependency *dep) +{ + kmem_cache_free(i915->dependencies, dep); +} + +bool __i915_sched_node_add_dependency(struct i915_sched_node *node, + struct i915_sched_node *signal, + struct i915_dependency *dep, + unsigned long flags) +{ + bool ret = false; + + spin_lock(&schedule_lock); + + if (!node_signaled(signal)) { + INIT_LIST_HEAD(&dep->dfs_link); + list_add(&dep->wait_link, &signal->waiters_list); + list_add(&dep->signal_link, &node->signalers_list); + dep->signaler = signal; + dep->flags = flags; + + ret = true; + } + + spin_unlock(&schedule_lock); + + return ret; +} + +int i915_sched_node_add_dependency(struct drm_i915_private *i915, + struct i915_sched_node *node, + struct i915_sched_node *signal) +{ + struct i915_dependency *dep; + + dep = i915_dependency_alloc(i915); + if (!dep) + return -ENOMEM; + + if (!__i915_sched_node_add_dependency(node, signal, dep, + I915_DEPENDENCY_ALLOC)) + i915_dependency_free(i915, dep); + + return 0; +} + +void i915_sched_node_fini(struct drm_i915_private *i915, + struct i915_sched_node *node) +{ + struct i915_dependency *dep, *tmp; + + GEM_BUG_ON(!list_empty(&node->link)); + + spin_lock(&schedule_lock); + + /* + * Everyone we depended upon (the fences we wait to be signaled) + * should retire before us and remove themselves from our list. + * However, retirement is run independently on each timeline and + * so we may be called out-of-order. + */ + list_for_each_entry_safe(dep, tmp, &node->signalers_list, signal_link) { + GEM_BUG_ON(!node_signaled(dep->signaler)); + GEM_BUG_ON(!list_empty(&dep->dfs_link)); + + list_del(&dep->wait_link); + if (dep->flags & I915_DEPENDENCY_ALLOC) + i915_dependency_free(i915, dep); + } + + /* Remove ourselves from everyone who depends upon us */ + list_for_each_entry_safe(dep, tmp, &node->waiters_list, wait_link) { + GEM_BUG_ON(dep->signaler != node); + GEM_BUG_ON(!list_empty(&dep->dfs_link)); + + list_del(&dep->signal_link); + if (dep->flags & I915_DEPENDENCY_ALLOC) + i915_dependency_free(i915, dep); + } + + spin_unlock(&schedule_lock); +} + +static inline struct i915_priolist *to_priolist(struct rb_node *rb) +{ + return rb_entry(rb, struct i915_priolist, node); +} + +static void assert_priolists(struct intel_engine_execlists * const execlists, + long queue_priority) +{ + struct rb_node *rb; + long last_prio, i; + + if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + return; + + GEM_BUG_ON(rb_first_cached(&execlists->queue) != + rb_first(&execlists->queue.rb_root)); + + last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1; + for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { + const struct i915_priolist *p = to_priolist(rb); + + GEM_BUG_ON(p->priority >= last_prio); + last_prio = p->priority; + + GEM_BUG_ON(!p->used); + for (i = 0; i < ARRAY_SIZE(p->requests); i++) { + if (list_empty(&p->requests[i])) + continue; + + GEM_BUG_ON(!(p->used & BIT(i))); + } + } +} + +struct list_head * +i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_priolist *p; + struct rb_node **parent, *rb; + bool first = true; + int idx, i; + + lockdep_assert_held(&engine->timeline.lock); + assert_priolists(execlists, INT_MAX); + + /* buckets sorted from highest [in slot 0] to lowest priority */ + idx = I915_PRIORITY_COUNT - (prio & I915_PRIORITY_MASK) - 1; + prio >>= I915_USER_PRIORITY_SHIFT; + if (unlikely(execlists->no_priolist)) + prio = I915_PRIORITY_NORMAL; + +find_priolist: + /* most positive priority is scheduled first, equal priorities fifo */ + rb = NULL; + parent = &execlists->queue.rb_root.rb_node; + while (*parent) { + rb = *parent; + p = to_priolist(rb); + if (prio > p->priority) { + parent = &rb->rb_left; + } else if (prio < p->priority) { + parent = &rb->rb_right; + first = false; + } else { + goto out; + } + } + + if (prio == I915_PRIORITY_NORMAL) { + p = &execlists->default_priolist; + } else { + p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC); + /* Convert an allocation failure to a priority bump */ + if (unlikely(!p)) { + prio = I915_PRIORITY_NORMAL; /* recurses just once */ + + /* To maintain ordering with all rendering, after an + * allocation failure we have to disable all scheduling. + * Requests will then be executed in fifo, and schedule + * will ensure that dependencies are emitted in fifo. + * There will be still some reordering with existing + * requests, so if userspace lied about their + * dependencies that reordering may be visible. + */ + execlists->no_priolist = true; + goto find_priolist; + } + } + + p->priority = prio; + for (i = 0; i < ARRAY_SIZE(p->requests); i++) + INIT_LIST_HEAD(&p->requests[i]); + rb_link_node(&p->node, rb, parent); + rb_insert_color_cached(&p->node, &execlists->queue, first); + p->used = 0; + +out: + p->used |= BIT(idx); + return &p->requests[idx]; +} + +static struct intel_engine_cs * +sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked) +{ + struct intel_engine_cs *engine = node_to_request(node)->engine; + + GEM_BUG_ON(!locked); + + if (engine != locked) { + spin_unlock(&locked->timeline.lock); + spin_lock(&engine->timeline.lock); + } + + return engine; +} + +void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr) +{ + struct list_head *uninitialized_var(pl); + struct intel_engine_cs *engine, *last; + struct i915_dependency *dep, *p; + struct i915_dependency stack; + const int prio = attr->priority; + LIST_HEAD(dfs); + + GEM_BUG_ON(prio == I915_PRIORITY_INVALID); + + if (i915_request_completed(rq)) + return; + + if (prio <= READ_ONCE(rq->sched.attr.priority)) + return; + + /* Needed in order to use the temporary link inside i915_dependency */ + spin_lock(&schedule_lock); + + stack.signaler = &rq->sched; + list_add(&stack.dfs_link, &dfs); + + /* + * Recursively bump all dependent priorities to match the new request. + * + * A naive approach would be to use recursion: + * static void update_priorities(struct i915_sched_node *node, prio) { + * list_for_each_entry(dep, &node->signalers_list, signal_link) + * update_priorities(dep->signal, prio) + * queue_request(node); + * } + * but that may have unlimited recursion depth and so runs a very + * real risk of overunning the kernel stack. Instead, we build + * a flat list of all dependencies starting with the current request. + * As we walk the list of dependencies, we add all of its dependencies + * to the end of the list (this may include an already visited + * request) and continue to walk onwards onto the new dependencies. The + * end result is a topological list of requests in reverse order, the + * last element in the list is the request we must execute first. + */ + list_for_each_entry(dep, &dfs, dfs_link) { + struct i915_sched_node *node = dep->signaler; + + /* + * Within an engine, there can be no cycle, but we may + * refer to the same dependency chain multiple times + * (redundant dependencies are not eliminated) and across + * engines. + */ + list_for_each_entry(p, &node->signalers_list, signal_link) { + GEM_BUG_ON(p == dep); /* no cycles! */ + + if (node_signaled(p->signaler)) + continue; + + GEM_BUG_ON(p->signaler->attr.priority < node->attr.priority); + if (prio > READ_ONCE(p->signaler->attr.priority)) + list_move_tail(&p->dfs_link, &dfs); + } + } + + /* + * If we didn't need to bump any existing priorities, and we haven't + * yet submitted this request (i.e. there is no potential race with + * execlists_submit_request()), we can set our own priority and skip + * acquiring the engine locks. + */ + if (rq->sched.attr.priority == I915_PRIORITY_INVALID) { + GEM_BUG_ON(!list_empty(&rq->sched.link)); + rq->sched.attr = *attr; + + if (stack.dfs_link.next == stack.dfs_link.prev) + goto out_unlock; + + __list_del_entry(&stack.dfs_link); + } + + last = NULL; + engine = rq->engine; + spin_lock_irq(&engine->timeline.lock); + + /* Fifo and depth-first replacement ensure our deps execute before us */ + list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) { + struct i915_sched_node *node = dep->signaler; + + INIT_LIST_HEAD(&dep->dfs_link); + + engine = sched_lock_engine(node, engine); + + /* Recheck after acquiring the engine->timeline.lock */ + if (prio <= node->attr.priority || node_signaled(node)) + continue; + + node->attr.priority = prio; + if (!list_empty(&node->link)) { + if (last != engine) { + pl = i915_sched_lookup_priolist(engine, prio); + last = engine; + } + list_move_tail(&node->link, pl); + } else { + /* + * If the request is not in the priolist queue because + * it is not yet runnable, then it doesn't contribute + * to our preemption decisions. On the other hand, + * if the request is on the HW, it too is not in the + * queue; but in that case we may still need to reorder + * the inflight requests. + */ + if (!i915_sw_fence_done(&node_to_request(node)->submit)) + continue; + } + + if (prio <= engine->execlists.queue_priority) + continue; + + /* + * If we are already the currently executing context, don't + * bother evaluating if we should preempt ourselves. + */ + if (node_to_request(node)->global_seqno && + i915_seqno_passed(port_request(engine->execlists.port)->global_seqno, + node_to_request(node)->global_seqno)) + continue; + + /* Defer (tasklet) submission until after all of our updates. */ + engine->execlists.queue_priority = prio; + tasklet_hi_schedule(&engine->execlists.tasklet); + } + + spin_unlock_irq(&engine->timeline.lock); + +out_unlock: + spin_unlock(&schedule_lock); +} diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h index 53dc7dbf88b9b..68d84a45ad7f4 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.h +++ b/drivers/gpu/drm/i915/i915_scheduler.h @@ -8,9 +8,14 @@ #define _I915_SCHEDULER_H_ #include +#include #include +struct drm_i915_private; +struct i915_request; +struct intel_engine_cs; + enum { I915_PRIORITY_MIN = I915_CONTEXT_MIN_USER_PRIORITY - 1, I915_PRIORITY_NORMAL = I915_CONTEXT_DEFAULT_PRIORITY, @@ -77,4 +82,24 @@ struct i915_dependency { #define I915_DEPENDENCY_ALLOC BIT(0) }; +void i915_sched_node_init(struct i915_sched_node *node); + +bool __i915_sched_node_add_dependency(struct i915_sched_node *node, + struct i915_sched_node *signal, + struct i915_dependency *dep, + unsigned long flags); + +int i915_sched_node_add_dependency(struct drm_i915_private *i915, + struct i915_sched_node *node, + struct i915_sched_node *signal); + +void i915_sched_node_fini(struct drm_i915_private *i915, + struct i915_sched_node *node); + +void i915_schedule(struct i915_request *request, + const struct i915_sched_attr *attr); + +struct list_head * +i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio); + #endif /* _I915_SCHEDULER_H_ */ diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 4c5c2b39e65ca..650e01f0c1979 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -13188,13 +13188,12 @@ intel_prepare_plane_fb(struct drm_plane *plane, ret = intel_plane_pin_fb(to_intel_plane_state(new_state)); - fb_obj_bump_render_priority(obj); - mutex_unlock(&dev_priv->drm.struct_mutex); i915_gem_object_unpin_pages(obj); if (ret) return ret; + fb_obj_bump_render_priority(obj); intel_fb_obj_flush(obj, ORIGIN_DIRTYFB); if (!new_state->fence) { /* implicit fencing */ diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 5f324d6b44d59..28d56387edf59 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -259,102 +259,6 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx, ce->lrc_desc = desc; } -static void assert_priolists(struct intel_engine_execlists * const execlists, - long queue_priority) -{ - struct rb_node *rb; - long last_prio, i; - - if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) - return; - - GEM_BUG_ON(rb_first_cached(&execlists->queue) != - rb_first(&execlists->queue.rb_root)); - - last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1; - for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { - struct i915_priolist *p = to_priolist(rb); - - GEM_BUG_ON(p->priority >= last_prio); - last_prio = p->priority; - - GEM_BUG_ON(!p->used); - for (i = 0; i < ARRAY_SIZE(p->requests); i++) { - if (list_empty(&p->requests[i])) - continue; - - GEM_BUG_ON(!(p->used & BIT(i))); - } - } -} - -static struct list_head * -lookup_priolist(struct intel_engine_cs *engine, int prio) -{ - struct intel_engine_execlists * const execlists = &engine->execlists; - struct i915_priolist *p; - struct rb_node **parent, *rb; - bool first = true; - int idx, i; - - assert_priolists(execlists, INT_MAX); - - /* buckets sorted from highest [in slot 0] to lowest priority */ - idx = I915_PRIORITY_COUNT - (prio & I915_PRIORITY_MASK) - 1; - prio >>= I915_USER_PRIORITY_SHIFT; - if (unlikely(execlists->no_priolist)) - prio = I915_PRIORITY_NORMAL; - -find_priolist: - /* most positive priority is scheduled first, equal priorities fifo */ - rb = NULL; - parent = &execlists->queue.rb_root.rb_node; - while (*parent) { - rb = *parent; - p = to_priolist(rb); - if (prio > p->priority) { - parent = &rb->rb_left; - } else if (prio < p->priority) { - parent = &rb->rb_right; - first = false; - } else { - goto out; - } - } - - if (prio == I915_PRIORITY_NORMAL) { - p = &execlists->default_priolist; - } else { - p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC); - /* Convert an allocation failure to a priority bump */ - if (unlikely(!p)) { - prio = I915_PRIORITY_NORMAL; /* recurses just once */ - - /* To maintain ordering with all rendering, after an - * allocation failure we have to disable all scheduling. - * Requests will then be executed in fifo, and schedule - * will ensure that dependencies are emitted in fifo. - * There will be still some reordering with existing - * requests, so if userspace lied about their - * dependencies that reordering may be visible. - */ - execlists->no_priolist = true; - goto find_priolist; - } - } - - p->priority = prio; - for (i = 0; i < ARRAY_SIZE(p->requests); i++) - INIT_LIST_HEAD(&p->requests[i]); - rb_link_node(&p->node, rb, parent); - rb_insert_color_cached(&p->node, &execlists->queue, first); - p->used = 0; - -out: - p->used |= BIT(idx); - return &p->requests[idx]; -} - static void unwind_wa_tail(struct i915_request *rq) { rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES); @@ -381,7 +285,7 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine) GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); if (rq_prio(rq) != prio) { prio = rq_prio(rq); - pl = lookup_priolist(engine, prio); + pl = i915_sched_lookup_priolist(engine, prio); } GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); @@ -398,7 +302,7 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine) if (!(prio & I915_PRIORITY_NEWCLIENT)) { prio |= I915_PRIORITY_NEWCLIENT; list_move_tail(&active->sched.link, - lookup_priolist(engine, prio)); + i915_sched_lookup_priolist(engine, prio)); } } @@ -792,7 +696,6 @@ done: */ execlists->queue_priority = port != execlists->port ? rq_prio(last) : INT_MIN; - assert_priolists(execlists, execlists->queue_priority); if (submit) { port_assign(port, last); @@ -1119,12 +1022,7 @@ static void queue_request(struct intel_engine_cs *engine, struct i915_sched_node *node, int prio) { - list_add_tail(&node->link, lookup_priolist(engine, prio)); -} - -static void __update_queue(struct intel_engine_cs *engine, int prio) -{ - engine->execlists.queue_priority = prio; + list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio)); } static void __submit_queue_imm(struct intel_engine_cs *engine) @@ -1143,7 +1041,7 @@ static void __submit_queue_imm(struct intel_engine_cs *engine) static void submit_queue(struct intel_engine_cs *engine, int prio) { if (prio > engine->execlists.queue_priority) { - __update_queue(engine, prio); + engine->execlists.queue_priority = prio; __submit_queue_imm(engine); } } @@ -1166,162 +1064,6 @@ static void execlists_submit_request(struct i915_request *request) spin_unlock_irqrestore(&engine->timeline.lock, flags); } -static struct i915_request *sched_to_request(struct i915_sched_node *node) -{ - return container_of(node, struct i915_request, sched); -} - -static struct intel_engine_cs * -sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked) -{ - struct intel_engine_cs *engine = sched_to_request(node)->engine; - - GEM_BUG_ON(!locked); - - if (engine != locked) { - spin_unlock(&locked->timeline.lock); - spin_lock(&engine->timeline.lock); - } - - return engine; -} - -static void execlists_schedule(struct i915_request *request, - const struct i915_sched_attr *attr) -{ - struct list_head *uninitialized_var(pl); - struct intel_engine_cs *engine, *last; - struct i915_dependency *dep, *p; - struct i915_dependency stack; - const int prio = attr->priority; - LIST_HEAD(dfs); - - GEM_BUG_ON(prio == I915_PRIORITY_INVALID); - - if (i915_request_completed(request)) - return; - - if (prio <= READ_ONCE(request->sched.attr.priority)) - return; - - /* Need BKL in order to use the temporary link inside i915_dependency */ - lockdep_assert_held(&request->i915->drm.struct_mutex); - - stack.signaler = &request->sched; - list_add(&stack.dfs_link, &dfs); - - /* - * Recursively bump all dependent priorities to match the new request. - * - * A naive approach would be to use recursion: - * static void update_priorities(struct i915_sched_node *node, prio) { - * list_for_each_entry(dep, &node->signalers_list, signal_link) - * update_priorities(dep->signal, prio) - * queue_request(node); - * } - * but that may have unlimited recursion depth and so runs a very - * real risk of overunning the kernel stack. Instead, we build - * a flat list of all dependencies starting with the current request. - * As we walk the list of dependencies, we add all of its dependencies - * to the end of the list (this may include an already visited - * request) and continue to walk onwards onto the new dependencies. The - * end result is a topological list of requests in reverse order, the - * last element in the list is the request we must execute first. - */ - list_for_each_entry(dep, &dfs, dfs_link) { - struct i915_sched_node *node = dep->signaler; - - /* - * Within an engine, there can be no cycle, but we may - * refer to the same dependency chain multiple times - * (redundant dependencies are not eliminated) and across - * engines. - */ - list_for_each_entry(p, &node->signalers_list, signal_link) { - GEM_BUG_ON(p == dep); /* no cycles! */ - - if (i915_sched_node_signaled(p->signaler)) - continue; - - GEM_BUG_ON(p->signaler->attr.priority < node->attr.priority); - if (prio > READ_ONCE(p->signaler->attr.priority)) - list_move_tail(&p->dfs_link, &dfs); - } - } - - /* - * If we didn't need to bump any existing priorities, and we haven't - * yet submitted this request (i.e. there is no potential race with - * execlists_submit_request()), we can set our own priority and skip - * acquiring the engine locks. - */ - if (request->sched.attr.priority == I915_PRIORITY_INVALID) { - GEM_BUG_ON(!list_empty(&request->sched.link)); - request->sched.attr = *attr; - if (stack.dfs_link.next == stack.dfs_link.prev) - return; - __list_del_entry(&stack.dfs_link); - } - - last = NULL; - engine = request->engine; - spin_lock_irq(&engine->timeline.lock); - - /* Fifo and depth-first replacement ensure our deps execute before us */ - list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) { - struct i915_sched_node *node = dep->signaler; - - INIT_LIST_HEAD(&dep->dfs_link); - - engine = sched_lock_engine(node, engine); - - /* Recheck after acquiring the engine->timeline.lock */ - if (prio <= node->attr.priority) - continue; - - if (i915_sched_node_signaled(node)) - continue; - - node->attr.priority = prio; - if (!list_empty(&node->link)) { - if (last != engine) { - pl = lookup_priolist(engine, prio); - last = engine; - } - list_move_tail(&node->link, pl); - } else { - /* - * If the request is not in the priolist queue because - * it is not yet runnable, then it doesn't contribute - * to our preemption decisions. On the other hand, - * if the request is on the HW, it too is not in the - * queue; but in that case we may still need to reorder - * the inflight requests. - */ - if (!i915_sw_fence_done(&sched_to_request(node)->submit)) - continue; - } - - if (prio <= engine->execlists.queue_priority) - continue; - - /* - * If we are already the currently executing context, don't - * bother evaluating if we should preempt ourselves. - */ - if (sched_to_request(node)->global_seqno && - i915_seqno_passed(port_request(engine->execlists.port)->global_seqno, - sched_to_request(node)->global_seqno)) - continue; - - /* Defer (tasklet) submission until after all of our updates. */ - __update_queue(engine, prio); - tasklet_hi_schedule(&engine->execlists.tasklet); - } - - spin_unlock_irq(&engine->timeline.lock); -} - static void execlists_context_destroy(struct intel_context *ce) { GEM_BUG_ON(ce->pin_count); @@ -2359,7 +2101,7 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine) { engine->submit_request = execlists_submit_request; engine->cancel_requests = execlists_cancel_requests; - engine->schedule = execlists_schedule; + engine->schedule = i915_schedule; engine->execlists.tasklet.func = execlists_submission_tasklet; engine->reset.prepare = execlists_reset_prepare; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 1534de5bb8523..f6ec48a75a698 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -498,11 +498,10 @@ struct intel_engine_cs { */ void (*submit_request)(struct i915_request *rq); - /* Call when the priority on a request has changed and it and its + /* + * Call when the priority on a request has changed and it and its * dependencies may need rescheduling. Note the request itself may * not be ready to run! - * - * Called under the struct_mutex. */ void (*schedule)(struct i915_request *request, const struct i915_sched_attr *attr); -- cgit v1.2.3 From e9eaf82d97a2b05460ff5ef6a3e07446f7d049fe Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 1 Oct 2018 15:47:55 +0100 Subject: drm/i915: Priority boost for waiting clients Latency is in the eye of the beholder. In the case where a client stops and waits for the gpu, give that request chain a small priority boost (not so that it overtakes higher priority clients, to preserve the external ordering) so that ideally the wait completes earlier. v2: Tvrtko recommends to keep the boost-from-user-stall as small as possible and to allow new client flows to be preferred for interactivity over stalls. Testcase: igt/gem_sync/switch-default Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Cc: Dmitry Rogozhkin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20181001144755.7978-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 5 ++++- drivers/gpu/drm/i915/i915_request.c | 2 ++ drivers/gpu/drm/i915/i915_request.h | 5 +++-- drivers/gpu/drm/i915/i915_scheduler.c | 34 ++++++++++++++++++++++++++++------ drivers/gpu/drm/i915/i915_scheduler.h | 7 +++++-- 5 files changed, 42 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_request.c') diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 28e943ee8b5e6..7d45e71100bce 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1748,6 +1748,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, */ err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE | + I915_WAIT_PRIORITY | (write_domain ? I915_WAIT_ALL : 0), MAX_SCHEDULE_TIMEOUT, to_rps_client(file)); @@ -3751,7 +3752,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) start = ktime_get(); ret = i915_gem_object_wait(obj, - I915_WAIT_INTERRUPTIBLE | I915_WAIT_ALL, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_PRIORITY | + I915_WAIT_ALL, to_wait_timeout(args->timeout_ns), to_rps_client(file)); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index d73ad490a2612..abd4dacbab8e9 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1237,6 +1237,8 @@ long i915_request_wait(struct i915_request *rq, add_wait_queue(errq, &reset); intel_wait_init(&wait); + if (flags & I915_WAIT_PRIORITY) + i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT); restart: do { diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 5f7361e0fca6c..90e9d170a0cd5 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -277,8 +277,9 @@ long i915_request_wait(struct i915_request *rq, __attribute__((nonnull(1))); #define I915_WAIT_INTERRUPTIBLE BIT(0) #define I915_WAIT_LOCKED BIT(1) /* struct_mutex held, handle GPU reset */ -#define I915_WAIT_ALL BIT(2) /* used by i915_gem_object_wait() */ -#define I915_WAIT_FOR_IDLE_BOOST BIT(3) +#define I915_WAIT_PRIORITY BIT(2) /* small priority bump for the request */ +#define I915_WAIT_ALL BIT(3) /* used by i915_gem_object_wait() */ +#define I915_WAIT_FOR_IDLE_BOOST BIT(4) static inline bool intel_engine_has_started(struct intel_engine_cs *engine, u32 seqno); diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index de9a2ba7c3bca..340faea6c08a2 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -239,7 +239,8 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked) return engine; } -void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr) +static void __i915_schedule(struct i915_request *rq, + const struct i915_sched_attr *attr) { struct list_head *uninitialized_var(pl); struct intel_engine_cs *engine, *last; @@ -248,6 +249,8 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr) const int prio = attr->priority; LIST_HEAD(dfs); + /* Needed in order to use the temporary link inside i915_dependency */ + lockdep_assert_held(&schedule_lock); GEM_BUG_ON(prio == I915_PRIORITY_INVALID); if (i915_request_completed(rq)) @@ -256,9 +259,6 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr) if (prio <= READ_ONCE(rq->sched.attr.priority)) return; - /* Needed in order to use the temporary link inside i915_dependency */ - spin_lock(&schedule_lock); - stack.signaler = &rq->sched; list_add(&stack.dfs_link, &dfs); @@ -312,7 +312,7 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr) rq->sched.attr = *attr; if (stack.dfs_link.next == stack.dfs_link.prev) - goto out_unlock; + return; __list_del_entry(&stack.dfs_link); } @@ -371,7 +371,29 @@ void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr) } spin_unlock_irq(&engine->timeline.lock); +} -out_unlock: +void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr) +{ + spin_lock(&schedule_lock); + __i915_schedule(rq, attr); spin_unlock(&schedule_lock); } + +void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump) +{ + struct i915_sched_attr attr; + + GEM_BUG_ON(bump & ~I915_PRIORITY_MASK); + + if (READ_ONCE(rq->sched.attr.priority) == I915_PRIORITY_INVALID) + return; + + spin_lock_bh(&schedule_lock); + + attr = rq->sched.attr; + attr.priority |= bump; + __i915_schedule(rq, &attr); + + spin_unlock_bh(&schedule_lock); +} diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h index 68d84a45ad7f4..dbe9cb7ecd829 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.h +++ b/drivers/gpu/drm/i915/i915_scheduler.h @@ -24,13 +24,14 @@ enum { I915_PRIORITY_INVALID = INT_MIN }; -#define I915_USER_PRIORITY_SHIFT 1 +#define I915_USER_PRIORITY_SHIFT 2 #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT) #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT) #define I915_PRIORITY_MASK (I915_PRIORITY_COUNT - 1) -#define I915_PRIORITY_NEWCLIENT ((u8)BIT(0)) +#define I915_PRIORITY_WAIT ((u8)BIT(0)) +#define I915_PRIORITY_NEWCLIENT ((u8)BIT(1)) struct i915_sched_attr { /** @@ -99,6 +100,8 @@ void i915_sched_node_fini(struct drm_i915_private *i915, void i915_schedule(struct i915_request *request, const struct i915_sched_attr *attr); +void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump); + struct list_head * i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio); -- cgit v1.2.3 From 33373258cf021869380140078bb3f9732b7d852d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 5 Oct 2018 09:03:00 +0100 Subject: drm/i915: Remove the global cache shrink & rcu barrier on allocation failure Earlier, we reasoned that having idled the gpu under mempressure, that would be a good time to trim our request slabs in order to perform the next request allocation. We have stopped performing the global operation on the device (no idling) and wish to make the allocation failure handling more local, so out with the global barrier that may take a long time. Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20181005080300.9908-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_request.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_request.c') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index abd4dacbab8e9..28819f8c4da6f 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -655,17 +655,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) if (rq) cond_synchronize_rcu(rq->rcustate); - /* - * We've forced the client to stall and catch up with whatever - * backlog there might have been. As we are assuming that we - * caused the mempressure, now is an opportune time to - * recover as much memory from the request pool as is possible. - * Having already penalized the client to stall, we spend - * a little extra time to re-optimise page allocation. - */ - kmem_cache_shrink(i915->requests); - rcu_barrier(); /* Recover the TYPESAFE_BY_RCU pages */ - rq = kmem_cache_alloc(i915->requests, GFP_KERNEL); if (!rq) { ret = -ENOMEM; -- cgit v1.2.3 From 1e016a8693368c163b18104a28c4edc71858f2d2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 24 Oct 2018 11:49:39 +0100 Subject: drm/i915: Park signaling thread while wrapping the seqno A danger encountered when resetting the seqno (using debugfs/i915_next_seqno) is that as we change the breadcrumb stored in the HWSP, it may be inspected by the signaler thread leading to confusion in our sanity checks. <0> [136.331342] i915/sig-347 3..s1 136336154us : execlists_submission_tasklet: rcs0 awake?=1, active=5 <0> [136.331373] i915/sig-347 3d.s2 136336155us : process_csb: rcs0 cs-irq head=5, tail=0 <0> [136.331402] i915/sig-347 3d.s2 136336155us : process_csb: rcs0 csb[0]: status=0x00000018:0x00000002, active=0x5 <0> [136.331434] i915/sig-347 3d.s2 136336156us : process_csb: rcs0 out[0]: ctx=2.1, global=219 (fence 46:8455) (current 219), prio=0 <0> [136.331466] i915/sig-347 3d.s2 136336156us : process_csb: rcs0 completed ctx=2 <0> [136.332027] gem_exec-1049 0.... 136336246us : reset_all_global_seqno.part.5: rcs0 seqno 219 (current 219) -> -43 <0> [136.332056] gem_exec-1049 0.... 136336251us : reset_all_global_seqno.part.5: bcs0 seqno 183 (current 183) -> -43 <0> [136.332085] gem_exec-1049 0.... 136336255us : reset_all_global_seqno.part.5: vcs0 seqno 191 (current 191) -> -43 <0> [136.332114] gem_exec-1049 0.... 136336259us : reset_all_global_seqno.part.5: vcs1 seqno 180 (current 180) -> -43 <0> [136.332143] gem_exec-1049 0.... 136336262us : reset_all_global_seqno.part.5: vecs0 seqno 212 (current 212) -> -43 <0> [136.332174] i915/sig-347 3.... 136336280us : intel_breadcrumbs_signaler: intel_breadcrumbs_signaler:673 GEM_BUG_ON(!i915_request_completed(rq)) Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20181024104939.2861-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_request.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/gpu/drm/i915/i915_request.c') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 28819f8c4da6f..71107540581dc 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -136,6 +136,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno) intel_engine_get_seqno(engine), seqno); + kthread_park(engine->breadcrumbs.signaler); + if (!i915_seqno_passed(seqno, engine->timeline.seqno)) { /* Flush any waiters before we reuse the seqno */ intel_engine_disarm_breadcrumbs(engine); @@ -150,6 +152,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno) /* Finally reset hw state */ intel_engine_init_global_seqno(engine, seqno); engine->timeline.seqno = seqno; + + kthread_unpark(engine->breadcrumbs.signaler); } list_for_each_entry(timeline, &i915->gt.timelines, link) -- cgit v1.2.3