summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2025-09-12 09:39:06 +1000
committerDave Airlie <airlied@redhat.com>2025-09-12 09:44:07 +1000
commit9a3f210737e958c3f45a4ce0d7f1ff330af3965f (patch)
tree639fbcdef9c76979eb1f265a760f98291908420f
parentdab1f85526a454cd81174a9df70961169222efa2 (diff)
parentfd99415ec8a80866e5e19f7835876e7b4f294946 (diff)
Merge tag 'drm-xe-fixes-2025-09-11' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes
- Don't touch survivability_mode on fini (Michal) - Fixes around eviction and suspend (Thomas) - Extend Wa_13011645652 to PTL-H, WCL (Julia) Signed-off-by: Dave Airlie <airlied@redhat.com> From: Rodrigo Vivi <rodrigo.vivi@intel.com> Link: https://lore.kernel.org/r/aMLq7QlaEPHGKXKX@intel.com
-rw-r--r--drivers/gpu/drm/xe/tests/xe_bo.c2
-rw-r--r--drivers/gpu/drm/xe/tests/xe_dma_buf.c10
-rw-r--r--drivers/gpu/drm/xe/xe_bo.c16
-rw-r--r--drivers/gpu/drm/xe/xe_bo.h2
-rw-r--r--drivers/gpu/drm/xe/xe_device_types.h6
-rw-r--r--drivers/gpu/drm/xe/xe_dma_buf.c2
-rw-r--r--drivers/gpu/drm/xe/xe_exec.c9
-rw-r--r--drivers/gpu/drm/xe/xe_pm.c42
-rw-r--r--drivers/gpu/drm/xe/xe_survivability_mode.c3
-rw-r--r--drivers/gpu/drm/xe/xe_vm.c42
-rw-r--r--drivers/gpu/drm/xe/xe_vm.h2
-rw-r--r--drivers/gpu/drm/xe/xe_vm_types.h5
-rw-r--r--drivers/gpu/drm/xe/xe_wa_oob.rules3
13 files changed, 115 insertions, 29 deletions
diff --git a/drivers/gpu/drm/xe/tests/xe_bo.c b/drivers/gpu/drm/xe/tests/xe_bo.c
index bb469096d072..7b40cc8be1c9 100644
--- a/drivers/gpu/drm/xe/tests/xe_bo.c
+++ b/drivers/gpu/drm/xe/tests/xe_bo.c
@@ -236,7 +236,7 @@ static int evict_test_run_tile(struct xe_device *xe, struct xe_tile *tile, struc
}
xe_bo_lock(external, false);
- err = xe_bo_pin_external(external);
+ err = xe_bo_pin_external(external, false);
xe_bo_unlock(external);
if (err) {
KUNIT_FAIL(test, "external bo pin err=%pe\n",
diff --git a/drivers/gpu/drm/xe/tests/xe_dma_buf.c b/drivers/gpu/drm/xe/tests/xe_dma_buf.c
index c53f67ce4b0a..121f17c112ec 100644
--- a/drivers/gpu/drm/xe/tests/xe_dma_buf.c
+++ b/drivers/gpu/drm/xe/tests/xe_dma_buf.c
@@ -89,15 +89,7 @@ static void check_residency(struct kunit *test, struct xe_bo *exported,
return;
}
- /*
- * If on different devices, the exporter is kept in system if
- * possible, saving a migration step as the transfer is just
- * likely as fast from system memory.
- */
- if (params->mem_mask & XE_BO_FLAG_SYSTEM)
- KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, XE_PL_TT));
- else
- KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type));
+ KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(exported, mem_type));
if (params->force_different_devices)
KUNIT_EXPECT_TRUE(test, xe_bo_is_mem_type(imported, XE_PL_TT));
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 9954bb458ce1..bae7ff2e5927 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -186,6 +186,8 @@ static void try_add_system(struct xe_device *xe, struct xe_bo *bo,
bo->placements[*c] = (struct ttm_place) {
.mem_type = XE_PL_TT,
+ .flags = (bo_flags & XE_BO_FLAG_VRAM_MASK) ?
+ TTM_PL_FLAG_FALLBACK : 0,
};
*c += 1;
}
@@ -2269,6 +2271,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
/**
* xe_bo_pin_external - pin an external BO
* @bo: buffer object to be pinned
+ * @in_place: Pin in current placement, don't attempt to migrate.
*
* Pin an external (not tied to a VM, can be exported via dma-buf / prime FD)
* BO. Unique call compared to xe_bo_pin as this function has it own set of
@@ -2276,7 +2279,7 @@ uint64_t vram_region_gpu_offset(struct ttm_resource *res)
*
* Returns 0 for success, negative error code otherwise.
*/
-int xe_bo_pin_external(struct xe_bo *bo)
+int xe_bo_pin_external(struct xe_bo *bo, bool in_place)
{
struct xe_device *xe = xe_bo_device(bo);
int err;
@@ -2285,9 +2288,11 @@ int xe_bo_pin_external(struct xe_bo *bo)
xe_assert(xe, xe_bo_is_user(bo));
if (!xe_bo_is_pinned(bo)) {
- err = xe_bo_validate(bo, NULL, false);
- if (err)
- return err;
+ if (!in_place) {
+ err = xe_bo_validate(bo, NULL, false);
+ if (err)
+ return err;
+ }
spin_lock(&xe->pinned.lock);
list_add_tail(&bo->pinned_link, &xe->pinned.late.external);
@@ -2440,6 +2445,9 @@ int xe_bo_validate(struct xe_bo *bo, struct xe_vm *vm, bool allow_res_evict)
};
int ret;
+ if (xe_bo_is_pinned(bo))
+ return 0;
+
if (vm) {
lockdep_assert_held(&vm->lock);
xe_vm_assert_held(vm);
diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h
index 02e8cde4c6b2..9ce94d252015 100644
--- a/drivers/gpu/drm/xe/xe_bo.h
+++ b/drivers/gpu/drm/xe/xe_bo.h
@@ -198,7 +198,7 @@ static inline void xe_bo_unlock_vm_held(struct xe_bo *bo)
}
}
-int xe_bo_pin_external(struct xe_bo *bo);
+int xe_bo_pin_external(struct xe_bo *bo, bool in_place);
int xe_bo_pin(struct xe_bo *bo);
void xe_bo_unpin_external(struct xe_bo *bo);
void xe_bo_unpin(struct xe_bo *bo);
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index d4d2c6854790..7ceb0c90f391 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -553,6 +553,12 @@ struct xe_device {
/** @pm_notifier: Our PM notifier to perform actions in response to various PM events. */
struct notifier_block pm_notifier;
+ /** @pm_block: Completion to block validating tasks on suspend / hibernate prepare */
+ struct completion pm_block;
+ /** @rebind_resume_list: List of wq items to kick on resume. */
+ struct list_head rebind_resume_list;
+ /** @rebind_resume_lock: Lock to protect the rebind_resume_list */
+ struct mutex rebind_resume_lock;
/** @pmt: Support the PMT driver callback interface */
struct {
diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c
index 346f857f3837..af64baf872ef 100644
--- a/drivers/gpu/drm/xe/xe_dma_buf.c
+++ b/drivers/gpu/drm/xe/xe_dma_buf.c
@@ -72,7 +72,7 @@ static int xe_dma_buf_pin(struct dma_buf_attachment *attach)
return ret;
}
- ret = xe_bo_pin_external(bo);
+ ret = xe_bo_pin_external(bo, true);
xe_assert(xe, !ret);
return 0;
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 44364c042ad7..374c831e691b 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -237,6 +237,15 @@ retry:
goto err_unlock_list;
}
+ /*
+ * It's OK to block interruptible here with the vm lock held, since
+ * on task freezing during suspend / hibernate, the call will
+ * return -ERESTARTSYS and the IOCTL will be rerun.
+ */
+ err = wait_for_completion_interruptible(&xe->pm_block);
+ if (err)
+ goto err_unlock_list;
+
vm_exec.vm = &vm->gpuvm;
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
if (xe_vm_in_lr_mode(vm)) {
diff --git a/drivers/gpu/drm/xe/xe_pm.c b/drivers/gpu/drm/xe/xe_pm.c
index e279b47ba03b..bb9b6ecad2af 100644
--- a/drivers/gpu/drm/xe/xe_pm.c
+++ b/drivers/gpu/drm/xe/xe_pm.c
@@ -24,6 +24,7 @@
#include "xe_pcode.h"
#include "xe_pxp.h"
#include "xe_trace.h"
+#include "xe_vm.h"
#include "xe_wa.h"
/**
@@ -290,6 +291,19 @@ static u32 vram_threshold_value(struct xe_device *xe)
return DEFAULT_VRAM_THRESHOLD;
}
+static void xe_pm_wake_rebind_workers(struct xe_device *xe)
+{
+ struct xe_vm *vm, *next;
+
+ mutex_lock(&xe->rebind_resume_lock);
+ list_for_each_entry_safe(vm, next, &xe->rebind_resume_list,
+ preempt.pm_activate_link) {
+ list_del_init(&vm->preempt.pm_activate_link);
+ xe_vm_resume_rebind_worker(vm);
+ }
+ mutex_unlock(&xe->rebind_resume_lock);
+}
+
static int xe_pm_notifier_callback(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -299,30 +313,30 @@ static int xe_pm_notifier_callback(struct notifier_block *nb,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
+ reinit_completion(&xe->pm_block);
xe_pm_runtime_get(xe);
err = xe_bo_evict_all_user(xe);
- if (err) {
+ if (err)
drm_dbg(&xe->drm, "Notifier evict user failed (%d)\n", err);
- xe_pm_runtime_put(xe);
- break;
- }
err = xe_bo_notifier_prepare_all_pinned(xe);
- if (err) {
+ if (err)
drm_dbg(&xe->drm, "Notifier prepare pin failed (%d)\n", err);
- xe_pm_runtime_put(xe);
- }
+ /*
+ * Keep the runtime pm reference until post hibernation / post suspend to
+ * avoid a runtime suspend interfering with evicted objects or backup
+ * allocations.
+ */
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
+ complete_all(&xe->pm_block);
+ xe_pm_wake_rebind_workers(xe);
xe_bo_notifier_unprepare_all_pinned(xe);
xe_pm_runtime_put(xe);
break;
}
- if (err)
- return NOTIFY_BAD;
-
return NOTIFY_DONE;
}
@@ -344,6 +358,14 @@ int xe_pm_init(struct xe_device *xe)
if (err)
return err;
+ err = drmm_mutex_init(&xe->drm, &xe->rebind_resume_lock);
+ if (err)
+ goto err_unregister;
+
+ init_completion(&xe->pm_block);
+ complete_all(&xe->pm_block);
+ INIT_LIST_HEAD(&xe->rebind_resume_list);
+
/* For now suspend/resume is only allowed with GuC */
if (!xe_device_uc_enabled(xe))
return 0;
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
index 41705f5d52e3..8f7b0add2364 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -41,6 +41,8 @@
*
* # echo 1 > /sys/kernel/config/xe/0000:03:00.0/survivability_mode
*
+ * It is the responsibility of the user to clear the mode once firmware flash is complete.
+ *
* Refer :ref:`xe_configfs` for more details on how to use configfs
*
* Survivability mode is indicated by the below admin-only readable sysfs which provides additional
@@ -147,7 +149,6 @@ static void xe_survivability_mode_fini(void *arg)
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
struct device *dev = &pdev->dev;
- xe_configfs_clear_survivability_mode(pdev);
sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
}
diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index d60c4b115304..dc4f61e56579 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -393,6 +393,9 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
&vm->rebind_list);
+ if (!try_wait_for_completion(&vm->xe->pm_block))
+ return -EAGAIN;
+
ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false);
if (ret)
return ret;
@@ -479,6 +482,33 @@ static int xe_preempt_work_begin(struct drm_exec *exec, struct xe_vm *vm,
return xe_vm_validate_rebind(vm, exec, vm->preempt.num_exec_queues);
}
+static bool vm_suspend_rebind_worker(struct xe_vm *vm)
+{
+ struct xe_device *xe = vm->xe;
+ bool ret = false;
+
+ mutex_lock(&xe->rebind_resume_lock);
+ if (!try_wait_for_completion(&vm->xe->pm_block)) {
+ ret = true;
+ list_move_tail(&vm->preempt.pm_activate_link, &xe->rebind_resume_list);
+ }
+ mutex_unlock(&xe->rebind_resume_lock);
+
+ return ret;
+}
+
+/**
+ * xe_vm_resume_rebind_worker() - Resume the rebind worker.
+ * @vm: The vm whose preempt worker to resume.
+ *
+ * Resume a preempt worker that was previously suspended by
+ * vm_suspend_rebind_worker().
+ */
+void xe_vm_resume_rebind_worker(struct xe_vm *vm)
+{
+ queue_work(vm->xe->ordered_wq, &vm->preempt.rebind_work);
+}
+
static void preempt_rebind_work_func(struct work_struct *w)
{
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
@@ -502,6 +532,11 @@ static void preempt_rebind_work_func(struct work_struct *w)
}
retry:
+ if (!try_wait_for_completion(&vm->xe->pm_block) && vm_suspend_rebind_worker(vm)) {
+ up_write(&vm->lock);
+ return;
+ }
+
if (xe_vm_userptr_check_repin(vm)) {
err = xe_vm_userptr_pin(vm);
if (err)
@@ -1714,6 +1749,7 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef)
if (flags & XE_VM_FLAG_LR_MODE) {
INIT_WORK(&vm->preempt.rebind_work, preempt_rebind_work_func);
xe_pm_runtime_get_noresume(xe);
+ INIT_LIST_HEAD(&vm->preempt.pm_activate_link);
}
if (flags & XE_VM_FLAG_FAULT_MODE) {
@@ -1895,8 +1931,12 @@ void xe_vm_close_and_put(struct xe_vm *vm)
xe_assert(xe, !vm->preempt.num_exec_queues);
xe_vm_close(vm);
- if (xe_vm_in_preempt_fence_mode(vm))
+ if (xe_vm_in_preempt_fence_mode(vm)) {
+ mutex_lock(&xe->rebind_resume_lock);
+ list_del_init(&vm->preempt.pm_activate_link);
+ mutex_unlock(&xe->rebind_resume_lock);
flush_work(&vm->preempt.rebind_work);
+ }
if (xe_vm_in_fault_mode(vm))
xe_svm_close(vm);
diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
index 2ecb417c19a2..82b112795807 100644
--- a/drivers/gpu/drm/xe/xe_vm.h
+++ b/drivers/gpu/drm/xe/xe_vm.h
@@ -273,6 +273,8 @@ struct dma_fence *xe_vm_bind_kernel_bo(struct xe_vm *vm, struct xe_bo *bo,
struct xe_exec_queue *q, u64 addr,
enum xe_cache_level cache_lvl);
+void xe_vm_resume_rebind_worker(struct xe_vm *vm);
+
/**
* xe_vm_resv() - Return's the vm's reservation object
* @vm: The vm
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index 8a07feef503b..6058cf739388 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -293,6 +293,11 @@ struct xe_vm {
* BOs
*/
struct work_struct rebind_work;
+ /**
+ * @preempt.pm_activate_link: Link to list of rebind workers to be
+ * kicked on resume.
+ */
+ struct list_head pm_activate_link;
} preempt;
/** @um: unified memory state */
diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules
index e990f20eccfe..710f4423726c 100644
--- a/drivers/gpu/drm/xe/xe_wa_oob.rules
+++ b/drivers/gpu/drm/xe/xe_wa_oob.rules
@@ -30,7 +30,8 @@
16022287689 GRAPHICS_VERSION(2001)
GRAPHICS_VERSION(2004)
13011645652 GRAPHICS_VERSION(2004)
- GRAPHICS_VERSION(3001)
+ GRAPHICS_VERSION_RANGE(3000, 3001)
+ GRAPHICS_VERSION(3003)
14022293748 GRAPHICS_VERSION_RANGE(2001, 2002)
GRAPHICS_VERSION(2004)
GRAPHICS_VERSION_RANGE(3000, 3001)