summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/xe/xe_survivability_mode.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/xe/xe_survivability_mode.c')
-rw-r--r--drivers/gpu/drm/xe/xe_survivability_mode.c178
1 files changed, 135 insertions, 43 deletions
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c
index 8f7b0add2364..1662bfddd4bc 100644
--- a/drivers/gpu/drm/xe/xe_survivability_mode.c
+++ b/drivers/gpu/drm/xe/xe_survivability_mode.c
@@ -22,15 +22,18 @@
#define MAX_SCRATCH_MMIO 8
/**
- * DOC: Xe Boot Survivability
+ * DOC: Survivability Mode
*
- * Boot Survivability is a software based workflow for recovering a system in a failed boot state
+ * Survivability Mode is a software based workflow for recovering a system in a failed boot state
* Here system recoverability is concerned with recovering the firmware responsible for boot.
*
- * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware
- * to be flashed through mei and collect telemetry. The driver's probe flow is modified
- * such that it enters survivability mode when pcode initialization is incomplete and boot status
- * denotes a failure.
+ * Boot Survivability
+ * ===================
+ *
+ * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow
+ * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is
+ * modified such that it enters survivability mode when pcode initialization is incomplete and boot
+ * status denotes a failure.
*
* Survivability mode can also be entered manually using the survivability mode attribute available
* through configfs which is beneficial in several usecases. It can be used to address scenarios
@@ -48,7 +51,7 @@
* Survivability mode is indicated by the below admin-only readable sysfs which provides additional
* debug information::
*
- * /sys/bus/pci/devices/<device>/surivability_mode
+ * /sys/bus/pci/devices/<device>/survivability_mode
*
* Capability Information:
* Provides boot status
@@ -58,6 +61,22 @@
* Provides history of previous failures
* Auxiliary Information
* Certain failures may have information in addition to postcode information
+ *
+ * Runtime Survivability
+ * =====================
+ *
+ * Certain runtime firmware errors can cause the device to enter a wedged state
+ * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation.
+ * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and
+ * is indicated by the presence of survivability mode sysfs::
+ *
+ * /sys/bus/pci/devices/<device>/survivability_mode
+ *
+ * Survivability mode sysfs provides information about the type of survivability mode.
+ *
+ * When such errors occur, userspace is notified with the drm device wedged uevent and runtime
+ * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd
+ * to restore device to normal operation.
*/
static u32 aux_history_offset(u32 reg_value)
@@ -123,6 +142,14 @@ static void log_survivability_info(struct pci_dev *pdev)
}
}
+static int check_boot_failure(struct xe_device *xe)
+{
+ struct xe_survivability *survivability = &xe->survivability;
+
+ return survivability->boot_status == NON_CRITICAL_FAILURE ||
+ survivability->boot_status == CRITICAL_FAILURE;
+}
+
static ssize_t survivability_mode_show(struct device *dev,
struct device_attribute *attr, char *buff)
{
@@ -132,6 +159,12 @@ static ssize_t survivability_mode_show(struct device *dev,
struct xe_survivability_info *info = survivability->info;
int index = 0, count = 0;
+ count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n",
+ survivability->type ? "Runtime" : "Boot");
+
+ if (!check_boot_failure(xe))
+ return count;
+
for (index = 0; index < MAX_SCRATCH_MMIO; index++) {
if (info[index].reg)
count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name,
@@ -152,12 +185,11 @@ static void xe_survivability_mode_fini(void *arg)
sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr);
}
-static int enable_survivability_mode(struct pci_dev *pdev)
+static int create_survivability_sysfs(struct pci_dev *pdev)
{
struct device *dev = &pdev->dev;
struct xe_device *xe = pdev_to_xe_device(pdev);
- struct xe_survivability *survivability = &xe->survivability;
- int ret = 0;
+ int ret;
/* create survivability mode sysfs */
ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr);
@@ -171,6 +203,20 @@ static int enable_survivability_mode(struct pci_dev *pdev)
if (ret)
return ret;
+ return 0;
+}
+
+static int enable_boot_survivability_mode(struct pci_dev *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct xe_device *xe = pdev_to_xe_device(pdev);
+ struct xe_survivability *survivability = &xe->survivability;
+ int ret = 0;
+
+ ret = create_survivability_sysfs(pdev);
+ if (ret)
+ return ret;
+
/* Make sure xe_heci_gsc_init() knows about survivability mode */
survivability->mode = true;
@@ -193,15 +239,36 @@ err:
return ret;
}
+static int init_survivability_mode(struct xe_device *xe)
+{
+ struct xe_survivability *survivability = &xe->survivability;
+ struct xe_survivability_info *info;
+
+ survivability->size = MAX_SCRATCH_MMIO;
+
+ info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
+ GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ survivability->info = info;
+
+ populate_survivability_info(xe);
+
+ return 0;
+}
+
/**
- * xe_survivability_mode_is_enabled - check if survivability mode is enabled
+ * xe_survivability_mode_is_boot_enabled- check if boot survivability mode is enabled
* @xe: xe device instance
*
- * Returns true if in survivability mode, false otherwise
+ * Returns true if in boot survivability mode of type, else false
*/
-bool xe_survivability_mode_is_enabled(struct xe_device *xe)
+bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe)
{
- return xe->survivability.mode;
+ struct xe_survivability *survivability = &xe->survivability;
+
+ return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT;
}
/**
@@ -222,19 +289,10 @@ bool xe_survivability_mode_is_requested(struct xe_device *xe)
u32 data;
bool survivability_mode;
- if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
+ if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE)
return false;
survivability_mode = xe_configfs_get_survivability_mode(pdev);
-
- if (xe->info.platform < XE_BATTLEMAGE) {
- if (survivability_mode) {
- dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n");
- xe_configfs_clear_survivability_mode(pdev);
- }
- return false;
- }
-
/* Enable survivability mode if set via configfs */
if (survivability_mode)
return true;
@@ -242,44 +300,78 @@ bool xe_survivability_mode_is_requested(struct xe_device *xe)
data = xe_mmio_read32(mmio, PCODE_SCRATCH(0));
survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data);
- return survivability->boot_status == NON_CRITICAL_FAILURE ||
- survivability->boot_status == CRITICAL_FAILURE;
+ return check_boot_failure(xe);
}
/**
- * xe_survivability_mode_enable - Initialize and enable the survivability mode
+ * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode
* @xe: xe device instance
*
- * Initialize survivability information and enable survivability mode
+ * Initialize survivability information and enable runtime survivability mode.
+ * Runtime survivability mode is enabled when certain errors cause the device to be
+ * in non-recoverable state. The device is declared wedged with the appropriate
+ * recovery method and survivability mode sysfs exposed to userspace
*
- * Return: 0 if survivability mode is enabled or not requested; negative error
- * code otherwise.
+ * Return: 0 if runtime survivability mode is enabled, negative error code otherwise.
*/
-int xe_survivability_mode_enable(struct xe_device *xe)
+int xe_survivability_mode_runtime_enable(struct xe_device *xe)
{
struct xe_survivability *survivability = &xe->survivability;
- struct xe_survivability_info *info;
struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+ int ret;
- if (!xe_survivability_mode_is_requested(xe))
- return 0;
+ if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) {
+ dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n");
+ return -EINVAL;
+ }
- survivability->size = MAX_SCRATCH_MMIO;
+ ret = init_survivability_mode(xe);
+ if (ret)
+ return ret;
- info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info),
- GFP_KERNEL);
- if (!info)
- return -ENOMEM;
+ ret = create_survivability_sysfs(pdev);
+ if (ret)
+ dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n");
- survivability->info = info;
+ survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME;
+ dev_err(&pdev->dev, "Runtime Survivability mode enabled\n");
- populate_survivability_info(xe);
+ xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR);
+ xe_device_declare_wedged(xe);
+ dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n");
+
+ return 0;
+}
+
+/**
+ * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode
+ * @xe: xe device instance
+ *
+ * Initialize survivability information and enable boot survivability mode
+ *
+ * Return: 0 if boot survivability mode is enabled or not requested, negative error
+ * code otherwise.
+ */
+int xe_survivability_mode_boot_enable(struct xe_device *xe)
+{
+ struct xe_survivability *survivability = &xe->survivability;
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+ int ret;
- /* Only log debug information and exit if it is a critical failure */
+ if (!xe_survivability_mode_is_requested(xe))
+ return 0;
+
+ ret = init_survivability_mode(xe);
+ if (ret)
+ return ret;
+
+ /* Log breadcrumbs but do not enter survivability mode for Critical boot errors */
if (survivability->boot_status == CRITICAL_FAILURE) {
log_survivability_info(pdev);
return -ENXIO;
}
- return enable_survivability_mode(pdev);
+ survivability->type = XE_SURVIVABILITY_TYPE_BOOT;
+
+ return enable_boot_survivability_mode(pdev);
}