diff options
Diffstat (limited to 'drivers/gpu/drm/xe/xe_survivability_mode.c')
-rw-r--r-- | drivers/gpu/drm/xe/xe_survivability_mode.c | 178 |
1 files changed, 135 insertions, 43 deletions
diff --git a/drivers/gpu/drm/xe/xe_survivability_mode.c b/drivers/gpu/drm/xe/xe_survivability_mode.c index 8f7b0add2364..1662bfddd4bc 100644 --- a/drivers/gpu/drm/xe/xe_survivability_mode.c +++ b/drivers/gpu/drm/xe/xe_survivability_mode.c @@ -22,15 +22,18 @@ #define MAX_SCRATCH_MMIO 8 /** - * DOC: Xe Boot Survivability + * DOC: Survivability Mode * - * Boot Survivability is a software based workflow for recovering a system in a failed boot state + * Survivability Mode is a software based workflow for recovering a system in a failed boot state * Here system recoverability is concerned with recovering the firmware responsible for boot. * - * This is implemented by loading the driver with bare minimum (no drm card) to allow the firmware - * to be flashed through mei and collect telemetry. The driver's probe flow is modified - * such that it enters survivability mode when pcode initialization is incomplete and boot status - * denotes a failure. + * Boot Survivability + * =================== + * + * Boot Survivability is implemented by loading the driver with bare minimum (no drm card) to allow + * the firmware to be flashed through mei driver and collect telemetry. The driver's probe flow is + * modified such that it enters survivability mode when pcode initialization is incomplete and boot + * status denotes a failure. * * Survivability mode can also be entered manually using the survivability mode attribute available * through configfs which is beneficial in several usecases. 
It can be used to address scenarios @@ -48,7 +51,7 @@ * Survivability mode is indicated by the below admin-only readable sysfs which provides additional * debug information:: * - * /sys/bus/pci/devices/<device>/surivability_mode + * /sys/bus/pci/devices/<device>/survivability_mode * * Capability Information: * Provides boot status @@ -58,6 +61,22 @@ * Provides history of previous failures * Auxiliary Information * Certain failures may have information in addition to postcode information + * + * Runtime Survivability + * ===================== + * + * Certain runtime firmware errors can cause the device to enter a wedged state + * (:ref:`xe-device-wedging`) requiring a firmware flash to restore normal operation. + * Runtime Survivability Mode indicates that a firmware flash is necessary to recover the device and + * is indicated by the presence of survivability mode sysfs:: + * + * /sys/bus/pci/devices/<device>/survivability_mode + * + * Survivability mode sysfs provides information about the type of survivability mode. + * + * When such errors occur, userspace is notified with the drm device wedged uevent and runtime + * survivability mode. User can then initiate a firmware flash using userspace tools like fwupd + * to restore device to normal operation. 
*/ static u32 aux_history_offset(u32 reg_value) @@ -123,6 +142,14 @@ static void log_survivability_info(struct pci_dev *pdev) } } +static int check_boot_failure(struct xe_device *xe) +{ + struct xe_survivability *survivability = &xe->survivability; + + return survivability->boot_status == NON_CRITICAL_FAILURE || + survivability->boot_status == CRITICAL_FAILURE; +} + static ssize_t survivability_mode_show(struct device *dev, struct device_attribute *attr, char *buff) { @@ -132,6 +159,12 @@ static ssize_t survivability_mode_show(struct device *dev, struct xe_survivability_info *info = survivability->info; int index = 0, count = 0; + count += sysfs_emit_at(buff, count, "Survivability mode type: %s\n", + survivability->type ? "Runtime" : "Boot"); + + if (!check_boot_failure(xe)) + return count; + for (index = 0; index < MAX_SCRATCH_MMIO; index++) { if (info[index].reg) count += sysfs_emit_at(buff, count, "%s: 0x%x - 0x%x\n", info[index].name, @@ -152,12 +185,11 @@ static void xe_survivability_mode_fini(void *arg) sysfs_remove_file(&dev->kobj, &dev_attr_survivability_mode.attr); } -static int enable_survivability_mode(struct pci_dev *pdev) +static int create_survivability_sysfs(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct xe_device *xe = pdev_to_xe_device(pdev); - struct xe_survivability *survivability = &xe->survivability; - int ret = 0; + int ret; /* create survivability mode sysfs */ ret = sysfs_create_file(&dev->kobj, &dev_attr_survivability_mode.attr); @@ -171,6 +203,20 @@ static int enable_survivability_mode(struct pci_dev *pdev) if (ret) return ret; + return 0; +} + +static int enable_boot_survivability_mode(struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct xe_device *xe = pdev_to_xe_device(pdev); + struct xe_survivability *survivability = &xe->survivability; + int ret = 0; + + ret = create_survivability_sysfs(pdev); + if (ret) + return ret; + /* Make sure xe_heci_gsc_init() knows about survivability mode */ 
survivability->mode = true; @@ -193,15 +239,36 @@ err: return ret; } +static int init_survivability_mode(struct xe_device *xe) +{ + struct xe_survivability *survivability = &xe->survivability; + struct xe_survivability_info *info; + + survivability->size = MAX_SCRATCH_MMIO; + + info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), + GFP_KERNEL); + if (!info) + return -ENOMEM; + + survivability->info = info; + + populate_survivability_info(xe); + + return 0; +} + /** - * xe_survivability_mode_is_enabled - check if survivability mode is enabled + * xe_survivability_mode_is_boot_enabled - check if boot survivability mode is enabled * @xe: xe device instance * - * Returns true if in survivability mode, false otherwise + * Returns true if in survivability mode of type boot, else false */ -bool xe_survivability_mode_is_enabled(struct xe_device *xe) +bool xe_survivability_mode_is_boot_enabled(struct xe_device *xe) { - return xe->survivability.mode; + struct xe_survivability *survivability = &xe->survivability; + + return survivability->mode && survivability->type == XE_SURVIVABILITY_TYPE_BOOT; } /** @@ -222,19 +289,10 @@ bool xe_survivability_mode_is_requested(struct xe_device *xe) u32 data; bool survivability_mode; - if (!IS_DGFX(xe) || IS_SRIOV_VF(xe)) + if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) return false; survivability_mode = xe_configfs_get_survivability_mode(pdev); - - if (xe->info.platform < XE_BATTLEMAGE) { - if (survivability_mode) { - dev_err(&pdev->dev, "Survivability Mode is not supported on this card\n"); - xe_configfs_clear_survivability_mode(pdev); - } - return false; - } - /* Enable survivability mode if set via configfs */ if (survivability_mode) return true; @@ -242,44 +300,78 @@ bool xe_survivability_mode_is_requested(struct xe_device *xe) data = xe_mmio_read32(mmio, PCODE_SCRATCH(0)); survivability->boot_status = REG_FIELD_GET(BOOT_STATUS, data); - return survivability->boot_status == NON_CRITICAL_FAILURE
|| - survivability->boot_status == CRITICAL_FAILURE; + return check_boot_failure(xe); } /** - * xe_survivability_mode_enable - Initialize and enable the survivability mode + * xe_survivability_mode_runtime_enable - Initialize and enable runtime survivability mode * @xe: xe device instance * - * Initialize survivability information and enable survivability mode + * Initialize survivability information and enable runtime survivability mode. + * Runtime survivability mode is enabled when certain errors cause the device to be + * in a non-recoverable state. The device is declared wedged with the appropriate + * recovery method and the survivability mode sysfs is exposed to userspace. * - * Return: 0 if survivability mode is enabled or not requested; negative error - * code otherwise. + * Return: 0 if runtime survivability mode is enabled, negative error code otherwise. */ -int xe_survivability_mode_enable(struct xe_device *xe) +int xe_survivability_mode_runtime_enable(struct xe_device *xe) { struct xe_survivability *survivability = &xe->survivability; - struct xe_survivability_info *info; struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + int ret; - if (!xe_survivability_mode_is_requested(xe)) - return 0; + if (!IS_DGFX(xe) || IS_SRIOV_VF(xe) || xe->info.platform < XE_BATTLEMAGE) { + dev_err(&pdev->dev, "Runtime Survivability Mode not supported\n"); + return -EINVAL; + } - survivability->size = MAX_SCRATCH_MMIO; + ret = init_survivability_mode(xe); + if (ret) + return ret; - info = devm_kcalloc(xe->drm.dev, survivability->size, sizeof(*info), - GFP_KERNEL); - if (!info) - return -ENOMEM; + ret = create_survivability_sysfs(pdev); + if (ret) + dev_err(&pdev->dev, "Failed to create survivability mode sysfs\n"); - survivability->info = info; + survivability->type = XE_SURVIVABILITY_TYPE_RUNTIME; + dev_err(&pdev->dev, "Runtime Survivability mode enabled\n"); - populate_survivability_info(xe); + xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_VENDOR); + xe_device_declare_wedged(xe);
+ dev_err(&pdev->dev, "Firmware flash required, Please refer to the userspace documentation for more details!\n"); + + return 0; +} + +/** + * xe_survivability_mode_boot_enable - Initialize and enable boot survivability mode + * @xe: xe device instance + * + * Initialize survivability information and enable boot survivability mode + * + * Return: 0 if boot survivability mode is enabled or not requested, negative error + * code otherwise. + */ +int xe_survivability_mode_boot_enable(struct xe_device *xe) +{ + struct xe_survivability *survivability = &xe->survivability; + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + int ret; - /* Only log debug information and exit if it is a critical failure */ + if (!xe_survivability_mode_is_requested(xe)) + return 0; + + ret = init_survivability_mode(xe); + if (ret) + return ret; + + /* Log breadcrumbs but do not enter survivability mode for Critical boot errors */ if (survivability->boot_status == CRITICAL_FAILURE) { log_survivability_info(pdev); return -ENXIO; } - return enable_survivability_mode(pdev); + survivability->type = XE_SURVIVABILITY_TYPE_BOOT; + + return enable_boot_survivability_mode(pdev); } |