Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_workarounds.c')
-rw-r--r--  drivers/gpu/drm/i915/gt/intel_workarounds.c  353
1 file changed, 333 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
index 90a2b9e399b0..5726cd0a37e0 100644
--- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -179,6 +179,12 @@ wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set)
 }
 
 static void
+wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr)
+{
+	wa_write_masked_or(wal, reg, clr, 0);
+}
+
+static void
 wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 {
 	wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val);
@@ -199,6 +205,18 @@ wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val)
 #define WA_SET_FIELD_MASKED(addr, mask, value) \
 	wa_write_masked_or(wal, (addr), 0, _MASKED_FIELD((mask), (value)))
 
+static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine,
+				      struct i915_wa_list *wal)
+{
+	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
+}
+
+static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine,
+				      struct i915_wa_list *wal)
+{
+	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
+}
+
 static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine,
 				      struct i915_wa_list *wal)
 {
@@ -349,7 +367,10 @@ static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
 			  HDC_FORCE_NON_COHERENT);
 
 	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */
-	if (IS_SKYLAKE(i915) || IS_KABYLAKE(i915) || IS_COFFEELAKE(i915))
+	if (IS_SKYLAKE(i915) ||
+	    IS_KABYLAKE(i915) ||
+	    IS_COFFEELAKE(i915) ||
+	    IS_COMETLAKE(i915))
 		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
 				  GEN8_SAMPLER_POWER_BYPASS_DIS);
 
@@ -383,7 +404,7 @@ static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine,
 static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 				struct i915_wa_list *wal)
 {
-	struct drm_i915_private *i915 = engine->i915;
+	struct intel_gt *gt = engine->gt;
 	u8 vals[3] = { 0, 0, 0 };
 	unsigned int i;
 
@@ -394,7 +415,7 @@ static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 		 * Only consider slices where one, and only one, subslice has 7
 		 * EUs
 		 */
-		if (!is_power_of_2(RUNTIME_INFO(i915)->sseu.subslice_7eu[i]))
+		if (!is_power_of_2(gt->info.sseu.subslice_7eu[i]))
 			continue;
 
 		/*
@@ -403,7 +424,7 @@ static void skl_tune_iz_hashing(struct intel_engine_cs *engine,
 		 *
 		 * ->    0 <= ss <= 3;
 		 */
-		ss = ffs(RUNTIME_INFO(i915)->sseu.subslice_7eu[i]) - 1;
+		ss = ffs(gt->info.sseu.subslice_7eu[i]) - 1;
 		vals[i] = 3 - ss;
 	}
 
@@ -594,11 +615,14 @@ static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine,
 	 * Wa_1604555607:gen12 and Wa_1608008084:gen12
 	 * FF_MODE2 register will return the wrong value when read. The default
 	 * value for this register is zero for all fields and there are no bit
-	 * masks. So instead of doing a RMW we should just write the TDS timer
-	 * value for Wa_1604555607.
+	 * masks. So instead of doing a RMW we should just write the GS Timer
+	 * and TDS timer values for Wa_1604555607 and Wa_16011163337.
 	 */
-	wa_add(wal, FF_MODE2, FF_MODE2_TDS_TIMER_MASK,
-	       FF_MODE2_TDS_TIMER_128, 0);
+	wa_add(wal,
+	       FF_MODE2,
+	       FF_MODE2_GS_TIMER_MASK | FF_MODE2_TDS_TIMER_MASK,
+	       FF_MODE2_GS_TIMER_224 | FF_MODE2_TDS_TIMER_128,
+	       0);
 
 	/* WaDisableGPGPUMidThreadPreemption:tgl */
 	WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
@@ -624,7 +648,7 @@ __intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
 		icl_ctx_workarounds_init(engine, wal);
 	else if (IS_CANNONLAKE(i915))
 		cnl_ctx_workarounds_init(engine, wal);
-	else if (IS_COFFEELAKE(i915))
+	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
 		cfl_ctx_workarounds_init(engine, wal);
 	else if (IS_GEMINILAKE(i915))
 		glk_ctx_workarounds_init(engine, wal);
@@ -638,6 +662,10 @@ __intel_engine_init_ctx_wa(struct intel_engine_cs *engine,
 		chv_ctx_workarounds_init(engine, wal);
 	else if (IS_BROADWELL(i915))
 		bdw_ctx_workarounds_init(engine, wal);
+	else if (IS_GEN(i915, 7))
+		gen7_ctx_workarounds_init(engine, wal);
+	else if (IS_GEN(i915, 6))
+		gen6_ctx_workarounds_init(engine, wal);
 	else if (INTEL_GEN(i915) < 8)
 		return;
 	else
@@ -687,10 +715,231 @@ int intel_engine_emit_ctx_wa(struct i915_request *rq)
 }
 
 static void
+gen4_gt_workarounds_init(struct drm_i915_private *i915,
+			 struct i915_wa_list *wal)
+{
+	/* WaDisable_RenderCache_OperationalFlush:gen4,ilk */
+	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
+}
+
+static void
+g4x_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+	gen4_gt_workarounds_init(i915, wal);
+
+	/* WaDisableRenderCachePipelinedFlush:g4x,ilk */
+	wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE);
+}
+
+static void
+ilk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+	g4x_gt_workarounds_init(i915, wal);
+
+	wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED);
+}
+
+static void
+snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+	/* WaDisableHiZPlanesWhenMSAAEnabled:snb */
+	wa_masked_en(wal,
+		     _3D_CHICKEN,
+		     _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB);
+
+	/* WaDisable_RenderCache_OperationalFlush:snb */
+	wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE);
+
+	/*
+	 * BSpec recommends 8x4 when MSAA is used,
+	 * however in practice 16x4 seems fastest.
+	 *
+	 * Note that PS/WM thread counts depend on the WIZ hashing
+	 * disable bit, which we don't touch here, but it's good
+	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+	 */
+	wa_add(wal,
+	       GEN6_GT_MODE, 0,
+	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
+	       GEN6_WIZ_HASHING_16x4);
+
+	wa_masked_dis(wal, CACHE_MODE_0, CM0_STC_EVICT_DISABLE_LRA_SNB);
+
+	wa_masked_en(wal,
+		     _3D_CHICKEN3,
+		     /* WaStripsFansDisableFastClipPerformanceFix:snb */
+		     _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL |
+		     /*
+		      * Bspec says:
+		      * "This bit must be set if 3DSTATE_CLIP clip mode is set
+		      * to normal and 3DSTATE_SF number of SF output attributes
+		      * is more than 16."
+		      */
+		     _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH);
+}
+
+static void
+ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+	/* WaDisableEarlyCull:ivb */
+	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
+
+	/* WaDisablePSDDualDispatchEnable:ivb */
+	if (IS_IVB_GT1(i915))
+		wa_masked_en(wal,
+			     GEN7_HALF_SLICE_CHICKEN1,
+			     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
+
+	/* WaDisable_RenderCache_OperationalFlush:ivb */
+	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
+
+	/* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */
+	wa_masked_dis(wal,
+		      GEN7_COMMON_SLICE_CHICKEN1,
+		      GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
+
+	/* WaApplyL3ControlAndL3ChickenMode:ivb */
+	wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL);
+	wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
+
+	/* WaForceL3Serialization:ivb */
+	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
+
+	/*
+	 * WaVSThreadDispatchOverride:ivb,vlv
+	 *
+	 * This actually overrides the dispatch
+	 * mode for all thread types.
+	 */
+	wa_write_masked_or(wal, GEN7_FF_THREAD_MODE,
+			   GEN7_FF_SCHED_MASK,
+			   GEN7_FF_TS_SCHED_HW |
+			   GEN7_FF_VS_SCHED_HW |
+			   GEN7_FF_DS_SCHED_HW);
+
+	if (0) { /* causes HiZ corruption on ivb:gt1 */
+		/* enable HiZ Raw Stall Optimization */
+		wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE);
+	}
+
+	/* WaDisable4x2SubspanOptimization:ivb */
+	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
+
+	/*
+	 * BSpec recommends 8x4 when MSAA is used,
+	 * however in practice 16x4 seems fastest.
+	 *
+	 * Note that PS/WM thread counts depend on the WIZ hashing
+	 * disable bit, which we don't touch here, but it's good
+	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+	 */
+	wa_add(wal, GEN7_GT_MODE, 0,
+	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
+	       GEN6_WIZ_HASHING_16x4);
+}
+
+static void
+vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+	/* WaDisableEarlyCull:vlv */
+	wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL);
+
+	/* WaPsdDispatchEnable:vlv */
+	/* WaDisablePSDDualDispatchEnable:vlv */
+	wa_masked_en(wal,
+		     GEN7_HALF_SLICE_CHICKEN1,
+		     GEN7_MAX_PS_THREAD_DEP |
+		     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE);
+
+	/* WaDisable_RenderCache_OperationalFlush:vlv */
+	wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE);
+
+	/* WaForceL3Serialization:vlv */
+	wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE);
+
+	/*
+	 * WaVSThreadDispatchOverride:ivb,vlv
+	 *
+	 * This actually overrides the dispatch
+	 * mode for all thread types.
+	 */
+	wa_write_masked_or(wal,
+			   GEN7_FF_THREAD_MODE,
+			   GEN7_FF_SCHED_MASK,
+			   GEN7_FF_TS_SCHED_HW |
+			   GEN7_FF_VS_SCHED_HW |
+			   GEN7_FF_DS_SCHED_HW);
+
+	/*
+	 * BSpec says this must be set, even though
+	 * WaDisable4x2SubspanOptimization isn't listed for VLV.
+	 */
+	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
+
+	/*
+	 * BSpec recommends 8x4 when MSAA is used,
+	 * however in practice 16x4 seems fastest.
+	 *
+	 * Note that PS/WM thread counts depend on the WIZ hashing
+	 * disable bit, which we don't touch here, but it's good
+	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+	 */
+	wa_add(wal, GEN7_GT_MODE, 0,
+	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
+	       GEN6_WIZ_HASHING_16x4);
+
+	/*
+	 * WaIncreaseL3CreditsForVLVB0:vlv
+	 * This is the hardware default actually.
+	 */
+	wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE);
+}
+
+static void
+hsw_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
+{
+	/* L3 caching of data atomics doesn't work -- disable it. */
+	wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
+
+	wa_add(wal,
+	       HSW_ROW_CHICKEN3, 0,
+	       _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE),
+		0 /* XXX does this reg exist? */);
+
+	/* WaVSRefCountFullforceMissDisable:hsw */
+	wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME);
+
+	wa_masked_dis(wal,
+		      CACHE_MODE_0_GEN7,
+		      /* WaDisable_RenderCache_OperationalFlush:hsw */
+		      RC_OP_FLUSH_ENABLE |
+		      /* enable HiZ Raw Stall Optimization */
+		      HIZ_RAW_STALL_OPT_DISABLE);
+
+	/* WaDisable4x2SubspanOptimization:hsw */
+	wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE);
+
+	/*
+	 * BSpec recommends 8x4 when MSAA is used,
+	 * however in practice 16x4 seems fastest.
+	 *
+	 * Note that PS/WM thread counts depend on the WIZ hashing
+	 * disable bit, which we don't touch here, but it's good
+	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+	 */
+	wa_add(wal, GEN7_GT_MODE, 0,
+	       _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4),
+	       GEN6_WIZ_HASHING_16x4);
+
+	/* WaSampleCChickenBitEnable:hsw */
+	wa_masked_en(wal, HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE);
+}
+
+static void
 gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 {
 	/* WaDisableKillLogic:bxt,skl,kbl */
-	if (!IS_COFFEELAKE(i915))
+	if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915))
 		wa_write_or(wal,
 			    GAM_ECOCHK,
 			    ECOCHK_DIS_TLB);
@@ -787,7 +1036,7 @@ cfl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal)
 static void
 wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal)
 {
-	const struct sseu_dev_info *sseu = &RUNTIME_INFO(i915)->sseu;
+	const struct sseu_dev_info *sseu = &i915->gt.info.sseu;
 	unsigned int slice, subslice;
 	u32 l3_en, mcr, mcr_mask;
 
@@ -953,7 +1202,7 @@ gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal)
 		icl_gt_workarounds_init(i915, wal);
 	else if (IS_CANNONLAKE(i915))
 		cnl_gt_workarounds_init(i915, wal);
-	else if (IS_COFFEELAKE(i915))
+	else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915))
 		cfl_gt_workarounds_init(i915, wal);
 	else if (IS_GEMINILAKE(i915))
 		glk_gt_workarounds_init(i915, wal);
@@ -963,6 +1212,20 @@ gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal)
 		bxt_gt_workarounds_init(i915, wal);
 	else if (IS_SKYLAKE(i915))
 		skl_gt_workarounds_init(i915, wal);
+	else if (IS_HASWELL(i915))
+		hsw_gt_workarounds_init(i915, wal);
+	else if (IS_VALLEYVIEW(i915))
+		vlv_gt_workarounds_init(i915, wal);
+	else if (IS_IVYBRIDGE(i915))
+		ivb_gt_workarounds_init(i915, wal);
+	else if (IS_GEN(i915, 6))
+		snb_gt_workarounds_init(i915, wal);
+	else if (IS_GEN(i915, 5))
+		ilk_gt_workarounds_init(i915, wal);
+	else if (IS_G4X(i915))
+		g4x_gt_workarounds_init(i915, wal);
+	else if (IS_GEN(i915, 4))
+		gen4_gt_workarounds_init(i915, wal);
 	else if (INTEL_GEN(i915) <= 8)
 		return;
 	else
@@ -1187,6 +1450,18 @@ static void cfl_whitelist_build(struct intel_engine_cs *engine)
 			  RING_FORCE_TO_NONPRIV_RANGE_4);
 }
 
+static void cml_whitelist_build(struct intel_engine_cs *engine)
+{
+	struct i915_wa_list *w = &engine->whitelist;
+
+	if (engine->class != RENDER_CLASS)
+		whitelist_reg_ext(w,
+				  RING_CTX_TIMESTAMP(engine->mmio_base),
+				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
+
+	cfl_whitelist_build(engine);
+}
+
 static void cnl_whitelist_build(struct intel_engine_cs *engine)
 {
 	struct i915_wa_list *w = &engine->whitelist;
@@ -1237,9 +1512,15 @@ static void icl_whitelist_build(struct intel_engine_cs *engine)
 		/* hucStatus2RegOffset */
 		whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base),
 				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
+		whitelist_reg_ext(w,
+				  RING_CTX_TIMESTAMP(engine->mmio_base),
+				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
 		break;
 
 	default:
+		whitelist_reg_ext(w,
+				  RING_CTX_TIMESTAMP(engine->mmio_base),
+				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
 		break;
 	}
 }
@@ -1271,6 +1552,9 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine)
 		whitelist_reg(w, HIZ_CHICKEN);
 		break;
 	default:
+		whitelist_reg_ext(w,
+				  RING_CTX_TIMESTAMP(engine->mmio_base),
+				  RING_FORCE_TO_NONPRIV_ACCESS_RD);
 		break;
 	}
 }
@@ -1288,6 +1572,8 @@ void intel_engine_init_whitelist(struct intel_engine_cs *engine)
 		icl_whitelist_build(engine);
 	else if (IS_CANNONLAKE(i915))
 		cnl_whitelist_build(engine);
+	else if (IS_COMETLAKE(i915))
+		cml_whitelist_build(engine);
 	else if (IS_COFFEELAKE(i915))
 		cfl_whitelist_build(engine);
 	else if (IS_GEMINILAKE(i915))
@@ -1363,11 +1649,6 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 			    GEN7_SARCHKMD,
 			    GEN7_DISABLE_SAMPLER_PREFETCH);
 
-		/* Wa_1407928979:tgl */
-		wa_write_or(wal,
-			    GEN7_FF_THREAD_MODE,
-			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
-
 		/* Wa_1408615072:tgl */
 		wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2,
 			    VSUNIT_CLKGATE_DIS_TGL);
@@ -1391,6 +1672,14 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 		 * Wa_14010229206:tgl
 		 */
 		wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN12_DISABLE_TDL_PUSH);
+
+		/*
+		 * Wa_1407928979:tgl A*
+		 * Wa_18011464164:tgl B0+
+		 * Wa_22010931296:tgl B0+
+		 */
+		wa_write_or(wal, GEN7_FF_THREAD_MODE,
+			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
 	}
 
 	if (IS_GEN(i915, 11)) {
@@ -1484,6 +1773,12 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 		wa_write_or(wal,
 			    GEN7_FF_THREAD_MODE,
 			    GEN12_FF_TESSELATION_DOP_GATE_DISABLE);
+
+		/* Wa_22010271021:ehl */
+		if (IS_ELKHARTLAKE(i915))
+			wa_masked_en(wal,
+				     GEN9_CS_DEBUG_MODE1,
+				     FF_DOP_CLOCK_GATE_DISABLE);
 	}
 
 	if (IS_GEN_RANGE(i915, 9, 12)) {
@@ -1493,7 +1788,10 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 			      GEN9_FFSC_PERCTX_PREEMPT_CTRL);
 	}
 
-	if (IS_SKYLAKE(i915) || IS_KABYLAKE(i915) || IS_COFFEELAKE(i915)) {
+	if (IS_SKYLAKE(i915) ||
+	    IS_KABYLAKE(i915) ||
+	    IS_COFFEELAKE(i915) ||
+	    IS_COMETLAKE(i915)) {
 		/* WaEnableGapsTsvCreditFix:skl,kbl,cfl */
 		wa_write_or(wal,
 			    GEN8_GARBCNTL,
@@ -1577,6 +1875,21 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal)
 		       0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH),
 		       /* XXX bit doesn't stick on Broadwater */
 		       IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH);
+
+	if (IS_GEN(i915, 4))
+		/*
+		 * Disable CONSTANT_BUFFER before it is loaded from the context
+		 * image. For as it is loaded, it is executed and the stored
+		 * address may no longer be valid, leading to a GPU hang.
+		 *
+		 * This imposes the requirement that userspace reload their
+		 * CONSTANT_BUFFER on every batch, fortunately a requirement
+		 * they are already accustomed to from before contexts were
+		 * enabled.
+		 */
+		wa_add(wal, ECOSKPD,
+		       0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE),
+		       0 /* XXX bit doesn't stick on Broadwater */);
 }
 
 static void
@@ -1691,7 +2004,7 @@ wa_list_srm(struct i915_request *rq,
 	    const struct i915_wa_list *wal,
 	    struct i915_vma *vma)
 {
-	struct drm_i915_private *i915 = rq->i915;
+	struct drm_i915_private *i915 = rq->engine->i915;
 	unsigned int i, count = 0;
 	const struct i915_wa *wa;
 	u32 srm, *cs;
@@ -1780,7 +2093,7 @@ static int engine_wa_list_verify(struct intel_context *ce,
 
 	err = 0;
 	for (i = 0, wa = wal->list; i < wal->count; i++, wa++) {
-		if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg)))
+		if (mcr_range(rq->engine->i915, i915_mmio_reg_offset(wa->reg)))
 			continue;
 
 		if (!wa_verify(wa, results[i], wal->name, from))
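Note on the helpers used throughout the patch: as the first hunk shows, the new wa_write_clr() is simply wa_write_masked_or(wal, reg, clr, 0), i.e. "clear these bits, set nothing", and applying any such entry amounts to a read-modify-write of (old & ~clr) | set. The following standalone sketch models that behaviour in plain C; the struct, function names and register values here are illustrative stand-ins only, not the kernel's i915_wa machinery.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for a single register workaround entry. */
struct wa_entry {
	uint32_t reg;	/* register offset */
	uint32_t clr;	/* bits to clear */
	uint32_t set;	/* bits to set */
};

/* Model of wa_write_masked_or(): record "clear these bits, then set those". */
static struct wa_entry wa_write_masked_or(uint32_t reg, uint32_t clr, uint32_t set)
{
	struct wa_entry wa = { .reg = reg, .clr = clr, .set = set };
	return wa;
}

/* Model of the new wa_write_clr(): clear the given bits, set none. */
static struct wa_entry wa_write_clr(uint32_t reg, uint32_t clr)
{
	return wa_write_masked_or(reg, clr, 0);
}

/* Applying an entry is a read-modify-write: (old & ~clr) | set. */
static uint32_t wa_apply(const struct wa_entry *wa, uint32_t old)
{
	return (old & ~wa->clr) | wa->set;
}

int main(void)
{
	/* Fake register offset and bit, chosen only for the demonstration. */
	const uint32_t EXAMPLE_BIT = 1u << 15;
	struct wa_entry wa = wa_write_clr(0x20a0, EXAMPLE_BIT);
	uint32_t val = 0x0000ffff;

	printf("before: %#010x after: %#010x\n",
	       (unsigned int)val, (unsigned int)wa_apply(&wa, val));
	return 0;
}

Built with any C compiler, this prints the register value before and after the entry is applied, which is the same clear-then-set semantics the wa_* calls in the diff above encode into the workaround list.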