diff options
author | Jonathan Kim <Jonathan.Kim@amd.com> | 2024-05-21 13:22:15 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-07-25 17:43:41 -0400 |
commit | e06b71b2313a00579ba64a1cc43ad29d64cb8d4c (patch) | |
tree | a561cd62a26c646effdb424f6612cdcfb07c7730 /drivers/gpu/drm/amd/amdkfd/kfd_topology.c | |
parent | 60c30ba7ba2064066ec462236666058cbbf619c1 (diff) |
drm/amdkfd: allow users to target recommended SDMA engines
Certain GPUs have better copy performance over xGMI on specific
SDMA engines depending on the source and destination GPU.
Allow users to create SDMA queues on these recommended engines.
Close to 2x overall performance has been observed with this
optimization.
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_topology.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 52 |
1 files changed, 52 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index a9b3eda65a2cc..40771f8752cbc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -292,6 +292,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, iolink->max_bandwidth); sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size", iolink->rec_transfer_size); + sysfs_show_32bit_prop(buffer, offs, "recommended_sdma_engine_id_mask", + iolink->rec_sdma_eng_id_mask); sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags); return offs; @@ -1265,6 +1267,55 @@ static void kfd_set_iolink_non_coherent(struct kfd_topology_device *to_dev, } } +#define REC_SDMA_NUM_GPU 8 +static const int rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = { + { -1, 14, 12, 2, 4, 8, 10, 6 }, + { 14, -1, 2, 10, 8, 4, 6, 12 }, + { 10, 2, -1, 12, 14, 6, 4, 8 }, + { 2, 12, 10, -1, 6, 14, 8, 4 }, + { 4, 8, 14, 6, -1, 10, 12, 2 }, + { 8, 4, 6, 14, 12, -1, 2, 10 }, + { 10, 6, 4, 8, 12, 2, -1, 14 }, + { 6, 12, 8, 4, 2, 10, 14, -1 }}; + +static void kfd_set_recommended_sdma_engines(struct kfd_topology_device *to_dev, + struct kfd_iolink_properties *outbound_link, + struct kfd_iolink_properties *inbound_link) +{ + struct kfd_node *gpu = outbound_link->gpu; + struct amdgpu_device *adev = gpu->adev; + int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes; + bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu && + adev->aid_mask && num_xgmi_nodes && + (amdgpu_xcp_query_partition_mode(adev->xcp_mgr, AMDGPU_XCP_FL_NONE) == + AMDGPU_SPX_PARTITION_MODE) && + (!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8); + + if (support_rec_eng) { + int src_socket_id = adev->gmc.xgmi.physical_node_id; + int dst_socket_id = to_dev->gpu->adev->gmc.xgmi.physical_node_id; + + outbound_link->rec_sdma_eng_id_mask = + 1 << rec_sdma_eng_map[src_socket_id][dst_socket_id]; + inbound_link->rec_sdma_eng_id_mask = + 1 << rec_sdma_eng_map[dst_socket_id][src_socket_id]; + } else { + int num_sdma_eng = kfd_get_num_sdma_engines(gpu); + int i, eng_offset = 0; + + if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI && + kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) { + eng_offset = num_sdma_eng; + num_sdma_eng = kfd_get_num_xgmi_sdma_engines(gpu); + } + + for (i = 0; i < num_sdma_eng; i++) { + outbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset)); + inbound_link->rec_sdma_eng_id_mask |= (1 << (i + eng_offset)); + } + } +} + static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) { struct kfd_iolink_properties *link, *inbound_link; @@ -1303,6 +1354,7 @@ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED; kfd_set_iolink_no_atomics(peer_dev, dev, inbound_link); kfd_set_iolink_non_coherent(peer_dev, link, inbound_link); + kfd_set_recommended_sdma_engines(peer_dev, link, inbound_link); } } |