diff options
Diffstat (limited to 'drivers/accel/habanalabs/common/hldio.c')
-rw-r--r-- | drivers/accel/habanalabs/common/hldio.c | 437 |
1 files changed, 437 insertions, 0 deletions
diff --git a/drivers/accel/habanalabs/common/hldio.c b/drivers/accel/habanalabs/common/hldio.c new file mode 100644 index 000000000000..083ae5610875 --- /dev/null +++ b/drivers/accel/habanalabs/common/hldio.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2024 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "habanalabs.h" +#include "hldio.h" +#include <generated/uapi/linux/version.h> +#include <linux/pci-p2pdma.h> +#include <linux/blkdev.h> +#include <linux/vmalloc.h> + +/* + * NVMe Direct I/O implementation for habanalabs driver + * + * ASSUMPTIONS + * =========== + * 1. No IOMMU (well, technically it can work with IOMMU, but it is *almost useless). + * 2. Only READ operations (can extend in the future). + * 3. No sparse files (can overcome this in the future). + * 4. Kernel version >= 6.9 + * 5. Requiring page alignment is OK (I don't see a solution to this one right, + * now, how do we read partial pages?) + * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel. + * Theoretically I have a slight idea on how this could be solvable, but it + * is probably inacceptable for the upstream. Also may not work in the end. + * 7. Either make sure our cards and disks are under the same PCI bridge, or + * compile a custom kernel to hack around this. + */ + +#define IO_STABILIZE_TIMEOUT 10000000 /* 10 seconds in microseconds */ + +/* + * This struct contains all the useful data I could milk out of the file handle + * provided by the user. + * @TODO: right now it is retrieved on each IO, but can be done once with some + * dedicated IOCTL, call it for example HL_REGISTER_HANDLE. + */ +struct hl_dio_fd { + /* Back pointer in case we need it in async completion */ + struct hl_ctx *ctx; + /* Associated fd struct */ + struct file *filp; +}; + +/* + * This is a single IO descriptor + */ +struct hl_direct_io { + struct hl_dio_fd f; + struct kiocb kio; + struct bio_vec *bv; + struct iov_iter iter; + u64 device_va; + u64 off_bytes; + u64 len_bytes; + u32 type; +}; + +bool hl_device_supports_nvme(struct hl_device *hdev) +{ + return hdev->asic_prop.supports_nvme; +} + +static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f) +{ + struct hl_device *hdev = ctx->hdev; + struct block_device *bd; + struct super_block *sb; + struct inode *inode; + struct gendisk *gd; + struct device *disk_dev; + int rc; + + f->filp = fget(fd); + if (!f->filp) { + rc = -ENOENT; + goto out; + } + + if (!(f->filp->f_flags & O_DIRECT)) { + dev_err(hdev->dev, "file is not in the direct mode\n"); + rc = -EINVAL; + goto fput; + } + + if (!f->filp->f_op->read_iter) { + dev_err(hdev->dev, "read iter is not supported, need to fall back to legacy\n"); + rc = -EINVAL; + goto fput; + } + + inode = file_inode(f->filp); + sb = inode->i_sb; + bd = sb->s_bdev; + gd = bd->bd_disk; + + if (inode->i_blocks << sb->s_blocksize_bits < i_size_read(inode)) { + dev_err(hdev->dev, "sparse files are not currently supported\n"); + rc = -EINVAL; + goto fput; + } + + if (!bd || !gd) { + dev_err(hdev->dev, "invalid block device\n"); + rc = -ENODEV; + goto fput; + } + /* Get the underlying device from the block device */ + disk_dev = disk_to_dev(gd); + if (!dma_pci_p2pdma_supported(disk_dev)) { + dev_err(hdev->dev, "device does not support PCI P2P DMA\n"); + rc = -EOPNOTSUPP; + goto fput; + } + + /* + * @TODO: Maybe we need additional checks here + */ + + f->ctx = ctx; + rc = 0; + + goto out; +fput: + fput(f->filp); +out: + return rc; +} + +static void hl_dio_fd_unregister(struct hl_dio_fd *f) +{ + fput(f->filp); +} + +static long hl_dio_count_io(struct hl_device *hdev) +{ + s64 sum = 0; + int i; + + for_each_possible_cpu(i) + sum += per_cpu(*hdev->hldio.inflight_ios, i); + + return sum; +} + +static bool hl_dio_get_iopath(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + + if (hdev->hldio.io_enabled) { + this_cpu_inc(*hdev->hldio.inflight_ios); + + /* Avoid race conditions */ + if (!hdev->hldio.io_enabled) { + this_cpu_dec(*hdev->hldio.inflight_ios); + return false; + } + + hl_ctx_get(ctx); + + return true; + } + + return false; +} + +static void hl_dio_put_iopath(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + + hl_ctx_put(ctx); + this_cpu_dec(*hdev->hldio.inflight_ios); +} + +static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled) +{ + hdev->hldio.io_enabled = enabled; +} + +static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io) +{ + if ((u64)io->device_va & ~PAGE_MASK) { + dev_dbg(hdev->dev, "device address must be 4K aligned\n"); + return false; + } + + if (io->len_bytes & ~PAGE_MASK) { + dev_dbg(hdev->dev, "IO length must be 4K aligned\n"); + return false; + } + + if (io->off_bytes & ~PAGE_MASK) { + dev_dbg(hdev->dev, "IO offset must be 4K aligned\n"); + return false; + } + + return true; +} + +static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va) +{ + struct hl_dio *hldio = &hdev->hldio; + u64 device_pa; + int rc, i; + + rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa); + if (rc) { + dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)", + device_va, rc); + return NULL; + } + + for (i = 0 ; i < hldio->np2prs ; ++i) { + if (device_pa >= hldio->p2prs[i].device_pa && + device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size) + return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >> + PAGE_SHIFT]; + } + + return NULL; +} + +static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io) +{ + u64 npages, device_va; + ssize_t rc; + int i; + + if (!hl_dio_validate_io(hdev, io)) + return -EINVAL; + + if (!hl_dio_get_iopath(io->f.ctx)) { + dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n"); + return -ESHUTDOWN; + } + + init_sync_kiocb(&io->kio, io->f.filp); + io->kio.ki_pos = io->off_bytes; + + npages = (io->len_bytes >> PAGE_SHIFT); + + /* @TODO: this can be implemented smarter, vmalloc in iopath is not + * ideal. Maybe some variation of genpool. Number of pages may differ + * greatly, so maybe even use pools of different sizes and chose the + * closest one. + */ + io->bv = vzalloc(npages * sizeof(struct bio_vec)); + if (!io->bv) + return -ENOMEM; + + for (i = 0, device_va = io->device_va; i < npages ; ++i, device_va += PAGE_SIZE) { + io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va); + if (!io->bv[i].bv_page) { + dev_err(hdev->dev, "error getting page struct for device va %#llx", + device_va); + rc = -EFAULT; + goto cleanup; + } + io->bv[i].bv_offset = 0; + io->bv[i].bv_len = PAGE_SIZE; + } + + iov_iter_bvec(&io->iter, io->type, io->bv, 1, io->len_bytes); + if (io->f.filp->f_op && io->f.filp->f_op->read_iter) + rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter); + else + rc = -EINVAL; + +cleanup: + vfree(io->bv); + hl_dio_put_iopath(io->f.ctx); + + dev_dbg(hdev->dev, "IO ended with %ld\n", rc); + + return rc; +} + +/* + * @TODO: This function can be used as a callback for io completion under + * kio->ki_complete in order to implement async IO. + * Note that on more recent kernels there is no ret2. + */ +__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2) +{ + struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio); + + dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret); + + /* Do something to copy result to user / notify completion */ + + hl_dio_put_iopath(io->f.ctx); + + hl_dio_fd_unregister(&io->f); +} + +/* + * DMA disk to ASIC, wait for results. Must be invoked from the user context + */ +int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd, + u64 device_va, off_t off_bytes, size_t len_bytes, + size_t *len_read) +{ + struct hl_direct_io *io; + ssize_t rc; + + dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#lx\n", fd, device_va, len_bytes); + + io = kzalloc(sizeof(*io), GFP_KERNEL); + if (!io) { + rc = -ENOMEM; + goto out; + } + + *io = (struct hl_direct_io){ + .device_va = device_va, + .len_bytes = len_bytes, + .off_bytes = off_bytes, + .type = READ, + }; + + rc = hl_dio_fd_register(ctx, fd, &io->f); + if (rc) + goto kfree_io; + + rc = hl_direct_io(hdev, io); + if (rc >= 0) { + *len_read = rc; + rc = 0; + } + + /* This shall be called only in the case of a sync IO */ + hl_dio_fd_unregister(&io->f); +kfree_io: + kfree(io); +out: + return rc; +} + +static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr) +{ + if (p2pr->p2ppages) { + vfree(p2pr->p2ppages); + p2pr->p2ppages = NULL; + } + + if (p2pr->p2pmem) { + dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n", + p2pr->p2pmem, p2pr->size); + pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size); + p2pr->p2pmem = NULL; + } +} + +void hl_p2p_region_fini_all(struct hl_device *hdev) +{ + int i; + + for (i = 0 ; i < hdev->hldio.np2prs ; ++i) + hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]); + + kvfree(hdev->hldio.p2prs); + hdev->hldio.p2prs = NULL; + hdev->hldio.np2prs = 0; +} + +int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr) +{ + void *addr; + int rc, i; + + /* Start by publishing our p2p memory */ + rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset); + if (rc) { + dev_err(hdev->dev, "error adding p2p resource: %d\n", rc); + goto err; + } + + /* Alloc all p2p mem */ + p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size); + if (!p2pr->p2pmem) { + dev_err(hdev->dev, "error allocating p2p memory\n"); + rc = -ENOMEM; + goto err; + } + + p2pr->p2ppages = vmalloc((p2pr->size >> PAGE_SHIFT) * sizeof(struct page *)); + if (!p2pr->p2ppages) { + rc = -ENOMEM; + goto err; + } + + for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) { + p2pr->p2ppages[i] = virt_to_page(addr); + if (!p2pr->p2ppages[i]) { + rc = -EFAULT; + goto err; + } + } + + return 0; +err: + hl_p2p_region_fini(hdev, p2pr); + return rc; +} + +int hl_dio_start(struct hl_device *hdev) +{ + dev_dbg(hdev->dev, "initializing HLDIO\n"); + + /* Initialize the IO counter and enable IO */ + hdev->hldio.inflight_ios = alloc_percpu(s64); + if (!hdev->hldio.inflight_ios) + return -ENOMEM; + + hl_dio_set_io_enabled(hdev, true); + + return 0; +} + +void hl_dio_stop(struct hl_device *hdev) +{ + dev_dbg(hdev->dev, "deinitializing HLDIO\n"); + + if (hdev->hldio.io_enabled) { + /* Wait for all the IO to finish */ + hl_dio_set_io_enabled(hdev, false); + hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000, IO_STABILIZE_TIMEOUT); + } + + if (hdev->hldio.inflight_ios) { + free_percpu(hdev->hldio.inflight_ios); + hdev->hldio.inflight_ios = NULL; + } +} |