diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-04 18:42:48 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-04 18:42:48 -0800 |
| commit | a3ebb59eee2e558e8f8f27fc3f75cd367f17cd8e (patch) | |
| tree | 934bbee766e0cce55cfcaec5dcae856c0361af37 /drivers/vfio | |
| parent | ce5cfb0fa20dc6454da039612e34325b7b4a8243 (diff) | |
| parent | d721f52e31553a848e0e9947ca15a49c5674aef3 (diff) | |
Merge tag 'vfio-v6.19-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Move libvfio selftest artifacts in preparation of more tightly
coupled integration with KVM selftests (David Matlack)
- Fix comment typo in mtty driver (Chu Guangqing)
- Support for new hardware revision in the hisi_acc vfio-pci variant
driver where the migration registers can now be accessed via the PF.
When enabled for this support, the full BAR can be exposed to the
user (Longfang Liu)
- Fix vfio cdev support for VF token passing, using the correct size
for the kernel structure, thereby actually allowing userspace to
provide a non-zero UUID token. Also set the match token callback for
the hisi_acc, fixing VF token support for this this vfio-pci variant
driver (Raghavendra Rao Ananta)
- Introduce internal callbacks on vfio devices to simplify and
consolidate duplicate code for generating VFIO_DEVICE_GET_REGION_INFO
data, removing various ioctl intercepts with a more structured
solution (Jason Gunthorpe)
- Introduce dma-buf support for vfio-pci devices, allowing MMIO regions
to be exposed through dma-buf objects with lifecycle managed through
move operations. This enables low-level interactions such as a
vfio-pci based SPDK drivers interacting directly with dma-buf capable
RDMA devices to enable peer-to-peer operations. IOMMUFD is also now
able to build upon this support to fill a long standing feature gap
versus the legacy vfio type1 IOMMU backend with an implementation of
P2P support for VM use cases that better manages the lifecycle of the
P2P mapping (Leon Romanovsky, Jason Gunthorpe, Vivek Kasireddy)
- Convert eventfd triggering for error and request signals to use RCU
mechanisms in order to avoid a 3-way lockdep reported deadlock issue
(Alex Williamson)
- Fix a 32-bit overflow introduced via dma-buf support manifesting with
large DMA buffers (Alex Mastro)
- Convert nvgrace-gpu vfio-pci variant driver to insert mappings on
fault rather than at mmap time. This conversion serves both to make
use of huge PFNMAPs but also to both avoid corrected RAS events
during reset by now being subject to vfio-pci-core's use of
unmap_mapping_range(), and to enable a device readiness test after
reset (Ankit Agrawal)
- Refactoring of vfio selftests to support multi-device tests and split
code to provide better separation between IOMMU and device objects.
This work also enables a new test suite addition to measure parallel
device initialization latency (David Matlack)
* tag 'vfio-v6.19-rc1' of https://github.com/awilliam/linux-vfio: (65 commits)
vfio: selftests: Add vfio_pci_device_init_perf_test
vfio: selftests: Eliminate INVALID_IOVA
vfio: selftests: Split libvfio.h into separate header files
vfio: selftests: Move vfio_selftests_*() helpers into libvfio.c
vfio: selftests: Rename vfio_util.h to libvfio.h
vfio: selftests: Stop passing device for IOMMU operations
vfio: selftests: Move IOVA allocator into iova_allocator.c
vfio: selftests: Move IOMMU library code into iommu.c
vfio: selftests: Rename struct vfio_dma_region to dma_region
vfio: selftests: Upgrade driver logging to dev_err()
vfio: selftests: Prefix logs with device BDF where relevant
vfio: selftests: Eliminate overly chatty logging
vfio: selftests: Support multiple devices in the same container/iommufd
vfio: selftests: Introduce struct iommu
vfio: selftests: Rename struct vfio_iommu_mode to iommu_mode
vfio: selftests: Allow passing multiple BDFs on the command line
vfio: selftests: Split run.sh into separate scripts
vfio: selftests: Move run.sh into scripts directory
vfio/nvgrace-gpu: wait for the GPU mem to be ready
vfio/nvgrace-gpu: Inform devmem unmapped after reset
...
Diffstat (limited to 'drivers/vfio')
25 files changed, 1067 insertions, 419 deletions
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 5dd5f5ad7686..253031b86b60 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -129,28 +129,22 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, - struct vfio_region_info __user *arg) +static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); - struct vfio_region_info info; - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= cdx_dev->res_count) + if (info->index >= cdx_dev->res_count) return -EINVAL; /* map offset to the physical address */ - info.offset = vfio_cdx_index_to_offset(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + info->offset = vfio_cdx_index_to_offset(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, @@ -219,8 +213,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, switch (cmd) { case VFIO_DEVICE_GET_INFO: return vfio_cdx_ioctl_get_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_cdx_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_GET_IRQ_INFO: return vfio_cdx_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_SET_IRQS: @@ -284,6 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .get_region_info_caps = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 480cac3a0c27..8ceca24ac136 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -99,7 +99,7 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, return ret; if (user_size < minsz) return -EINVAL; - ret = copy_struct_from_user(&bind, minsz, arg, user_size); + ret = copy_struct_from_user(&bind, sizeof(bind), arg, user_size); if (ret) return ret; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 76ccbab0e3d6..ba47100f28c1 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,6 +117,24 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } +static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct fsl_mc_device *mc_dev = vdev->mc_dev; + + if (info->index >= mc_dev->obj_desc.region_count) + return -EINVAL; + + /* map offset to the physical address */ + info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; +} + static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -149,30 +167,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -589,6 +583,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, + .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f54665..2b9fca00e9e8 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. +config VFIO_PCI_DMABUF + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c..53f59226ae01 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,6 +2,7 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fde33f54e99e..cf45f6370c36 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) return 0; } +static void qm_xqc_reg_offsets(struct hisi_qm *qm, + u32 *eqc_addr, u32 *aeqc_addr) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + container_of(qm, struct hisi_acc_vf_core_device, vf_qm); + + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) { + *eqc_addr = QM_EQC_VF_DW0; + *aeqc_addr = QM_AEQC_VF_DW0; + } else { + *eqc_addr = QM_EQC_PF_DW0; + *aeqc_addr = QM_AEQC_PF_DW0; + } +} + static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_AEQC_DW\n"); return ret; @@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; /* Check VF state */ @@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_AEQC_DW\n"); return ret; @@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device; struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct pci_dev *vf_dev = vdev->pdev; + u32 val; - /* - * ACC VF dev BAR2 region consists of both functional register space - * and migration control register space. For migration to work, we - * need access to both. Hence, we map the entire BAR2 region here. - * But unnecessarily exposing the migration BAR region to the Guest - * has the potential to prevent/corrupt the Guest migration. Hence, - * we restrict access to the migration control space from - * Guest(Please see mmap/ioctl/read/write override functions). - * - * Please note that it is OK to expose the entire VF BAR if migration - * is not supported or required as this cannot affect the ACC PF - * configurations. - * - * Also the HiSilicon ACC VF devices supported by this driver on - * HiSilicon hardware platforms are integrated end point devices - * and the platform lacks the capability to perform any PCIe P2P - * between these devices. - */ + val = readl(pf_qm->io_base + QM_MIG_REGION_SEL); + if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN)) + hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL; + else + hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL; - vf_qm->io_base = - ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), - pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); - if (!vf_qm->io_base) - return -EIO; + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) { + /* + * On hardware platforms greater than QM_HW_V3, the migration function + * register is placed in the BAR2 configuration region of the PF, + * and each VF device occupies 8KB of configuration space. + */ + vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET + + hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE; + } else { + /* + * ACC VF dev BAR2 region consists of both functional register space + * and migration control register space. For migration to work, we + * need access to both. Hence, we map the entire BAR2 region here. + * But unnecessarily exposing the migration BAR region to the Guest + * has the potential to prevent/corrupt the Guest migration. Hence, + * we restrict access to the migration control space from + * Guest(Please see mmap/ioctl/read/write override functions). + * + * Please note that it is OK to expose the entire VF BAR if migration + * is not supported or required as this cannot affect the ACC PF + * configurations. + * + * Also the HiSilicon ACC VF devices supported by this driver on + * HiSilicon hardware platforms are integrated end point devices + * and the platform lacks the capability to perform any PCIe P2P + * between these devices. + */ + vf_qm->io_base = + ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), + pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); + if (!vf_qm->io_base) + return -EIO; + } vf_qm->fun_type = QM_HW_VF; + vf_qm->ver = pf_qm->ver; vf_qm->pdev = vf_dev; mutex_init(&vf_qm->mailbox_lock); @@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev) return !IS_ERR(pf_qm) ? pf_qm : NULL; } +static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev, + unsigned int index) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + hisi_acc_drvdata(vdev->pdev); + + /* + * On the old HW_ACC_MIG_VF_CTRL mode device, the ACC VF device + * BAR2 region encompasses both functional register space + * and migration control register space. + * only the functional region should be report to Guest. + */ + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + return (pci_resource_len(vdev->pdev, index) >> 1); + /* + * On the new HW device, the migration control register + * has been moved to the PF device BAR2 region. + * The VF device BAR2 is entirely functional register space. + */ + return pci_resource_len(vdev->pdev, index); +} + static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, size_t count, loff_t *ppos, size_t *new_count) @@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, if (index == VFIO_PCI_BAR2_REGION_INDEX) { loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); /* Check if access is for migration control region */ if (pos >= end) return -EINVAL; @@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev, index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); if (index == VFIO_PCI_BAR2_REGION_INDEX) { u64 req_len, pgoff, req_start; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); req_len = vma->vm_end - vma->vm_start; pgoff = vma->vm_pgoff & ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); @@ -1324,43 +1385,23 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, return vfio_pci_core_read(core_vdev, buf, new_count, ppos); } -static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); - if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + if (info->index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - /* - * ACC VF dev BAR2 region consists of both functional - * register space and migration control register space. - * Report only the functional region to Guest. - */ - info.size = pci_resource_len(pdev, info.index) / 2; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP; + info->size = hisi_acc_get_resource_len(vdev, info->index); - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } - } - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1521,7 +1562,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) hisi_acc_vf_disable_fds(hisi_acc_vdev); mutex_lock(&hisi_acc_vdev->open_mutex); hisi_acc_vdev->dev_opened = false; - iounmap(vf_qm->io_base); + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + iounmap(vf_qm->io_base); mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } @@ -1557,13 +1599,15 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, - .ioctl = hisi_acc_vfio_pci_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -1577,6 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 91002ceeebc1..cd55eba64dfb 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -50,8 +50,10 @@ #define QM_QUE_ISO_CFG_V 0x0030 #define QM_PAGE_SIZE 0x0034 -#define QM_EQC_DW0 0X8000 -#define QM_AEQC_DW0 0X8020 +#define QM_EQC_VF_DW0 0X8000 +#define QM_AEQC_VF_DW0 0X8020 +#define QM_EQC_PF_DW0 0x1c00 +#define QM_AEQC_PF_DW0 0x1c20 #define ACC_DRV_MAJOR_VER 1 #define ACC_DRV_MINOR_VER 0 @@ -59,6 +61,22 @@ #define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC #define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE +#define QM_MIG_REGION_OFFSET 0x180000 +#define QM_MIG_REGION_SIZE 0x2000 + +/** + * On HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the latter 32KB of the VF's BAR2. + * The Guest is only provided with the first 32KB of the VF's BAR2. + * On HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the PF's BAR2, and the entire 64KB + * of the VF's BAR2 is allocated to the Guest. + */ +enum hw_drv_mode { + HW_ACC_MIG_VF_CTRL = 0, + HW_ACC_MIG_PF_CTRL, +}; + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ @@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + enum hw_drv_mode drv_mode; /* * vf_qm_state represents the QM_VF_STATE register value. * It is set by Guest driver for the ACC VF dev indicating diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ec47e736a8e..9c5970411d07 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e346392b72f6..84d142a47ec6 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,8 @@ #include <linux/vfio_pci_core.h> #include <linux/delay.h> #include <linux/jiffies.h> +#include <linux/pci-p2pdma.h> +#include <linux/pm_runtime.h> /* * The device memory usable to the workloads running in the VM is cached @@ -58,6 +60,8 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug; + /* GPU has just been reset */ + bool reset_done; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -102,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) mutex_init(&nvdev->remap_lock); } + /* + * GPU readiness is checked by reading the BAR0 registers. + * + * ioremap BAR0 to ensure that the BAR0 mapping is present before + * register reads on first fault before establishing any GPU + * memory mapping. + */ + ret = vfio_pci_core_setup_barmap(vdev, 0); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + vfio_pci_core_finish_enable(vdev); return 0; @@ -130,6 +147,106 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static int nvgrace_gpu_wait_device_ready(void __iomem *io) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) + return 0; + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + + return -ETIME; +} + +/* + * If the GPU memory is accessed by the CPU while the GPU is not ready + * after reset, it can cause harmless corrected RAS events to be logged. + * Make sure the GPU is ready before establishing the mappings. + */ +static int +nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; + + lockdep_assert_held_read(&vdev->memory_lock); + + if (!nvdev->reset_done) + return 0; + + if (!__vfio_pci_memory_enabled(vdev)) + return -EIO; + + ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]); + if (ret) + return ret; + + nvdev->reset_done = false; + + return 0; +} + +static unsigned long addr_to_pgoff(struct vm_area_struct *vma, + unsigned long addr) +{ + u64 pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = &nvdev->core_device; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + vm_fault_t ret = VM_FAULT_FALLBACK; + struct mem_region *memregion; + unsigned long pfn, addr; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return VM_FAULT_SIGBUS; + + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) { + if (vdev->pm_runtime_engaged || + nvgrace_gpu_check_device_ready(nvdev)) + return VM_FAULT_SIGBUS; + + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } + } + + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s order = %d pfn 0x%lx: 0x%x\n", + __func__, order, pfn, + (unsigned int)ret); + + return ret; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, +#endif +}; + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { @@ -137,10 +254,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); struct mem_region *memregion; - unsigned long start_pfn; u64 req_len, pgoff, end; unsigned int index; - int ret = 0; index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); @@ -157,17 +272,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || - check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) return -EOVERFLOW; /* - * Check that the mapping request does not go beyond available device - * memory size + * Check that the mapping request does not go beyond the exposed + * device memory size. */ if (end > memregion->memlength) return -EINVAL; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + /* * The carved out region of the device memory needs the NORMAL_NC * property. Communicate as such to the hypervisor. @@ -184,56 +300,31 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - /* - * Perform a PFN map to the memory and back the device BAR by the - * GPU memory. - * - * The available GPU memory size may not be power-of-2 aligned. The - * remainder is only backed by vfio_device_ops read/write handlers. - * - * During device reset, the GPU is safely disconnected to the CPU - * and access to the BAR will be immediately returned preventing - * machine check. - */ - ret = remap_pfn_range(vma, vma->vm_start, start_pfn, - req_len, vma->vm_page_prot); - if (ret) - return ret; - - vma->vm_pgoff = start_ |
