about summary refs log tree commit diff
path: root/drivers/vfio
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/vfio')
-rw-r--r-- drivers/vfio/cdx/main.c 29
-rw-r--r-- drivers/vfio/device_cdev.c 2
-rw-r--r-- drivers/vfio/fsl-mc/vfio_fsl_mc.c 43
-rw-r--r-- drivers/vfio/pci/Kconfig 3
-rw-r--r-- drivers/vfio/pci/Makefile 1
-rw-r--r-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c 171
-rw-r--r-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h 23
-rw-r--r-- drivers/vfio/pci/mlx5/main.c 1
-rw-r--r-- drivers/vfio/pci/nvgrace-gpu/main.c 342
-rw-r--r-- drivers/vfio/pci/pds/vfio_dev.c 1
-rw-r--r-- drivers/vfio/pci/qat/main.c 1
-rw-r--r-- drivers/vfio/pci/vfio_pci.c 6
-rw-r--r-- drivers/vfio/pci/vfio_pci_config.c 23
-rw-r--r-- drivers/vfio/pci/vfio_pci_core.c 300
-rw-r--r-- drivers/vfio/pci/vfio_pci_dmabuf.c 316
-rw-r--r-- drivers/vfio/pci/vfio_pci_intrs.c 52
-rw-r--r-- drivers/vfio/pci/vfio_pci_priv.h 28
-rw-r--r-- drivers/vfio/pci/virtio/common.h 5
-rw-r--r-- drivers/vfio/pci/virtio/legacy_io.c 38
-rw-r--r-- drivers/vfio/pci/virtio/main.c 5
-rw-r--r-- drivers/vfio/platform/vfio_amba.c 1
-rw-r--r-- drivers/vfio/platform/vfio_platform.c 1
-rw-r--r-- drivers/vfio/platform/vfio_platform_common.c 40
-rw-r--r-- drivers/vfio/platform/vfio_platform_private.h 3
-rw-r--r-- drivers/vfio/vfio_main.c 51
25 files changed, 1067 insertions, 419 deletions
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
index 5dd5f5ad7686..253031b86b60 100644
--- a/drivers/vfio/cdx/main.c
+++ b/drivers/vfio/cdx/main.c
@@ -129,28 +129,22 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
}
-static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev,
- struct vfio_region_info __user *arg)
+static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ struct vfio_cdx_device *vdev =
+ container_of(core_vdev, struct vfio_cdx_device, vdev);
struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev);
- struct vfio_region_info info;
-
- if (copy_from_user(&info, arg, minsz))
- return -EFAULT;
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.index >= cdx_dev->res_count)
+ if (info->index >= cdx_dev->res_count)
return -EINVAL;
/* map offset to the physical address */
- info.offset = vfio_cdx_index_to_offset(info.index);
- info.size = vdev->regions[info.index].size;
- info.flags = vdev->regions[info.index].flags;
-
- return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
+ info->offset = vfio_cdx_index_to_offset(info->index);
+ info->size = vdev->regions[info->index].size;
+ info->flags = vdev->regions[info->index].flags;
+ return 0;
}
static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev,
@@ -219,8 +213,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev,
switch (cmd) {
case VFIO_DEVICE_GET_INFO:
return vfio_cdx_ioctl_get_info(vdev, uarg);
- case VFIO_DEVICE_GET_REGION_INFO:
- return vfio_cdx_ioctl_get_region_info(vdev, uarg);
case VFIO_DEVICE_GET_IRQ_INFO:
return vfio_cdx_ioctl_get_irq_info(vdev, uarg);
case VFIO_DEVICE_SET_IRQS:
@@ -284,6 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = {
.open_device = vfio_cdx_open_device,
.close_device = vfio_cdx_close_device,
.ioctl = vfio_cdx_ioctl,
+ .get_region_info_caps = vfio_cdx_ioctl_get_region_info,
.device_feature = vfio_cdx_ioctl_feature,
.mmap = vfio_cdx_mmap,
.bind_iommufd = vfio_iommufd_physical_bind,
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 480cac3a0c27..8ceca24ac136 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -99,7 +99,7 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
return ret;
if (user_size < minsz)
return -EINVAL;
- ret = copy_struct_from_user(&bind, minsz, arg, user_size);
+ ret = copy_struct_from_user(&bind, sizeof(bind), arg, user_size);
if (ret)
return ret;
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
index 76ccbab0e3d6..ba47100f28c1 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
@@ -117,6 +117,24 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev)
fsl_mc_cleanup_irq_pool(mc_cont);
}
+static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_fsl_mc_device *vdev =
+ container_of(core_vdev, struct vfio_fsl_mc_device, vdev);
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+
+ if (info->index >= mc_dev->obj_desc.region_count)
+ return -EINVAL;
+
+ /* map offset to the physical address */
+ info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index);
+ info->size = vdev->regions[info->index].size;
+ info->flags = vdev->regions[info->index].flags;
+ return 0;
+}
+
static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
@@ -149,30 +167,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev,
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
}
- case VFIO_DEVICE_GET_REGION_INFO:
- {
- struct vfio_region_info info;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.index >= mc_dev->obj_desc.region_count)
- return -EINVAL;
-
- /* map offset to the physical address */
- info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index);
- info.size = vdev->regions[info.index].size;
- info.flags = vdev->regions[info.index].flags;
-
- if (copy_to_user((void __user *)arg, &info, minsz))
- return -EFAULT;
- return 0;
- }
case VFIO_DEVICE_GET_IRQ_INFO:
{
struct vfio_irq_info info;
@@ -589,6 +583,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = {
.open_device = vfio_fsl_mc_open_device,
.close_device = vfio_fsl_mc_close_device,
.ioctl = vfio_fsl_mc_ioctl,
+ .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info,
.read = vfio_fsl_mc_read,
.write = vfio_fsl_mc_write,
.mmap = vfio_fsl_mc_mmap,
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 2b0172f54665..2b9fca00e9e8 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM
To enable s390x KVM vfio-pci extensions, say Y.
+config VFIO_PCI_DMABUF
+ def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER
+
source "drivers/vfio/pci/mlx5/Kconfig"
source "drivers/vfio/pci/hisilicon/Kconfig"
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index cf00c0a7e55c..53f59226ae01 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,6 +2,7 @@
vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
+vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o
obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
vfio-pci-y := vfio_pci.o
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index fde33f54e99e..cf45f6370c36 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr)
return 0;
}
+static void qm_xqc_reg_offsets(struct hisi_qm *qm,
+ u32 *eqc_addr, u32 *aeqc_addr)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev =
+ container_of(qm, struct hisi_acc_vf_core_device, vf_qm);
+
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) {
+ *eqc_addr = QM_EQC_VF_DW0;
+ *aeqc_addr = QM_AEQC_VF_DW0;
+ } else {
+ *eqc_addr = QM_EQC_PF_DW0;
+ *aeqc_addr = QM_AEQC_PF_DW0;
+ }
+}
+
static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
{
struct device *dev = &qm->pdev->dev;
+ u32 eqc_addr, aeqc_addr;
int ret;
ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
@@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
return ret;
}
+ qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr);
/* QM_EQC_DW has 7 regs */
- ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+ ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7);
if (ret) {
dev_err(dev, "failed to read QM_EQC_DW\n");
return ret;
}
/* QM_AEQC_DW has 7 regs */
- ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+ ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7);
if (ret) {
dev_err(dev, "failed to read QM_AEQC_DW\n");
return ret;
@@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
{
struct device *dev = &qm->pdev->dev;
+ u32 eqc_addr, aeqc_addr;
int ret;
/* Check VF state */
@@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
return ret;
}
+ qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr);
/* QM_EQC_DW has 7 regs */
- ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7);
+ ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7);
if (ret) {
dev_err(dev, "failed to write QM_EQC_DW\n");
return ret;
}
/* QM_AEQC_DW has 7 regs */
- ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7);
+ ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7);
if (ret) {
dev_err(dev, "failed to write QM_AEQC_DW\n");
return ret;
@@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm;
struct pci_dev *vf_dev = vdev->pdev;
+ u32 val;
- /*
- * ACC VF dev BAR2 region consists of both functional register space
- * and migration control register space. For migration to work, we
- * need access to both. Hence, we map the entire BAR2 region here.
- * But unnecessarily exposing the migration BAR region to the Guest
- * has the potential to prevent/corrupt the Guest migration. Hence,
- * we restrict access to the migration control space from
- * Guest(Please see mmap/ioctl/read/write override functions).
- *
- * Please note that it is OK to expose the entire VF BAR if migration
- * is not supported or required as this cannot affect the ACC PF
- * configurations.
- *
- * Also the HiSilicon ACC VF devices supported by this driver on
- * HiSilicon hardware platforms are integrated end point devices
- * and the platform lacks the capability to perform any PCIe P2P
- * between these devices.
- */
+ val = readl(pf_qm->io_base + QM_MIG_REGION_SEL);
+ if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN))
+ hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL;
+ else
+ hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL;
- vf_qm->io_base =
- ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
- pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
- if (!vf_qm->io_base)
- return -EIO;
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) {
+ /*
+ * On hardware platforms greater than QM_HW_V3, the migration function
+ * register is placed in the BAR2 configuration region of the PF,
+ * and each VF device occupies 8KB of configuration space.
+ */
+ vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET +
+ hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE;
+ } else {
+ /*
+ * ACC VF dev BAR2 region consists of both functional register space
+ * and migration control register space. For migration to work, we
+ * need access to both. Hence, we map the entire BAR2 region here.
+ * But unnecessarily exposing the migration BAR region to the Guest
+ * has the potential to prevent/corrupt the Guest migration. Hence,
+ * we restrict access to the migration control space from
+ * Guest (please see the mmap/ioctl/read/write override functions).
+ *
+ * Please note that it is OK to expose the entire VF BAR if migration
+ * is not supported or required as this cannot affect the ACC PF
+ * configurations.
+ *
+ * Also the HiSilicon ACC VF devices supported by this driver on
+ * HiSilicon hardware platforms are integrated end point devices
+ * and the platform lacks the capability to perform any PCIe P2P
+ * between these devices.
+ */
+ vf_qm->io_base =
+ ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX),
+ pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX));
+ if (!vf_qm->io_base)
+ return -EIO;
+ }
vf_qm->fun_type = QM_HW_VF;
+ vf_qm->ver = pf_qm->ver;
vf_qm->pdev = vf_dev;
mutex_init(&vf_qm->mailbox_lock);
@@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev)
return !IS_ERR(pf_qm) ? pf_qm : NULL;
}
+static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev,
+ unsigned int index)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev =
+ hisi_acc_drvdata(vdev->pdev);
+
+ /*
+ * In HW_ACC_MIG_VF_CTRL mode (older hardware), the ACC VF device
+ * BAR2 region encompasses both the functional register space
+ * and the migration control register space.
+ * Only the functional half should be reported to the Guest.
+ */
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL)
+ return (pci_resource_len(vdev->pdev, index) >> 1);
+ /*
+ * In HW_ACC_MIG_PF_CTRL mode (newer hardware), the migration control
+ * registers have been moved to the PF device BAR2 region, so
+ * the VF device BAR2 is entirely functional register space.
+ */
+ return pci_resource_len(vdev->pdev, index);
+}
+
static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
size_t count, loff_t *ppos,
size_t *new_count)
@@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev,
if (index == VFIO_PCI_BAR2_REGION_INDEX) {
loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+ resource_size_t end;
+ end = hisi_acc_get_resource_len(vdev, index);
/* Check if access is for migration control region */
if (pos >= end)
return -EINVAL;
@@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev,
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
if (index == VFIO_PCI_BAR2_REGION_INDEX) {
u64 req_len, pgoff, req_start;
- resource_size_t end = pci_resource_len(vdev->pdev, index) / 2;
+ resource_size_t end;
+ end = hisi_acc_get_resource_len(vdev, index);
req_len = vma->vm_end - vma->vm_start;
pgoff = vma->vm_pgoff &
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
@@ -1324,43 +1385,23 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev,
return vfio_pci_core_read(core_vdev, buf, new_count, ppos);
}
-static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
- unsigned long arg)
+static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
- if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
- struct vfio_pci_core_device *vdev =
- container_of(core_vdev, struct vfio_pci_core_device, vdev);
- struct pci_dev *pdev = vdev->pdev;
- struct vfio_region_info info;
- unsigned long minsz;
-
- minsz = offsetofend(struct vfio_region_info, offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
- if (info.index == VFIO_PCI_BAR2_REGION_INDEX) {
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ if (info->index != VFIO_PCI_BAR2_REGION_INDEX)
+ return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
- /*
- * ACC VF dev BAR2 region consists of both functional
- * register space and migration control register space.
- * Report only the functional region to Guest.
- */
- info.size = pci_resource_len(pdev, info.index) / 2;
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE |
- VFIO_REGION_INFO_FLAG_MMAP;
+ info->size = hisi_acc_get_resource_len(vdev, info->index);
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
- }
- }
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP;
+ return 0;
}
static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev)
@@ -1521,7 +1562,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
hisi_acc_vf_disable_fds(hisi_acc_vdev);
mutex_lock(&hisi_acc_vdev->open_mutex);
hisi_acc_vdev->dev_opened = false;
- iounmap(vf_qm->io_base);
+ if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL)
+ iounmap(vf_qm->io_base);
mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_close_device(core_vdev);
}
@@ -1557,13 +1599,15 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
.release = vfio_pci_core_release_dev,
.open_device = hisi_acc_vfio_pci_open_device,
.close_device = hisi_acc_vfio_pci_close_device,
- .ioctl = hisi_acc_vfio_pci_ioctl,
+ .ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = hisi_acc_vfio_ioctl_get_region,
.device_feature = vfio_pci_core_ioctl_feature,
.read = hisi_acc_vfio_pci_read,
.write = hisi_acc_vfio_pci_write,
.mmap = hisi_acc_vfio_pci_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -1577,6 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.open_device = hisi_acc_vfio_pci_open_device,
.close_device = vfio_pci_core_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index 91002ceeebc1..cd55eba64dfb 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -50,8 +50,10 @@
#define QM_QUE_ISO_CFG_V 0x0030
#define QM_PAGE_SIZE 0x0034
-#define QM_EQC_DW0 0X8000
-#define QM_AEQC_DW0 0X8020
+#define QM_EQC_VF_DW0 0X8000
+#define QM_AEQC_VF_DW0 0X8020
+#define QM_EQC_PF_DW0 0x1c00
+#define QM_AEQC_PF_DW0 0x1c20
#define ACC_DRV_MAJOR_VER 1
#define ACC_DRV_MINOR_VER 0
@@ -59,6 +61,22 @@
#define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC
#define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE
+#define QM_MIG_REGION_OFFSET 0x180000
+#define QM_MIG_REGION_SIZE 0x2000
+
+/*
+ * In HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live
+ * migration functionality is located in the latter 32KB of the VF's BAR2.
+ * The Guest is only provided with the first 32KB of the VF's BAR2.
+ * In HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live
+ * migration functionality is located in the PF's BAR2, and the entire 64KB
+ * of the VF's BAR2 is allocated to the Guest.
+ */
+enum hw_drv_mode {
+ HW_ACC_MIG_VF_CTRL = 0,
+ HW_ACC_MIG_PF_CTRL,
+};
+
struct acc_vf_data {
#define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state)
/* QM match information */
@@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device {
struct pci_dev *vf_dev;
struct hisi_qm *pf_qm;
struct hisi_qm vf_qm;
+ enum hw_drv_mode drv_mode;
/*
* vf_qm_state represents the QM_VF_STATE register value.
* It is set by Guest driver for the ACC VF dev indicating
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 7ec47e736a8e..9c5970411d07 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
.open_device = mlx5vf_pci_open_device,
.close_device = mlx5vf_pci_close_device,
.ioctl = vfio_pci_core_ioctl,
+ .get_region_info_caps = vfio_pci_ioctl_get_region_info,
.device_feature = vfio_pci_core_ioctl_feature,
.read = vfio_pci_core_read,
.write = vfio_pci_core_write,
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index e346392b72f6..84d142a47ec6 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -7,6 +7,8 @@
#include <linux/vfio_pci_core.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/pci-p2pdma.h>
+#include <linux/pm_runtime.h>
/*
* The device memory usable to the workloads running in the VM is cached
@@ -58,6 +60,8 @@ struct nvgrace_gpu_pci_core_device {
/* Lock to control device memory kernel mapping */
struct mutex remap_lock;
bool has_mig_hw_bug;
+ /* GPU has just been reset */
+ bool reset_done;
};
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -102,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev)
mutex_init(&nvdev->remap_lock);
}
+ /*
+ * GPU readiness is checked by reading the BAR0 registers.
+ *
+ * ioremap BAR0 here so that its mapping is already present when the
+ * registers are read on the first fault, i.e. before any GPU
+ * memory mapping is established.
+ */
+ ret = vfio_pci_core_setup_barmap(vdev, 0);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
+ }
+
vfio_pci_core_finish_enable(vdev);
return 0;
@@ -130,6 +147,106 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
vfio_pci_core_close_device(core_vdev);
}
+static int nvgrace_gpu_wait_device_ready(void __iomem *io)
+{
+ unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+
+ do {
+ if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+ (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY))
+ return 0;
+ msleep(POLL_QUANTUM_MS);
+ } while (!time_after(jiffies, timeout));
+
+ return -ETIME;
+}
+
+/*
+ * If the GPU memory is accessed by the CPU while the GPU is not ready
+ * after reset, it can cause harmless corrected RAS events to be logged.
+ * Make sure the GPU is ready before establishing the mappings.
+ */
+static int
+nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
+{
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ int ret;
+
+ lockdep_assert_held_read(&vdev->memory_lock);
+
+ if (!nvdev->reset_done)
+ return 0;
+
+ if (!__vfio_pci_memory_enabled(vdev))
+ return -EIO;
+
+ ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
+ if (ret)
+ return ret;
+
+ nvdev->reset_done = false;
+
+ return 0;
+}
+
+static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ u64 pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data;
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ unsigned int index =
+ vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+ struct mem_region *memregion;
+ unsigned long pfn, addr;
+
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+ if (!memregion)
+ return VM_FAULT_SIGBUS;
+
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
+
+ if (is_aligned_for_order(vma, addr, pfn, order)) {
+ scoped_guard(rwsem_read, &vdev->memory_lock) {
+ if (vdev->pm_runtime_engaged ||
+ nvgrace_gpu_check_device_ready(nvdev))
+ return VM_FAULT_SIGBUS;
+
+ ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+ }
+ }
+
+ dev_dbg_ratelimited(&vdev->pdev->dev,
+ "%s order = %d pfn 0x%lx: 0x%x\n",
+ __func__, order, pfn,
+ (unsigned int)ret);
+
+ return ret;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+ return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+ .fault = nvgrace_gpu_vfio_pci_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+ .huge_fault = nvgrace_gpu_vfio_pci_huge_fault,
+#endif
+};
+
static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
struct vm_area_struct *vma)
{
@@ -137,10 +254,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
struct mem_region *memregion;
- unsigned long start_pfn;
u64 req_len, pgoff, end;
unsigned int index;
- int ret = 0;
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
@@ -157,17 +272,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
- check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
return -EOVERFLOW;
/*
- * Check that the mapping request does not go beyond available device
- * memory size
+ * Check that the mapping request does not go beyond the exposed
+ * device memory size.
*/
if (end > memregion->memlength)
return -EINVAL;
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+
/*
* The carved out region of the device memory needs the NORMAL_NC
* property. Communicate as such to the hypervisor.
@@ -184,56 +300,31 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
}
- /*
- * Perform a PFN map to the memory and back the device BAR by the
- * GPU memory.
- *
- * The available GPU memory size may not be power-of-2 aligned. The
- * remainder is only backed by vfio_device_ops read/write handlers.
- *
- * During device reset, the GPU is safely disconnected to the CPU
- * and access to the BAR will be immediately returned preventing
- * machine check.
- */
- ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
- req_len, vma->vm_page_prot);
- if (ret)
- return ret;
-
- vma->vm_pgoff = start_pfn;
+ vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
+ vma->vm_private_data = nvdev;
return 0;
}
-static long
-nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned long arg)
+static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps)
{
struct nvgrace_gpu_pci_core_device *nvdev =
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
struct vfio_region_info_cap_sparse_mmap *sparse;
- struct vfio_region_info info;
struct mem_region *memregion;
u32 size;
int ret;
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
/*
* Request to determine the BAR region information. Send the
* GPU memory information.
*/
- memregion = nvgrace_gpu_memregion(info.index, nvdev);
+ memregion = nvgrace_gpu_memregion(info->index, nvdev);
if (!memregion)
- return vfio_pci_core_ioctl(core_vdev,
- VFIO_DEVICE_GET_REGION_INFO, arg);
+ return vfio_pci_ioctl_get_region_info(core_vdev, info, caps);
size = struct_size(sparse, areas, 1);
@@ -252,49 +343,28 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev,
sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
sparse->header.version = 1;
- ret = vfio_info_add_capability(&caps, &sparse->header, size);
+ ret = vfio_info_add_capability(caps, &sparse->header, size);
kfree(sparse);
if (ret)
return ret;
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index);
/*
* The region memory size may not be power-of-2 aligned.
* Given that the memory is a BAR and may not be
* aligned, roundup to the next power-of-2.
*/
- info.size = memregion->bar_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
+ info->size = memregion->bar_size;
+ info->flags = VFIO_REGION_INFO_FLAG_READ |
VFIO_REGION_INFO_FLAG_WRITE |
VFIO_REGION_INFO_FLAG_MMAP;
-
- if (caps.size) {
- info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
- if (info.argsz < sizeof(info) + caps.size) {
- info.argsz = sizeof(info) + caps.size;
- info.cap_offset = 0;
- } else {
- vfio_info_cap_shift(&caps, sizeof(info));
- if (copy_to_user((void __user *)arg +
- sizeof(info), caps.buf,
- caps.size)) {
- kfree(caps.buf);
- return -EFAULT;
- }
- info.cap_offset = sizeof(info);
- }
- kfree(caps.buf);
- }
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
+ return 0;
}
static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev,
unsigned int cmd, unsigned long arg)
{
switch (cmd) {
- case VFIO_DEVICE_GET_REGION_INFO:
- return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg);
case VFIO_DEVICE_IOEVENTFD:
return -ENOTTY;
case VFIO_DEVICE_RESET:
@@ -510,6 +580,7 @@ static ssize_t
nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev,