diff options
83 files changed, 12804 insertions, 1466 deletions
diff --git a/.clang-format b/.clang-format index bcf60344f9f5..b62836419ea3 100644 --- a/.clang-format +++ b/.clang-format @@ -441,8 +441,11 @@ ForEachMacros: - 'inet_lhash2_for_each_icsk' - 'inet_lhash2_for_each_icsk_continue' - 'inet_lhash2_for_each_icsk_rcu' + - 'interval_tree_for_each_double_span' + - 'interval_tree_for_each_span' - 'intlist__for_each_entry' - 'intlist__for_each_entry_safe' + - 'iopt_for_each_contig_area' - 'kcore_copy__for_each_phdr' - 'key_for_each' - 'key_for_each_safe' diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index c78da9ce0ec4..f16337bdb852 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -25,6 +25,7 @@ place where this information is gathered. ebpf/index ioctl/index iommu + iommufd media/index netlink/index sysfs-platform_profile diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst index 5f81e2a24a5c..eb045fc495a4 100644 --- a/Documentation/userspace-api/ioctl/ioctl-number.rst +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst @@ -105,6 +105,7 @@ Code Seq# Include File Comments '8' all SNP8023 advanced NIC card <mailto:mcr@solidum.com> ';' 64-7F linux/vfio.h +';' 80-FF linux/iommufd.h '=' 00-3f uapi/linux/ptp_clock.h <mailto:richardcochran@gmail.com> '@' 00-0F linux/radeonfb.h conflict! '@' 00-0F drivers/video/aty/aty128fb.c conflict! diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst new file mode 100644 index 000000000000..79dd9eb51587 --- /dev/null +++ b/Documentation/userspace-api/iommufd.rst @@ -0,0 +1,223 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +======= +IOMMUFD +======= + +:Author: Jason Gunthorpe +:Author: Kevin Tian + +Overview +======== + +IOMMUFD is the user API to control the IOMMU subsystem as it relates to managing +IO page tables from userspace using file descriptors. It intends to be general +and consumable by any driver that wants to expose DMA to userspace. These +drivers are eventually expected to deprecate any internal IOMMU logic +they may already/historically implement (e.g. vfio_iommu_type1.c). + +At minimum iommufd provides universal support of managing I/O address spaces and +I/O page tables for all IOMMUs, with room in the design to add non-generic +features to cater to specific hardware functionality. + +In this context the capital letter (IOMMUFD) refers to the subsystem while the +small letter (iommufd) refers to the file descriptors created via /dev/iommu for +use by userspace. + +Key Concepts +============ + +User Visible Objects +-------------------- + +Following IOMMUFD objects are exposed to userspace: + +- IOMMUFD_OBJ_IOAS, representing an I/O address space (IOAS), allowing map/unmap + of user space memory into ranges of I/O Virtual Address (IOVA). + + The IOAS is a functional replacement for the VFIO container, and like the VFIO + container it copies an IOVA map to a list of iommu_domains held within it. + +- IOMMUFD_OBJ_DEVICE, representing a device that is bound to iommufd by an + external driver. + +- IOMMUFD_OBJ_HW_PAGETABLE, representing an actual hardware I/O page table + (i.e. a single struct iommu_domain) managed by the iommu driver. + + The IOAS has a list of HW_PAGETABLES that share the same IOVA mapping and + it will synchronize its mapping with each member HW_PAGETABLE. + +All user-visible objects are destroyed via the IOMMU_DESTROY uAPI. + +The diagram below shows relationship between user-visible objects and kernel +datastructures (external to iommufd), with numbers referred to operations +creating the objects and links:: + + _________________________________________________________ + | iommufd | + | [1] | + | _________________ | + | | | | + | | | | + | | | | + | | | | + | | | | + | | | | + | | | [3] [2] | + | | | ____________ __________ | + | | IOAS |<--| |<------| | | + | | | |HW_PAGETABLE| | DEVICE | | + | | | |____________| |__________| | + | | | | | | + | | | | | | + | | | | | | + | | | | | | + | | | | | | + | |_________________| | | | + | | | | | + |_________|___________________|___________________|_______| + | | | + | _____v______ _______v_____ + | PFN storage | | | | + |------------>|iommu_domain| |struct device| + |____________| |_____________| + +1. IOMMUFD_OBJ_IOAS is created via the IOMMU_IOAS_ALLOC uAPI. An iommufd can + hold multiple IOAS objects. IOAS is the most generic object and does not + expose interfaces that are specific to single IOMMU drivers. All operations + on the IOAS must operate equally on each of the iommu_domains inside of it. + +2. IOMMUFD_OBJ_DEVICE is created when an external driver calls the IOMMUFD kAPI + to bind a device to an iommufd. The driver is expected to implement a set of + ioctls to allow userspace to initiate the binding operation. Successful + completion of this operation establishes the desired DMA ownership over the + device. The driver must also set the driver_managed_dma flag and must not + touch the device until this operation succeeds. + +3. IOMMUFD_OBJ_HW_PAGETABLE is created when an external driver calls the IOMMUFD + kAPI to attach a bound device to an IOAS. Similarly the external driver uAPI + allows userspace to initiate the attaching operation. If a compatible + pagetable already exists then it is reused for the attachment. Otherwise a + new pagetable object and iommu_domain is created. Successful completion of + this operation sets up the linkages among IOAS, device and iommu_domain. Once + this completes the device could do DMA. + + Every iommu_domain inside the IOAS is also represented to userspace as a + HW_PAGETABLE object. + + .. note:: + + Future IOMMUFD updates will provide an API to create and manipulate the + HW_PAGETABLE directly. + +A device can only bind to an iommufd due to DMA ownership claim and attach to at +most one IOAS object (no support of PASID yet). + +Kernel Datastructure +-------------------- + +User visible objects are backed by following datastructures: + +- iommufd_ioas for IOMMUFD_OBJ_IOAS. +- iommufd_device for IOMMUFD_OBJ_DEVICE. +- iommufd_hw_pagetable for IOMMUFD_OBJ_HW_PAGETABLE. + +Several terminologies when looking at these datastructures: + +- Automatic domain - refers to an iommu domain created automatically when + attaching a device to an IOAS object. This is compatible to the semantics of + VFIO type1. + +- Manual domain - refers to an iommu domain designated by the user as the + target pagetable to be attached to by a device. Though currently there are + no uAPIs to directly create such domain, the datastructure and algorithms + are ready for handling that use case. + +- In-kernel user - refers to something like a VFIO mdev that is using the + IOMMUFD access interface to access the IOAS. This starts by creating an + iommufd_access object that is similar to the domain binding a physical device + would do. The access object will then allow converting IOVA ranges into struct + page * lists, or doing direct read/write to an IOVA. + +iommufd_ioas serves as the metadata datastructure to manage how IOVA ranges are +mapped to memory pages, composed of: + +- struct io_pagetable holding the IOVA map +- struct iopt_area's representing populated portions of IOVA +- struct iopt_pages representing the storage of PFNs +- struct iommu_domain representing the IO page table in the IOMMU +- struct iopt_pages_access representing in-kernel users of PFNs +- struct xarray pinned_pfns holding a list of pages pinned by in-kernel users + +Each iopt_pages represents a logical linear array of full PFNs. The PFNs are +ultimately derived from userspace VAs via an mm_struct. Once they have been +pinned the PFNs are stored in IOPTEs of an iommu_domain or inside the pinned_pfns +xarray if they have been pinned through an iommufd_access. + +PFN have to be copied between all combinations of storage locations, depending +on what domains are present and what kinds of in-kernel "software access" users +exist. The mechanism ensures that a page is pinned only once. + +An io_pagetable is composed of iopt_areas pointing at iopt_pages, along with a +list of iommu_domains that mirror the IOVA to PFN map. + +Multiple io_pagetable-s, through their iopt_area-s, can share a single +iopt_pages which avoids multi-pinning and double accounting of page +consumption. + +iommufd_ioas is sharable between subsystems, e.g. VFIO and VDPA, as long as +devices managed by different subsystems are bound to a same iommufd. + +IOMMUFD User API +================ + +.. kernel-doc:: include/uapi/linux/iommufd.h + +IOMMUFD Kernel API +================== + +The IOMMUFD kAPI is device-centric with group-related tricks managed behind the +scene. This allows the external drivers calling such kAPI to implement a simple +device-centric uAPI for connecting its device to an iommufd, instead of +explicitly imposing the group semantics in its uAPI as VFIO does. + +.. kernel-doc:: drivers/iommu/iommufd/device.c + :export: + +.. kernel-doc:: drivers/iommu/iommufd/main.c + :export: + +VFIO and IOMMUFD +---------------- + +Connecting a VFIO device to iommufd can be done in two ways. + +First is a VFIO compatible way by directly implementing the /dev/vfio/vfio +container IOCTLs by mapping them into io_pagetable operations. Doing so allows +the use of iommufd in legacy VFIO applications by symlinking /dev/vfio/vfio to +/dev/iommufd or extending VFIO to SET_CONTAINER using an iommufd instead of a +container fd. + +The second approach directly extends VFIO to support a new set of device-centric +user API based on aforementioned IOMMUFD kernel API. It requires userspace +change but better matches the IOMMUFD API semantics and easier to support new +iommufd features when comparing it to the first approach. + +Currently both approaches are still work-in-progress. + +There are still a few gaps to be resolved to catch up with VFIO type1, as +documented in iommufd_vfio_check_extension(). + +Future TODOs +============ + +Currently IOMMUFD supports only kernel-managed I/O page table, similar to VFIO +type1. New features on the radar include: + + - Binding iommu_domain's to PASID/SSID + - Userspace page tables, for ARM, x86 and S390 + - Kernel bypass'd invalidation of user page tables + - Re-use of the KVM page table in the IOMMU + - Dirty page tracking in the IOMMU + - Runtime Increase/Decrease of IOPTE size + - PRI support with faults resolved in userspace diff --git a/MAINTAINERS b/MAINTAINERS index 2ada2bddb809..abb08d7cc92d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10793,6 +10793,18 @@ F: drivers/iommu/dma-iommu.h F: drivers/iommu/iova.c F: include/linux/iova.h +IOMMUFD +M: Jason Gunthorpe <jgg@nvidia.com> +M: Kevin Tian <kevin.tian@intel.com> +L: iommu@lists.linux.dev +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd.git +F: Documentation/userspace-api/iommufd.rst +F: drivers/iommu/iommufd/ +F: include/linux/iommufd.h +F: include/uapi/linux/iommufd.h +F: tools/testing/selftests/iommu/ + IOMMU SUBSYSTEM M: Joerg Roedel <joro@8bytes.org> M: Will Deacon <will@kernel.org> diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index a9b96b18772f..e13e92609943 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -6,7 +6,6 @@ #include <linux/pci.h> #include <linux/device.h> #include <linux/sched/task.h> -#include <linux/intel-svm.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/cdev.h> #include <linux/fs.h> @@ -100,7 +99,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) filp->private_data = ctx; if (device_user_pasid_enabled(idxd)) { - sva = iommu_sva_bind_device(dev, current->mm, NULL); + sva = iommu_sva_bind_device(dev, current->mm); if (IS_ERR(sva)) { rc = PTR_ERR(sva); dev_err(dev, "pasid allocation failed: %d\n", rc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 09cbf0c179ba..529ea09c9094 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -14,7 +14,6 @@ #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/device.h> #include <linux/idr.h> -#include <linux/intel-svm.h> #include <linux/iommu.h> #include <uapi/linux/idxd.h> #include <linux/dmaengine.h> @@ -502,29 +501,7 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d static int idxd_enable_system_pasid(struct idxd_device *idxd) { - int flags; - unsigned int pasid; - struct iommu_sva *sva; - - flags = SVM_FLAG_SUPERVISOR_MODE; - - sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags); - if (IS_ERR(sva)) { - dev_warn(&idxd->pdev->dev, - "iommu sva bind failed: %ld\n", PTR_ERR(sva)); - return PTR_ERR(sva); - } - - pasid = iommu_sva_get_pasid(sva); - if (pasid == IOMMU_PASID_INVALID) { - iommu_sva_unbind_device(sva); - return -ENODEV; - } - - idxd->sva = sva; - idxd->pasid = pasid; - dev_dbg(&idxd->pdev->dev, "system pasid: %u\n", pasid); - return 0; + return -EOPNOTSUPP; } static void idxd_disable_system_pasid(struct idxd_device *idxd) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 077892a9aa8f..539447eda665 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -669,9 +669,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) vgpu->attached = true; - kvmgt_protect_table_init(vgpu); - gvt_cache_init(vgpu); - vgpu->track_node.track_write = kvmgt_page_track_write; vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot; kvm_get_kvm(vgpu->vfio_device.kvm); @@ -715,6 +712,11 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev) kvmgt_protect_table_destroy(vgpu); gvt_cache_destroy(vgpu); + WARN_ON(vgpu->nr_cache_entries); + + vgpu->gfn_cache = RB_ROOT; + vgpu->dma_addr_cache = RB_ROOT; + intel_vgpu_release_msi_eventfd_ctx(vgpu); |
