summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-12-04 18:42:48 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-12-04 18:42:48 -0800
commita3ebb59eee2e558e8f8f27fc3f75cd367f17cd8e (patch)
tree934bbee766e0cce55cfcaec5dcae856c0361af37 /include
parentce5cfb0fa20dc6454da039612e34325b7b4a8243 (diff)
parentd721f52e31553a848e0e9947ca15a49c5674aef3 (diff)
Merge tag 'vfio-v6.19-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson: - Move libvfio selftest artifacts in preparation of more tightly coupled integration with KVM selftests (David Matlack) - Fix comment typo in mtty driver (Chu Guangqing) - Support for new hardware revision in the hisi_acc vfio-pci variant driver where the migration registers can now be accessed via the PF. When enabled for this support, the full BAR can be exposed to the user (Longfang Liu) - Fix vfio cdev support for VF token passing, using the correct size for the kernel structure, thereby actually allowing userspace to provide a non-zero UUID token. Also set the match token callback for the hisi_acc, fixing VF token support for this this vfio-pci variant driver (Raghavendra Rao Ananta) - Introduce internal callbacks on vfio devices to simplify and consolidate duplicate code for generating VFIO_DEVICE_GET_REGION_INFO data, removing various ioctl intercepts with a more structured solution (Jason Gunthorpe) - Introduce dma-buf support for vfio-pci devices, allowing MMIO regions to be exposed through dma-buf objects with lifecycle managed through move operations. This enables low-level interactions such as a vfio-pci based SPDK drivers interacting directly with dma-buf capable RDMA devices to enable peer-to-peer operations. IOMMUFD is also now able to build upon this support to fill a long standing feature gap versus the legacy vfio type1 IOMMU backend with an implementation of P2P support for VM use cases that better manages the lifecycle of the P2P mapping (Leon Romanovsky, Jason Gunthorpe, Vivek Kasireddy) - Convert eventfd triggering for error and request signals to use RCU mechanisms in order to avoid a 3-way lockdep reported deadlock issue (Alex Williamson) - Fix a 32-bit overflow introduced via dma-buf support manifesting with large DMA buffers (Alex Mastro) - Convert nvgrace-gpu vfio-pci variant driver to insert mappings on fault rather than at mmap time. This conversion serves both to make use of huge PFNMAPs but also to both avoid corrected RAS events during reset by now being subject to vfio-pci-core's use of unmap_mapping_range(), and to enable a device readiness test after reset (Ankit Agrawal) - Refactoring of vfio selftests to support multi-device tests and split code to provide better separation between IOMMU and device objects. This work also enables a new test suite addition to measure parallel device initialization latency (David Matlack) * tag 'vfio-v6.19-rc1' of https://github.com/awilliam/linux-vfio: (65 commits) vfio: selftests: Add vfio_pci_device_init_perf_test vfio: selftests: Eliminate INVALID_IOVA vfio: selftests: Split libvfio.h into separate header files vfio: selftests: Move vfio_selftests_*() helpers into libvfio.c vfio: selftests: Rename vfio_util.h to libvfio.h vfio: selftests: Stop passing device for IOMMU operations vfio: selftests: Move IOVA allocator into iova_allocator.c vfio: selftests: Move IOMMU library code into iommu.c vfio: selftests: Rename struct vfio_dma_region to dma_region vfio: selftests: Upgrade driver logging to dev_err() vfio: selftests: Prefix logs with device BDF where relevant vfio: selftests: Eliminate overly chatty logging vfio: selftests: Support multiple devices in the same container/iommufd vfio: selftests: Introduce struct iommu vfio: selftests: Rename struct vfio_iommu_mode to iommu_mode vfio: selftests: Allow passing multiple BDFs on the command line vfio: selftests: Split run.sh into separate scripts vfio: selftests: Move run.sh into scripts directory vfio/nvgrace-gpu: wait for the GPU mem to be ready vfio/nvgrace-gpu: Inform devmem unmapped after reset ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/dma-buf-mapping.h17
-rw-r--r--include/linux/dma-buf.h11
-rw-r--r--include/linux/hisi_acc_qm.h3
-rw-r--r--include/linux/pci-p2pdma.h120
-rw-r--r--include/linux/vfio.h6
-rw-r--r--include/linux/vfio_pci_core.h69
-rw-r--r--include/uapi/linux/vfio.h28
7 files changed, 205 insertions, 49 deletions
diff --git a/include/linux/dma-buf-mapping.h b/include/linux/dma-buf-mapping.h
new file mode 100644
index 000000000000..a3c0ce2d3a42
--- /dev/null
+++ b/include/linux/dma-buf-mapping.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * DMA BUF Mapping Helpers
+ *
+ */
+#ifndef __DMA_BUF_MAPPING_H__
+#define __DMA_BUF_MAPPING_H__
+#include <linux/dma-buf.h>
+
+struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach,
+ struct p2pdma_provider *provider,
+ struct dma_buf_phys_vec *phys_vec,
+ size_t nr_ranges, size_t size,
+ enum dma_data_direction dir);
+void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt,
+ enum dma_data_direction dir);
+#endif
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index d58e329ac0e7..0bc492090237 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/dma-fence.h>
#include <linux/wait.h>
+#include <linux/pci-p2pdma.h>
struct device;
struct dma_buf;
@@ -531,6 +532,16 @@ struct dma_buf_export_info {
};
/**
+ * struct dma_buf_phys_vec - describe continuous chunk of memory
+ * @paddr: physical address of that chunk
+ * @len: Length of this chunk
+ */
+struct dma_buf_phys_vec {
+ phys_addr_t paddr;
+ size_t len;
+};
+
+/**
* DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters
* @name: export-info name
*
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index c4690e365ade..ca1ec437a3ca 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -99,6 +99,9 @@
#define QM_DEV_ALG_MAX_LEN 256
+#define QM_MIG_REGION_SEL 0x100198
+#define QM_MIG_REGION_EN BIT(0)
+
/* uacce mode of the driver */
#define UACCE_MODE_NOUACCE 0 /* don't use uacce */
#define UACCE_MODE_SVA 1 /* use uacce sva mode */
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 951f81a38f3a..517e121d2598 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -16,7 +16,58 @@
struct block_device;
struct scatterlist;
+/**
+ * struct p2pdma_provider
+ *
+ * A p2pdma provider is a range of MMIO address space available to the CPU.
+ */
+struct p2pdma_provider {
+ struct device *owner;
+ u64 bus_offset;
+};
+
+enum pci_p2pdma_map_type {
+ /*
+ * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before
+ * the mapping type has been calculated. Exported routines for the API
+ * will never return this value.
+ */
+ PCI_P2PDMA_MAP_UNKNOWN = 0,
+
+ /*
+ * Not a PCI P2PDMA transfer.
+ */
+ PCI_P2PDMA_MAP_NONE,
+
+ /*
+ * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
+ * traverse the host bridge and the host bridge is not in the
+ * allowlist. DMA Mapping routines should return an error when
+ * this is returned.
+ */
+ PCI_P2PDMA_MAP_NOT_SUPPORTED,
+
+ /*
+ * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to
+ * each other directly through a PCI switch and the transaction will
+ * not traverse the host bridge. Such a mapping should program
+ * the DMA engine with PCI bus addresses.
+ */
+ PCI_P2PDMA_MAP_BUS_ADDR,
+
+ /*
+ * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
+ * to each other, but the transaction traverses a host bridge on the
+ * allowlist. In this case, a normal mapping either with CPU physical
+ * addresses (in the case of dma-direct) or IOVA addresses (in the
+ * case of IOMMUs) should be used to program the DMA engine.
+ */
+ PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
+};
+
#ifdef CONFIG_PCI_P2PDMA
+int pcim_p2pdma_init(struct pci_dev *pdev);
+struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar);
int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
u64 offset);
int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
@@ -33,7 +84,18 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
bool *use_p2pdma);
ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
bool use_p2pdma);
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider,
+ struct device *dev);
#else /* CONFIG_PCI_P2PDMA */
+static inline int pcim_p2pdma_init(struct pci_dev *pdev)
+{
+ return -EOPNOTSUPP;
+}
+static inline struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev,
+ int bar)
+{
+ return NULL;
+}
static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
size_t size, u64 offset)
{
@@ -85,6 +147,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
{
return sprintf(page, "none\n");
}
+static inline enum pci_p2pdma_map_type
+pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
+{
+ return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+}
#endif /* CONFIG_PCI_P2PDMA */
@@ -99,51 +166,12 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client)
return pci_p2pmem_find_many(&client, 1);
}
-enum pci_p2pdma_map_type {
- /*
- * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before
- * the mapping type has been calculated. Exported routines for the API
- * will never return this value.
- */
- PCI_P2PDMA_MAP_UNKNOWN = 0,
-
- /*
- * Not a PCI P2PDMA transfer.
- */
- PCI_P2PDMA_MAP_NONE,
-
- /*
- * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
- * traverse the host bridge and the host bridge is not in the
- * allowlist. DMA Mapping routines should return an error when
- * this is returned.
- */
- PCI_P2PDMA_MAP_NOT_SUPPORTED,
-
- /*
- * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to
- * each other directly through a PCI switch and the transaction will
- * not traverse the host bridge. Such a mapping should program
- * the DMA engine with PCI bus addresses.
- */
- PCI_P2PDMA_MAP_BUS_ADDR,
-
- /*
- * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
- * to each other, but the transaction traverses a host bridge on the
- * allowlist. In this case, a normal mapping either with CPU physical
- * addresses (in the case of dma-direct) or IOVA addresses (in the
- * case of IOMMUs) should be used to program the DMA engine.
- */
- PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
-};
-
struct pci_p2pdma_map_state {
- struct dev_pagemap *pgmap;
+ struct p2pdma_provider *mem;
enum pci_p2pdma_map_type map;
- u64 bus_off;
};
+
/* helper for pci_p2pdma_state(), do not use directly */
void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
struct device *dev, struct page *page);
@@ -162,8 +190,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev,
struct page *page)
{
if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
- if (state->pgmap != page_pgmap(page))
- __pci_p2pdma_update_state(state, dev, page);
+ __pci_p2pdma_update_state(state, dev, page);
return state->map;
}
return PCI_P2PDMA_MAP_NONE;
@@ -172,16 +199,15 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev,
/**
* pci_p2pdma_bus_addr_map - Translate a physical address to a bus address
* for a PCI_P2PDMA_MAP_BUS_ADDR transfer.
- * @state: P2P state structure
+ * @provider: P2P provider structure
* @paddr: physical address to map
*
* Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer.
*/
static inline dma_addr_t
-pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr)
+pci_p2pdma_bus_addr_map(struct p2pdma_provider *provider, phys_addr_t paddr)
{
- WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR);
- return paddr + state->bus_off;
+ return paddr + provider->bus_offset;
}
#endif /* _LINUX_PCI_P2P_H */
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index eb563f538dee..e90859956514 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -21,6 +21,7 @@ struct kvm;
struct iommufd_ctx;
struct iommufd_device;
struct iommufd_access;
+struct vfio_info_cap;
/*
* VFIO devices can be placed in a set, this allows all devices to share this
@@ -132,6 +133,9 @@ struct vfio_device_ops {
size_t count, loff_t *size);
long (*ioctl)(struct vfio_device *vdev, unsigned int cmd,
unsigned long arg);
+ int (*get_region_info_caps)(struct vfio_device *vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
void (*request)(struct vfio_device *vdev, unsigned int count);
int (*match)(struct vfio_device *vdev, char *buf);
@@ -297,6 +301,8 @@ static inline void vfio_put_device(struct vfio_device *device)
int vfio_register_group_dev(struct vfio_device *device);
int vfio_register_emulated_iommu_dev(struct vfio_device *device);
void vfio_unregister_group_dev(struct vfio_device *device);
+bool vfio_device_try_get_registration(struct vfio_device *device);
+void vfio_device_put_registration(struct vfio_device *device);
int vfio_assign_device_set(struct vfio_device *device, void *set_id);
unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set);
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f541044e42a2..336a0e58b443 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <linux/vfio.h>
#include <linux/irqbypass.h>
+#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/notifier.h>
@@ -26,6 +27,13 @@
struct vfio_pci_core_device;
struct vfio_pci_region;
+struct p2pdma_provider;
+struct dma_buf_phys_vec;
+
+struct vfio_pci_eventfd {
+ struct eventfd_ctx *ctx;
+ struct rcu_head rcu;
+};
struct vfio_pci_regops {
ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
@@ -49,9 +57,48 @@ struct vfio_pci_region {
u32 flags;
};
+struct vfio_pci_device_ops {
+ int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev,
+ struct p2pdma_provider **provider,
+ unsigned int region_index,
+ struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges);
+};
+
+#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)
+int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges, phys_addr_t start,
+ phys_addr_t len);
+int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
+ struct p2pdma_provider **provider,
+ unsigned int region_index,
+ struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges);
+#else
+static inline int
+vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges,
+ size_t nr_ranges, phys_addr_t start,
+ phys_addr_t len)
+{
+ return -EINVAL;
+}
+static inline int vfio_pci_core_get_dmabuf_phys(
+ struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider,
+ unsigned int region_index, struct dma_buf_phys_vec *phys_vec,
+ struct vfio_region_dma_range *dma_ranges, size_t nr_ranges)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
struct vfio_pci_core_device {
struct vfio_device vdev;
struct pci_dev *pdev;
+ const struct vfio_pci_device_ops *pci_ops;
void __iomem *barmap[PCI_STD_NUM_BARS];
bool bar_mmap_supported[PCI_STD_NUM_BARS];
u8 *pci_config_map;
@@ -83,8 +130,8 @@ struct vfio_pci_core_device {
struct pci_saved_state *pci_saved_state;
struct pci_saved_state *pm_save;
int ioeventfds_nr;
- struct eventfd_ctx *err_trigger;
- struct eventfd_ctx *req_trigger;
+ struct vfio_pci_eventfd __rcu *err_trigger;
+ struct vfio_pci_eventfd __rcu *req_trigger;
struct eventfd_ctx *pm_wake_eventfd_ctx;
struct list_head dummy_resources_list;
struct mutex ioeventfds_lock;
@@ -94,6 +141,7 @@ struct vfio_pci_core_device {
struct vfio_pci_core_device *sriov_pf_core_dev;
struct notifier_block nb;
struct rw_semaphore memory_lock;
+ struct list_head dmabufs;
};
/* Will be exported for vfio pci drivers usage */
@@ -115,10 +163,16 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg);
int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
void __user *arg, size_t argsz);
+int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ struct vfio_region_info *info,
+ struct vfio_info_cap *caps);
ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
size_t count, loff_t *ppos);
ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
size_t count, loff_t *ppos);
+vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev,
+ struct vm_fault *vmf, unsigned long pfn,
+ unsigned int order);
int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma);
void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count);
int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
@@ -134,6 +188,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
void __iomem *io, char __user *buf,
loff_t off, size_t count, size_t x_start,
size_t x_end, bool iswrite);
+bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
loff_t reg_start, size_t reg_cnt,
loff_t *buf_offset,
@@ -161,4 +216,14 @@ VFIO_IOREAD_DECLARATION(32)
VFIO_IOREAD_DECLARATION(64)
#endif
+static inline bool is_aligned_for_order(struct vm_area_struct *vma,
+ unsigned long addr,
+ unsigned long pfn,
+ unsigned int order)
+{
+ return !(order && (addr < vma->vm_start ||
+ addr + (PAGE_SIZE << order) > vma->vm_end ||
+ !IS_ALIGNED(pfn, 1 << order)));
+}
+
#endif /* VFIO_PCI_CORE_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 75100bf009ba..ac2329f24141 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/ioctl.h>
+#include <linux/stddef.h>
#define VFIO_API_VERSION 0
@@ -1478,6 +1479,33 @@ struct vfio_device_feature_bus_master {
};
#define VFIO_DEVICE_FEATURE_BUS_MASTER 10
+/**
+ * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
+ * regions selected.
+ *
+ * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
+ * etc. offset/length specify a slice of the region to create the dmabuf from.
+ * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
+ *
+ * flags should be 0.
+ *
+ * Return: The fd number on success, -1 and errno is set on failure.
+ */
+#define VFIO_DEVICE_FEATURE_DMA_BUF 11
+
+struct vfio_region_dma_range {
+ __u64 offset;
+ __u64 length;
+};
+
+struct vfio_device_feature_dma_buf {
+ __u32 region_index;
+ __u32 open_flags;
+ __u32 flags;
+ __u32 nr_ranges;
+ struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges);
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**