summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- MAINTAINERS | 2
-rw-r--r-- drivers/gpu/drm/i915/gvt/kvmgt.c | 1
-rw-r--r-- drivers/s390/cio/vfio_ccw_chp.c | 5
-rw-r--r-- drivers/s390/cio/vfio_ccw_drv.c | 174
-rw-r--r-- drivers/s390/cio/vfio_ccw_fsm.c | 27
-rw-r--r-- drivers/s390/cio/vfio_ccw_ops.c | 107
-rw-r--r-- drivers/s390/cio/vfio_ccw_private.h | 37
-rw-r--r-- drivers/s390/crypto/vfio_ap_drv.c | 2
-rw-r--r-- drivers/s390/crypto/vfio_ap_ops.c | 6
-rw-r--r-- drivers/vfio/Kconfig | 7
-rw-r--r-- drivers/vfio/Makefile | 5
-rw-r--r-- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 1
-rw-r--r-- drivers/vfio/iova_bitmap.c | 33
-rw-r--r-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 156
-rw-r--r-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 2
-rw-r--r-- drivers/vfio/pci/mlx5/cmd.c | 413
-rw-r--r-- drivers/vfio/pci/mlx5/cmd.h | 96
-rw-r--r-- drivers/vfio/pci/mlx5/main.c | 784
-rw-r--r-- drivers/vfio/pci/vfio_pci_core.c | 15
-rw-r--r-- drivers/vfio/platform/vfio_amba.c | 1
-rw-r--r-- drivers/vfio/platform/vfio_platform.c | 1
-rw-r--r-- drivers/vfio/platform/vfio_platform_common.c | 3
-rw-r--r-- drivers/vfio/vfio.h | 13
-rw-r--r-- drivers/vfio/vfio_iommu_spapr_tce.c | 65
-rw-r--r-- drivers/vfio/vfio_main.c | 145
-rw-r--r-- drivers/vfio/vfio_spapr_eeh.c | 107
-rw-r--r-- drivers/vfio/virqfd.c | 17
-rw-r--r-- include/linux/mlx5/mlx5_ifc.h | 14
-rw-r--r-- include/linux/vfio.h | 31
-rw-r--r-- include/uapi/linux/vfio.h | 136
-rw-r--r-- samples/vfio-mdev/mbochs.c | 8
-rw-r--r-- samples/vfio-mdev/mdpy-fb.c | 8
-rw-r--r-- samples/vfio-mdev/mdpy.c | 8
-rw-r--r-- samples/vfio-mdev/mtty.c | 8
34 files changed, 1792 insertions, 646 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 096ae475e21c..4d75ffe9affa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21781,7 +21781,7 @@ M: Alex Williamson <alex.williamson@redhat.com>
R: Cornelia Huck <cohuck@redhat.com>
L: kvm@vger.kernel.org
S: Maintained
-T: git git://github.com/awilliam/linux-vfio.git
+T: git https://github.com/awilliam/linux-vfio.git
F: Documentation/ABI/testing/sysfs-devices-vfio-dev
F: Documentation/driver-api/vfio.rst
F: drivers/vfio/
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 539447eda665..f5451adcd489 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1465,7 +1465,6 @@ static void intel_vgpu_release_dev(struct vfio_device *vfio_dev)
struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev);
intel_gvt_destroy_vgpu(vgpu);
- vfio_free_device(vfio_dev);
}
static const struct vfio_device_ops intel_vgpu_dev_ops = {
diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c
index 13b26a1c7988..d3f3a611f95b 100644
--- a/drivers/s390/cio/vfio_ccw_chp.c
+++ b/drivers/s390/cio/vfio_ccw_chp.c
@@ -16,6 +16,7 @@ static ssize_t vfio_ccw_schib_region_read(struct vfio_ccw_private *private,
char __user *buf, size_t count,
loff_t *ppos)
{
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS;
loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK;
struct ccw_schib_region *region;
@@ -27,12 +28,12 @@ static ssize_t vfio_ccw_schib_region_read(struct vfio_ccw_private *private,
mutex_lock(&private->io_mutex);
region = private->region[i].data;
- if (cio_update_schib(private->sch)) {
+ if (cio_update_schib(sch)) {
ret = -ENODEV;
goto out;
}
- memcpy(region, &private->sch->schib, sizeof(*region));
+ memcpy(region, &sch->schib, sizeof(*region));
if (copy_to_user(buf, (void *)region + pos, count)) {
ret = -EFAULT;
diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c
index 7f5402fe857a..54aba7cceb33 100644
--- a/drivers/s390/cio/vfio_ccw_drv.c
+++ b/drivers/s390/cio/vfio_ccw_drv.c
@@ -23,10 +23,10 @@
#include "vfio_ccw_private.h"
struct workqueue_struct *vfio_ccw_work_q;
-static struct kmem_cache *vfio_ccw_io_region;
-static struct kmem_cache *vfio_ccw_cmd_region;
-static struct kmem_cache *vfio_ccw_schib_region;
-static struct kmem_cache *vfio_ccw_crw_region;
+struct kmem_cache *vfio_ccw_io_region;
+struct kmem_cache *vfio_ccw_cmd_region;
+struct kmem_cache *vfio_ccw_schib_region;
+struct kmem_cache *vfio_ccw_crw_region;
debug_info_t *vfio_ccw_debug_msg_id;
debug_info_t *vfio_ccw_debug_trace_id;
@@ -36,10 +36,19 @@ debug_info_t *vfio_ccw_debug_trace_id;
*/
int vfio_ccw_sch_quiesce(struct subchannel *sch)
{
- struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev);
DECLARE_COMPLETION_ONSTACK(completion);
int iretry, ret = 0;
+ /*
+ * Probably an impossible situation, after being called through
+ * FSM callbacks. But in the event it did, register a warning
+ * and return as if things were fine.
+ */
+ if (WARN_ON(!private))
+ return 0;
+
iretry = 255;
do {
@@ -70,7 +79,7 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch)
return ret;
}
-static void vfio_ccw_sch_io_todo(struct work_struct *work)
+void vfio_ccw_sch_io_todo(struct work_struct *work)
{
struct vfio_ccw_private *private;
struct irb *irb;
@@ -106,7 +115,7 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work)
eventfd_signal(private->io_trigger, 1);
}
-static void vfio_ccw_crw_todo(struct work_struct *work)
+void vfio_ccw_crw_todo(struct work_struct *work)
{
struct vfio_ccw_private *private;
@@ -121,90 +130,39 @@ static void vfio_ccw_crw_todo(struct work_struct *work)
*/
static void vfio_ccw_sch_irq(struct subchannel *sch)
{
- struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev);
+
+ /*
+ * The subchannel should still be disabled at this point,
+ * so an interrupt would be quite surprising. As with an
+ * interrupt while the FSM is closed, let's attempt to
+ * disable the subchannel again.
+ */
+ if (!private) {
+ VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: unexpected interrupt\n",
+ sch->schid.cssid, sch->schid.ssid,
+ sch->schid.sch_no);
+
+ cio_disable_subchannel(sch);
+ return;
+ }
inc_irq_stat(IRQIO_CIO);
vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT);
}
-static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch)
+static void vfio_ccw_free_parent(struct device *dev)
{
- struct vfio_ccw_private *private;
+ struct vfio_ccw_parent *parent = container_of(dev, struct vfio_ccw_parent, dev);
- private = kzalloc(sizeof(*private), GFP_KERNEL);
- if (!private)
- return ERR_PTR(-ENOMEM);
-
- private->sch = sch;
- mutex_init(&private->io_mutex);
- private->state = VFIO_CCW_STATE_STANDBY;
- INIT_LIST_HEAD(&private->crw);
- INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo);
- INIT_WORK(&private->crw_work, vfio_ccw_crw_todo);
-
- private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1),
- GFP_KERNEL);
- if (!private->cp.guest_cp)
- goto out_free_private;
-
- private->io_region = kmem_cache_zalloc(vfio_ccw_io_region,
- GFP_KERNEL | GFP_DMA);
- if (!private->io_region)
- goto out_free_cp;
-
- private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region,
- GFP_KERNEL | GFP_DMA);
- if (!private->cmd_region)
- goto out_free_io;
-
- private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region,
- GFP_KERNEL | GFP_DMA);
-
- if (!private->schib_region)
- goto out_free_cmd;
-
- private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region,
- GFP_KERNEL | GFP_DMA);
-
- if (!private->crw_region)
- goto out_free_schib;
- return private;
-
-out_free_schib:
- kmem_cache_free(vfio_ccw_schib_region, private->schib_region);
-out_free_cmd:
- kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region);
-out_free_io:
- kmem_cache_free(vfio_ccw_io_region, private->io_region);
-out_free_cp:
- kfree(private->cp.guest_cp);
-out_free_private:
- mutex_destroy(&private->io_mutex);
- kfree(private);
- return ERR_PTR(-ENOMEM);
+ kfree(parent);
}
-static void vfio_ccw_free_private(struct vfio_ccw_private *private)
-{
- struct vfio_ccw_crw *crw, *temp;
-
- list_for_each_entry_safe(crw, temp, &private->crw, next) {
- list_del(&crw->next);
- kfree(crw);
- }
-
- kmem_cache_free(vfio_ccw_crw_region, private->crw_region);
- kmem_cache_free(vfio_ccw_schib_region, private->schib_region);
- kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region);
- kmem_cache_free(vfio_ccw_io_region, private->io_region);
- kfree(private->cp.guest_cp);
- mutex_destroy(&private->io_mutex);
- kfree(private);
-}
static int vfio_ccw_sch_probe(struct subchannel *sch)
{
struct pmcw *pmcw = &sch->schib.pmcw;
- struct vfio_ccw_private *private;
+ struct vfio_ccw_parent *parent;
int ret = -ENOMEM;
if (pmcw->qf) {
@@ -213,42 +171,50 @@ static int vfio_ccw_sch_probe(struct subchannel *sch)
return -ENODEV;
}
- private = vfio_ccw_alloc_private(sch);
- if (IS_ERR(private))
- return PTR_ERR(private);
+ parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+ if (!parent)
+ return -ENOMEM;
+
+ dev_set_name(&parent->dev, "parent");
+ parent->dev.parent = &sch->dev;
+ parent->dev.release = &vfio_ccw_free_parent;
+ ret = device_register(&parent->dev);
+ if (ret)
+ goto out_free;
- dev_set_drvdata(&sch->dev, private);
+ dev_set_drvdata(&sch->dev, parent);
- private->mdev_type.sysfs_name = "io";
- private->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)";
- private->mdev_types[0] = &private->mdev_type;
- ret = mdev_register_parent(&private->parent, &sch->dev,
+ parent->mdev_type.sysfs_name = "io";
+ parent->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)";
+ parent->mdev_types[0] = &parent->mdev_type;
+ ret = mdev_register_parent(&parent->parent, &sch->dev,
&vfio_ccw_mdev_driver,
- private->mdev_types, 1);
+ parent->mdev_types, 1);
if (ret)
- goto out_free;
+ goto out_unreg;
VFIO_CCW_MSG_EVENT(4, "bound to subchannel %x.%x.%04x\n",
sch->schid.cssid, sch->schid.ssid,
sch->schid.sch_no);
return 0;
+out_unreg:
+ device_del(&parent->dev);
out_free:
+ put_device(&parent->dev);
dev_set_drvdata(&sch->dev, NULL);
- vfio_ccw_free_private(private);
return ret;
}
static void vfio_ccw_sch_remove(struct subchannel *sch)
{
- struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev);
- mdev_unregister_parent(&private->parent);
+ mdev_unregister_parent(&parent->parent);
+ device_unregister(&parent->dev);
dev_set_drvdata(&sch->dev, NULL);
- vfio_ccw_free_private(private);
-
VFIO_CCW_MSG_EVENT(4, "unbound from subchannel %x.%x.%04x\n",
sch->schid.cssid, sch->schid.ssid,
sch->schid.sch_no);
@@ -256,7 +222,11 @@ static void vfio_ccw_sch_remove(struct subchannel *sch)
static void vfio_ccw_sch_shutdown(struct subchannel *sch)
{
- struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev);
+
+ if (WARN_ON(!private))
+ return;
vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_CLOSE);
vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER);
@@ -274,7 +244,8 @@ static void vfio_ccw_sch_shutdown(struct subchannel *sch)
*/
static int vfio_ccw_sch_event(struct subchannel *sch, int process)
{
- struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev);
unsigned long flags;
int rc = -EAGAIN;
@@ -287,8 +258,10 @@ static int vfio_ccw_sch_event(struct subchannel *sch, int process)
rc = 0;
- if (cio_update_schib(sch))
- vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER);
+ if (cio_update_schib(sch)) {
+ if (private)
+ vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER);
+ }
out_unlock:
spin_unlock_irqrestore(sch->lock, flags);
@@ -326,14 +299,15 @@ static void vfio_ccw_queue_crw(struct vfio_ccw_private *private,
static int vfio_ccw_chp_event(struct subchannel *sch,
struct chp_link *link, int event)
{
- struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev);
int mask = chp_ssd_get_mask(&sch->ssd_info, link);
int retry = 255;
if (!private || !mask)
return 0;
- trace_vfio_ccw_chp_event(private->sch->schid, mask, event);
+ trace_vfio_ccw_chp_event(sch->schid, mask, event);
VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: mask=0x%x event=%d\n",
sch->schid.cssid,
sch->schid.ssid, sch->schid.sch_no,
diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c
index 0a5e8b4a6743..2784a4e4d2be 100644
--- a/drivers/s390/cio/vfio_ccw_fsm.c
+++ b/drivers/s390/cio/vfio_ccw_fsm.c
@@ -18,15 +18,13 @@
static int fsm_io_helper(struct vfio_ccw_private *private)
{
- struct subchannel *sch;
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
union orb *orb;
int ccode;
__u8 lpm;
unsigned long flags;
int ret;
- sch = private->sch;
-
spin_lock_irqsave(sch->lock, flags);
orb = cp_get_orb(&private->cp, (u32)virt_to_phys(sch), sch->lpm);
@@ -80,13 +78,11 @@ out:
static int fsm_do_halt(struct vfio_ccw_private *private)
{
- struct subchannel *sch;
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
unsigned long flags;
int ccode;
int ret;
- sch = private->sch;
-
spin_lock_irqsave(sch->lock, flags);
VFIO_CCW_TRACE_EVENT(2, "haltIO");
@@ -121,13 +117,11 @@ static int fsm_do_halt(struct vfio_ccw_private *private)
static int fsm_do_clear(struct vfio_ccw_private *private)
{
- struct subchannel *sch;
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
unsigned long flags;
int ccode;
int ret;
- sch = private->sch;
-
spin_lock_irqsave(sch->lock, flags);
VFIO_CCW_TRACE_EVENT(2, "clearIO");
@@ -160,7 +154,7 @@ static int fsm_do_clear(struct vfio_ccw_private *private)
static void fsm_notoper(struct vfio_ccw_private *private,
enum vfio_ccw_event event)
{
- struct subchannel *sch = private->sch;
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: notoper event %x state %x\n",
sch->schid.cssid,
@@ -228,7 +222,7 @@ static void fsm_async_retry(struct vfio_ccw_private *private,
static void fsm_disabled_irq(struct vfio_ccw_private *private,
enum vfio_ccw_event event)
{
- struct subchannel *sch = private->sch;
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
/*
* An interrupt in a disabled state means a previous disable was not
@@ -238,7 +232,9 @@ static void fsm_disabled_irq(struct vfio_ccw_private *private,
}
inline struct subchannel_id get_schid(struct vfio_ccw_private *p)
{
- return p->sch->schid;
+ struct subchannel *sch = to_subchannel(p->vdev.dev->parent);
+
+ return sch->schid;
}
/*
@@ -360,10 +356,11 @@ static void fsm_async_request(struct vfio_ccw_private *private,
static void fsm_irq(struct vfio_ccw_private *private,
enum vfio_ccw_event event)
{
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
struct irb *irb = this_cpu_ptr(&cio_irb);
VFIO_CCW_TRACE_EVENT(6, "IRQ");
- VFIO_CCW_TRACE_EVENT(6, dev_name(&private->sch->dev));
+ VFIO_CCW_TRACE_EVENT(6, dev_name(&sch->dev));
memcpy(&private->irb, irb, sizeof(*irb));
@@ -376,7 +373,7 @@ static void fsm_irq(struct vfio_ccw_private *private,
static void fsm_open(struct vfio_ccw_private *private,
enum vfio_ccw_event event)
{
- struct subchannel *sch = private->sch;
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
int ret;
spin_lock_irq(sch->lock);
@@ -397,7 +394,7 @@ err_unlock:
static void fsm_close(struct vfio_ccw_private *private,
enum vfio_ccw_event event)
{
- struct subchannel *sch = private->sch;
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
int ret;
spin_lock_irq(sch->lock);
diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
index 560453d99c24..5b53b94f13c7 100644
--- a/drivers/s390/cio/vfio_ccw_ops.c
+++ b/drivers/s390/cio/vfio_ccw_ops.c
@@ -49,26 +49,70 @@ static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev)
struct vfio_ccw_private *private =
container_of(vdev, struct vfio_ccw_private, vdev);
- init_completion(&private->release_comp);
+ mutex_init(&private->io_mutex);
+ private->state = VFIO_CCW_STATE_STANDBY;
+ INIT_LIST_HEAD(&private->crw);
+ INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo);
+ INIT_WORK(&private->crw_work, vfio_ccw_crw_todo);
+
+ private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1),
+ GFP_KERNEL);
+ if (!private->cp.guest_cp)
+ goto out_free_private;
+
+ private->io_region = kmem_cache_zalloc(vfio_ccw_io_region,
+ GFP_KERNEL | GFP_DMA);
+ if (!private->io_region)
+ goto out_free_cp;
+
+ private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region,
+ GFP_KERNEL | GFP_DMA);
+ if (!private->cmd_region)
+ goto out_free_io;
+
+ private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region,
+ GFP_KERNEL | GFP_DMA);
+ if (!private->schib_region)
+ goto out_free_cmd;
+
+ private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region,
+ GFP_KERNEL | GFP_DMA);
+ if (!private->crw_region)
+ goto out_free_schib;
+
return 0;
+
+out_free_schib:
+ kmem_cache_free(vfio_ccw_schib_region, private->schib_region);
+out_free_cmd:
+ kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region);
+out_free_io:
+ kmem_cache_free(vfio_ccw_io_region, private->io_region);
+out_free_cp:
+ kfree(private->cp.guest_cp);
+out_free_private:
+ mutex_destroy(&private->io_mutex);
+ return -ENOMEM;
}
static int vfio_ccw_mdev_probe(struct mdev_device *mdev)
{
- struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent);
+ struct subchannel *sch = to_subchannel(mdev->dev.parent);
+ struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_private *private;
int ret;
- if (private->state == VFIO_CCW_STATE_NOT_OPER)
- return -ENODEV;
+ private = vfio_alloc_device(vfio_ccw_private, vdev, &mdev->dev,
+ &vfio_ccw_dev_ops);
+ if (IS_ERR(private))
+ return PTR_ERR(private);
- ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops);
- if (ret)
- return ret;
+ dev_set_drvdata(&parent->dev, private);
VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n",
- private->sch->schid.cssid,
- private->sch->schid.ssid,
- private->sch->schid.sch_no);
+ sch->schid.cssid,
+ sch->schid.ssid,
+ sch->schid.sch_no);
ret = vfio_register_emulated_iommu_dev(&private->vdev);
if (ret)
@@ -77,6 +121,7 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev)
return 0;
err_put_vdev:
+ dev_set_drvdata(&parent->dev, NULL);
vfio_put_device(&private->vdev);
return ret;
}
@@ -85,40 +130,36 @@ static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev)
{
struct vfio_ccw_private *private =
container_of(vdev, struct vfio_ccw_private, vdev);
+ struct vfio_ccw_crw *crw, *temp;
- /*
- * We cannot free vfio_ccw_private here because it includes
- * parent info which must be free'ed by css driver.
- *
- * Use a workaround by memset'ing the core device part and
- * then notifying the remove path that all active references
- * to this device have been released.
- */
- memset(vdev, 0, sizeof(*vdev));
- complete(&private->release_comp);
+ list_for_each_entry_safe(crw, temp, &private->crw, next) {
+ list_del(&crw->next);
+ kfree(crw);
+ }
+
+ kmem_cache_free(vfio_ccw_crw_region, private->crw_region);
+ kmem_cache_free(vfio_ccw_schib_region, private->schib_region);
+ kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region);
+ kmem_cache_free(vfio_ccw_io_region, private->io_region);
+ kfree(private->cp.guest_cp);
+ mutex_destroy(&private->io_mutex);
}
static void vfio_ccw_mdev_remove(struct mdev_device *mdev)
{
- struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent);
+ struct subchannel *sch = to_subchannel(mdev->dev.parent);
+ struct vfio_ccw_parent *parent = dev_get_drvdata(&sch->dev);
+ struct vfio_ccw_private *private = dev_get_drvdata(&parent->dev);
VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: remove\n",
- private->sch->schid.cssid,
- private->sch->schid.ssid,
- private->sch->schid.sch_no);
+ sch->schid.cssid,
+ sch->schid.ssid,
+ sch->schid.sch_no);
vfio_unregister_group_dev(&private->vdev);
+ dev_set_drvdata(&parent->dev, NULL);
vfio_put_device(&private->vdev);
- /*
- * Wait for all active references on mdev are released so it
- * is safe to defer kfree() to a later point.
- *
- * TODO: the clean fix is to split parent/mdev info from ccw
- * private structure so each can be managed in its own life
- * cycle.
- */
- wait_for_completion(&private->release_comp);
}
static int vfio_ccw_mdev_open_device(struct vfio_device *vdev)
diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h
index bd5fb81456af..b441ae6700fd 100644
--- a/drivers/s390/cio/vfio_ccw_private.h
+++ b/drivers/s390/cio/vfio_ccw_private.h
@@ -68,9 +68,23 @@ struct vfio_ccw_crw {
};
/**
+ * struct vfio_ccw_parent
+ *
+ * @dev: embedded device struct
+ * @parent: parent data structures for mdevs created
+ * @mdev_type(s): identifying information for mdevs created
+ */
+struct vfio_ccw_parent {
+ struct device dev;
+
+ struct mdev_parent parent;
+ struct mdev_type mdev_type;
+ struct mdev_type *mdev_types[1];
+};
+
+/**
* struct vfio_ccw_private
* @vdev: Embedded VFIO device
- * @sch: pointer to the subchannel
* @state: internal state of the device
* @completion: synchronization helper of the I/O completion
* @io_region: MMIO region to input/output I/O arguments/results
@@ -88,12 +102,9 @@ struct vfio_ccw_crw {
* @req_trigger: eventfd ctx for signaling userspace to return device
* @io_work: work for deferral process of I/O handling
* @crw_work: work for deferral process of CRW handling
- * @release_comp: synchronization helper for vfio device release
- * @parent: parent data structures for mdevs created
*/
struct vfio_ccw_private {
struct vfio_device vdev;
- struct subchannel *sch;
int state;
struct completion *completion;
struct ccw_io_region *io_region;
@@ -114,15 +125,11 @@ struct vfio_ccw_private {
struct eventfd_ctx *req_trigger;
struct work_struct io_work;
struct work_struct crw_work;
-
- struct completion release_comp;
-
- struct mdev_parent parent;
- struct mdev_type mdev_type;
- struct mdev_type *mdev_types[1];
} __aligned(8);
int vfio_ccw_sch_quiesce(struct subchannel *sch);
+void vfio_ccw_sch_io_todo(struct work_struct *work);
+void vfio_ccw_crw_todo(struct work_struct *work);
extern struct mdev_driver vfio_ccw_mdev_driver;
@@ -162,12 +169,18 @@ extern fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS];
static inline void vfio_ccw_fsm_event(struct vfio_ccw_private *private,
enum vfio_ccw_event event)
{
- trace_vfio_ccw_fsm_event(private->sch->schid, private->state, event);
+ struct subchannel *sch = to_subchannel(private->vdev.dev->parent);
+
+ if (sch)
+ trace_vfio_ccw_fsm_event(sch->schid, private->state, event);
vfio_ccw_jumptable[private->state][event](private, event);
}
extern struct workqueue_struct *vfio_ccw_work_q;
-
+extern struct kmem_cache *vfio_ccw_io_region;
+extern struct kmem_cache *vfio_ccw_cmd_region;
+extern struct kmem_cache *vfio_ccw_schib_region;
+extern struct kmem_cache *vfio_ccw_crw_region;
/* s390 debug feature, similar to base cio */
extern debug_info_t *vfio_ccw_debug_msg_id;
diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
index f43cfeabd2cc..997b524bdd2b 100644
--- a/drivers/s390/crypto/vfio_ap_drv.c
+++ b/drivers/s390/crypto/vfio_ap_drv.c
@@ -122,7 +122,7 @@ static int vfio_ap_matrix_dev_create(void)
return 0;
matrix_drv_err:
- device_unregister(&matrix_dev->device);
+ device_del(&matrix_dev->device);
matrix_reg_err:
put_device(&matrix_dev->device);
matrix_alloc_err:
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 9720aed2ac27..9c01957e56b3 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -765,11 +765,6 @@ static void vfio_ap_mdev_unlink_fr_queues(struct ap_matrix_mdev *matrix_mdev)
}
}
-static void vfio_ap_mdev_release_dev(struct vfio_device *vdev)
-{
- vfio_free_device(vdev);
-}
-
static void vfio_ap_mdev_remove(struct mdev_device *mdev)
{
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev);
@@ -1800,7 +1795,6 @@ static const struct attribute_group vfio_queue_attr_group = {
static const struct vfio_device_ops vfio_ap_matrix_dev_ops = {
.init = vfio_ap_mdev_init_dev,
- .release = vfio_ap_mdev_release_dev,
.open_device = vfio_ap_mdev_open_device,
.close_device = vfio_ap_mdev_close_device,
.ioctl = vfio_ap_mdev_ioctl,
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 286c1663bd75..a8f544629467 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -48,13 +48,8 @@ config VFIO_NOIOMMU
If you don't know what to do here, say N.
endif
-config VFIO_SPAPR_EEH
- tristate
- depends on EEH && VFIO_IOMMU_SPAPR_TCE
- default VFIO
-
config VFIO_VIRQFD
- tristate
+ bool
select EVENTFD
default n
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 3783db7e8082..70e7dcb302ef 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-vfio_virqfd-y := virqfd.o
-
obj-$(CONFIG_VFIO) += vfio.o
vfio-y += vfio_main.o \
@@ -8,11 +6,10 @@ vfio-y += vfio_main.o \
iova_bitmap.o
vfio-$(CONFIG_IOMMUFD) += iommufd.o
vfio-$(CONFIG_VFIO_CONTAINER) += container.o
+vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o
-obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o
obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
-obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
obj-$(CONFIG_VFIO_PCI) += pci/
obj-$(CONFIG_VFIO_PLATFORM) += platform/
obj-$(CONFIG_VFIO_MDEV) += mdev/
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
index 5cd4bb476440..defeb8510ace 100644
--- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
@@ -568,7 +568,6 @@ static void vfio_fsl_mc_release_dev(struct vfio_device *core_vdev)
vfio_fsl_uninit_device(vdev);
mutex_destroy(&vdev->igate);
- vfio_free_device(core_vdev);
}
static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev)
diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c
index 6631e8befe1b..0848f920efb7 100644
--- a/drivers/vfio/iova_bitmap.c
+++ b/drivers/vfio/iova_bitmap.c
@@ -5,6 +5,7 @@
*/
#include <linux/iova_bitmap.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/highmem.h>
#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
@@ -295,11 +296,13 @@ void iova_bitmap_free(struct iova_bitmap *bitmap)
*/
static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap)
{
- unsigned long remaining;
+ unsigned long remaining, bytes;
+
+ bytes = (bitmap->mapped.npages << PAGE_SHIFT) - bitmap->mapped.pgoff;
remaining = bitmap->mapped_total_index - bitmap->mapped_base_index;
remaining = min_t(unsigned long, remaining,
- (bitmap->mapped.npages << PAGE_SHIFT) / sizeof(*bitmap->bitmap));
+ bytes / sizeof(*bitmap->bitmap));
return remaining;
}
@@ -394,29 +397,27 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
* Set the bits corresponding to the range [iova .. iova+length-1] in
* the user bitmap.
*
- * Return: The number of bits set.
*/
void iova_bitmap_set(struct iova_bitmap *bitmap,
unsigned long iova, size_t length)
{
struct iova_bitmap_map *mapped = &bitmap->mapped;
- unsigned long offset = (iova - mapped->iova) >> mapped->pgshift;
- unsigned long nbits = max_t(unsigned long, 1, length >> mapped->pgshift);
- unsigned long page_idx = offset / BITS_PER_PAGE;
- unsigned long page_offset = mapped->pgoff;
- void *kaddr;
-
- offset = offset % BITS_PER_PAGE;
+ unsigned long cur_bit = ((iova - mapped->iova) >>
+ mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+ unsigned long last_bit = (((iova + length - 1) - mapped->iova) >>
+ mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
do {
- unsigned long size = min(BITS_PER_PAGE - offset, nbits);
+ unsigned int page_idx = cur_bit / BITS_PER_PAGE;
+ unsigned int offset = cur_bit % BITS_PER_PAGE;
+ unsigned int nbits = min(BITS_PER_PAGE - offset,
+ last_bit - cur_bit + 1);
+ void *kaddr;
kaddr = kmap_local_page(mapped->pages[page_idx]);
- bitmap_set(kaddr + page_offset, offset, size);
+ bitmap_set(kaddr, offset, nbits);
kunmap_local(kaddr);
- page_offset = offset = 0;
- nbits -= size;
- page_idx++;
- } while (nbits > 0);
+ cur_bit += nbits;
+ } while (cur_bit <= last_bit);
}
EXPORT_SYMBOL_GPL(iova_bitmap_set);
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 40019b11c5a9..0bba3b05c6c7 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -360,8 +360,8 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
u32 que_iso_state;
int ret;
- if (migf->total_length < QM_MATCH_SIZE)
- return -EINVAL;
+ if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done)
+ return 0;
if (vf_data->acc_magic != ACC_DEV_MAGIC) {
dev_err(dev, "failed to match ACC_DEV_MAGIC\n");
@@ -406,6 +406,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
}
hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+ hisi_acc_vdev->match_done = true;
return 0;
}
@@ -493,10 +494,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
struct device *dev = &vf_qm->pdev->dev;
int ret;
- ret = vf_qm_get_match_data(hisi_acc_vdev, vf_data);
- if (ret)
- return ret;
-
if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
/* Update state and return with match data */
vf_data->vf_qm_state = QM_NOT_READY;
@@ -673,12 +670,6 @@ static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev)
struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf;
int ret;
- /* Check dev compatibility */
- ret = vf_qm_check_match(hisi_acc_vdev, migf);
- if (ret) {
- dev_err(dev, "failed to match the VF!\n");
- return ret;
- }
/* Recover data to VF */
ret = vf_qm_load_data(hisi_acc_vdev, migf);
if (ret) {
@@ -732,6 +723,10 @@ static ssize_t hisi_acc_vf_resume_write(struct file *filp, const char __user *bu
*pos += len;
done = len;
migf->total_length += len;
+
+ ret = vf_qm_check_match(migf->hisi_acc_vdev, migf);
+ if (ret)
+ done = -EFAULT;
out_unlock:
mutex_unlock(&migf->lock);
return done;
@@ -764,9 +759,58 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev)
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
+ migf->hisi_acc_vdev = hisi_acc_vdev;
return migf;
}
+static long hisi_acc_vf_precopy_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct hisi_acc_vf_migration_file *migf = filp->private_data;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev;
+ loff_t *pos = &filp->f_pos;
+ struct vfio_precopy_info info;
+ unsigned long minsz;
+ int ret;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&hisi_acc_vdev->state_mutex);
+ if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) {
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
+ return -EINVAL;
+ }
+
+ mutex_lock(&migf->lock);
+
+ if (migf->disabled) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (*pos > migf->total_length) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ info.dirty_bytes = 0;
+ info.initial_bytes = migf->total_length - *pos;
+
+ ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+out:
+ mutex_unlock(&migf->lock);
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
+ return ret;
+}
+
static ssize_t hisi_acc_vf_save_read(struct file *filp, char __user *buf, size_t len,
loff_t *pos)
{
@@ -807,12 +851,14 @@ out_unlock:
static const struct file_operations hisi_acc_vf_save_fops = {
.owner = THIS_MODULE,
.read = hisi_acc_vf_save_read,
+ .unlocked_ioctl = hisi_acc_vf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.release = hisi_acc_vf_release_file,
.llseek = no_llseek,
};
static struct hisi_acc_vf_migration_file *
-hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
struct hisi_acc_vf_migration_file *migf;
int ret;
@@ -832,8 +878,9 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev)
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
+ migf->hisi_acc_vdev = hisi_acc_vdev;
- ret = vf_qm_state_save(hisi_acc_vdev, migf);
+ ret = vf_qm_get_match_data(hisi_acc_vdev, &migf->vf_data);
if (ret) {
fput(migf->filp);
return ERR_PTR(ret);
@@ -842,6 +889,44 @@ hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev)
return migf;
}
+/*
+ * Open the saving migration file for the PRE_COPY phase and set its readable
+ * length to QM_MATCH_SIZE. An ERR_PTR from the open helper is passed through
+ * unchanged.
+ */
+static struct hisi_acc_vf_migration_file *
+hisi_acc_vf_pre_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+	struct hisi_acc_vf_migration_file *saving;
+
+	saving = hisi_acc_open_saving_migf(hisi_acc_vdev);
+	if (!IS_ERR(saving))
+		saving->total_length = QM_MATCH_SIZE;
+
+	return saving;
+}
+
+/*
+ * Save the device state into the saving migration file for STOP_COPY.
+ *
+ * @open: true when userspace did not go through PRE_COPY, so no saving file
+ *        exists yet and one is opened here; false to reuse
+ *        hisi_acc_vdev->saving_migf.
+ *
+ * Returns the newly opened file when @open is true, NULL when the existing
+ * file was reused, or an ERR_PTR on failure.
+ */
+static struct hisi_acc_vf_migration_file *
+hisi_acc_vf_stop_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, bool open)
+{
+	int ret;
+	struct hisi_acc_vf_migration_file *migf = NULL;
+
+	if (open) {
+		/*
+		 * Userspace didn't use PRECOPY support. Hence saving_migf
+		 * is not opened yet.
+		 */
+		migf = hisi_acc_open_saving_migf(hisi_acc_vdev);
+		if (IS_ERR(migf))
+			return migf;
+	} else {
+		migf = hisi_acc_vdev->saving_migf;
+	}
+
+	ret = vf_qm_state_save(hisi_acc_vdev, migf);
+	if (ret) {
+		/*
+		 * Only the file opened above is ours to drop; without this
+		 * it is never released, since the caller only sees the
+		 * error. A reused saving_migf stays with the device.
+		 */
+		if (open)
+			fput(migf->filp);
+		return ERR_PTR(ret);
+	}
+
+	return open ? migf : NULL;
+}
+
static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
struct device *dev = &hisi_acc_vdev->vf_dev->dev;
@@ -869,6 +954,31 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev,
u32 cur = hisi_acc_vdev->mig_state;
int ret;
+ if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) {
+ struct hisi_acc_vf_migration_file *migf;
+
+ migf = hisi_acc_vf_pre_copy(hisi_acc_vdev);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ hisi_acc_vdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct hisi_acc_vf_migration_file *migf;
+
+ ret = hisi_acc_vf_stop_device(hisi_acc_vdev);
+ if (ret)
+ return ERR_PTR(ret);
+
+ migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, false);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+
+ return NULL;
+ }
+
if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) {
ret = hisi_acc_vf_stop_device(hisi_acc_vdev);
if (ret)
@@ -879,7 +989,7 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev,
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
struct hisi_acc_vf_migration_file *migf;
- migf = hisi_acc_vf_stop_copy(hisi_acc_vdev);
+ migf = hisi_acc_vf_stop_copy(hisi_acc_vdev, true);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
@@ -911,6 +1021,11 @@ hisi_acc_vf_set_device_state(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return NULL;
}
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) {
+ hisi_acc_vf_disable_fds(hisi_acc_vdev);
+ return NULL;
+ }
+
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) {
hisi_acc_vf_start_device(hisi_acc_vdev);
return NULL;
@@ -958,6 +1073,14 @@ hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev,
}
static int
+hisi_acc_vfio_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ *stop_copy_length = sizeof(struct acc_vf_data);
+ return 0;
+}
+
+static int
hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state *curr_state)
{
@@ -1213,6 +1336,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = {
.migration_set_state = hisi_acc_vfio_pci_set_device_state,
.migration_get_state = hisi_acc_vfio_pci_get_device_state,
+ .migration_get_data_size = hisi_acc_vfio_pci_get_data_size,
};
static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
@@ -1227,7 +1351,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
hisi_acc_vdev->vf_dev = pdev;
mutex_init(&hisi_acc_vdev->state_mutex);
- core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY;
+ core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY;
core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops;
return vfio_pci_core_init_dev(core_vdev);
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index 67343325b320..dcabfeec6ca1 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -91,12 +91,14 @@ struct hisi_acc_vf_migration_file {
struct mutex lock;
bool disabled;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev;
struct acc_vf_data vf_data;
size_t total_length;
};
struct hisi_acc_vf_core_device {
struct vfio_pci_core_device core_device;
+ u8 match_done:1;
u8 deferred_reset:1;
/* For migration state */
struct mutex state_mutex;
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index c604b70437a5..64e68d13cb98 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -14,18 +14,36 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
+ struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
+ int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
+ /*
+ * In case PRE_COPY is used, saving_migf is exposed while the device is
+ * running. Make sure to run only once there is no active save command.
+ * Running both in parallel, might end-up with a failure in the save
+ * command once it will try to turn on 'tracking' on a suspended device.
+ */
+ if (migf) {
+ err = wait_for_completion_interruptible(&migf->save_comp);
+ if (err)
+ return err;
+ }
+
MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
- return mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+ err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
+ if (migf)
+ complete(&migf->save_comp);
+
+ return err;
}
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
@@ -45,23 +63,54 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
}
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size)
+ size_t *state_size, u8 query_flags)
{
u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
+ bool inc = query_flags & MLX5VF_QUERY_INC;
int ret;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
+ /*
+ * In case PRE_COPY is used, saving_migf is exposed while device is
+ * running. Make sure to run only once there is no active save command.
+ * Running both in parallel, might end-up with a failure in the
+ * incremental query command on un-tracked vhca.
+ */
+ if (inc) {
+ ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
+ if (ret)
+ return ret;
+ if (mvdev->saving_migf->state ==
+ MLX5_MIGF_STATE_PRE_COPY_ERROR) {
+ /*
+ * In case we had a PRE_COPY error, only query full
+ * image for final image
+ */
+ if (!(query_flags & MLX5VF_QUERY_FINAL)) {
+ *state_size = 0;
+ complete(&mvdev->saving_migf->save_comp);
+ return 0;
+ }
+ query_flags &= ~MLX5VF_QUERY_INC;
+ }
+ }
+
MLX5_SET(query_vhca_migration_state_in, in, opcode,
MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
+ MLX5_SET(query_vhca_migration_state_in, in, incremental,
+ query_flags & MLX5VF_QUERY_INC);
ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
out);
+ if (inc)
+ complete(&mvdev->saving_migf->save_comp);
+
if (ret)
return ret;
@@ -173,6 +222,11 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
mvdev->core_device.vdev.log_ops = log_ops;
+ if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
+ MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
+ mvdev->core_device.vdev.migration_flags |=
+ VFIO_MIGRATION_PRE_COPY;
+
end:
mlx5_vf_put_core_dev(mvdev->mdev);
}
@@ -210,11 +264,11 @@ err_exec:
}
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
- struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf,
struct mlx5_vhca_recv_buf *recv_buf,
u32 *mkey)
{
- size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) :
+ size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
recv_buf->npages;
int err = 0, inlen;
__be64 *mtt;
@@ -232,10 +286,10 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
DIV_ROUND_UP(npages, 2));
mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
- if (migf) {
+ if (buf) {
struct sg_dma_page_iter dma_iter;
- for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0)
+ for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
} else {
int i;
@@ -255,35 +309,195 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
MLX5_SET(mkc, mkc, qpn, 0xffffff);
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
- MLX5_SET64(mkc, mkc, len,
- migf ? migf->total_length : (npages * PAGE_SIZE));
+ MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
kvfree(in);
return err;
}
+/*
+ * DMA-map the buffer's scatter table in buf->dma_dir and create an mkey for
+ * it, then mark the buffer as dmaed.
+ *
+ * Must be called with the device state_mutex held. Returns -ENOTCONN when
+ * the mlx5 device is detached, -EINVAL when the buffer is already mapped or
+ * has no pages, or the error from the mapping / mkey creation.
+ */
+static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	struct mlx5_vf_migration_file *migf = buf->migf;
+	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+	struct mlx5_core_dev *mdev = mvdev->mdev;
+	int err;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (mvdev->mdev_detach)
+		return -ENOTCONN;
+
+	if (!buf->allocated_length || buf->dmaed)
+		return -EINVAL;
+
+	err = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+	if (err)
+		return err;
+
+	err = _create_mkey(mdev, migf->pdn, buf, NULL, &buf->mkey);
+	if (err) {
+		/* undo the mapping, the buffer stays un-dmaed */
+		dma_unmap_sgtable(mdev->device, &buf->table.sgt,
+				  buf->dma_dir, 0);
+		return err;
+	}
+
+	buf->dmaed = true;
+	return 0;
+}
+
+/*
+ * Release a data buffer: destroy its mkey and DMA mapping if it was dmaed,
+ * free every page of its scatter table, then the table and the buffer.
+ *
+ * Must be called with the device state_mutex held; warns if the mlx5 device
+ * is already detached.
+ */
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+	struct sg_page_iter piter;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	WARN_ON(mvdev->mdev_detach);
+
+	if (buf->dmaed) {
+		struct mlx5_core_dev *mdev = mvdev->mdev;
+
+		mlx5_core_destroy_mkey(mdev, buf->mkey);
+		dma_unmap_sgtable(mdev->device, &buf->table.sgt,
+				  buf->dma_dir, 0);
+	}
+
+	/* Undo alloc_pages_bulk_array() */
+	for_each_sgtable_page(&buf->table.sgt, &piter, 0)
+		__free_page(sg_page_iter_page(&piter));
+
+	sg_free_append_table(&buf->table);
+	kfree(buf);
+}
+
+/*
+ * Allocate a data buffer for @migf.
+ *
+ * With a non-zero @length, enough pages to cover it are added, and unless
+ * @dma_dir is DMA_NONE the buffer is also DMA-mapped. A zero @length yields
+ * an empty buffer with no pages.
+ *
+ * Returns the buffer, or an ERR_PTR (the partially built buffer is freed).
+ */
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+			 size_t length,
+			 enum dma_data_direction dma_dir)
+{
+	struct mlx5_vhca_data_buffer *new_buf;
+	int err;
+
+	new_buf = kzalloc(sizeof(*new_buf), GFP_KERNEL);
+	if (!new_buf)
+		return ERR_PTR(-ENOMEM);
+
+	new_buf->migf = migf;
+	new_buf->dma_dir = dma_dir;
+	if (!length)
+		return new_buf;
+
+	err = mlx5vf_add_migration_pages(new_buf,
+				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+	if (!err && dma_dir != DMA_NONE)
+		err = mlx5vf_dma_data_buffer(new_buf);
+	if (err) {
+		mlx5vf_free_data_buffer(new_buf);
+		return ERR_PTR(err);
+	}
+
+	return new_buf;
+}
+
+/* Queue @buf at the tail of its migration file's avail_list. */
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
+{
+	struct mlx5_vf_migration_file *migf = buf->migf;
+
+	spin_lock_irq(&migf->list_lock);
+	list_add_tail(&buf->buf_elm, &migf->avail_list);
+	spin_unlock_irq(&migf->list_lock);
+}
+
+/*
+ * Get a buffer with direction @dma_dir and at least @length allocated bytes.
+ *
+ * The avail_list is searched first; the first large-enough buffer with a
+ * matching direction is taken off the list and returned. Matching-direction
+ * buffers that are too small are taken off the list as well and freed. When
+ * nothing fits, a new buffer is allocated.
+ *
+ * Must be called with the device state_mutex held. Returns the buffer,
+ * ERR_PTR(-ENOTCONN) when the mlx5 device is detached, or the ERR_PTR from
+ * the allocation.
+ */
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+		       size_t length, enum dma_data_direction dma_dir)
+{
+	struct mlx5_vhca_data_buffer *cand, *next, *picked = NULL;
+	struct list_head stale;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	if (migf->mvdev->mdev_detach)
+		return ERR_PTR(-ENOTCONN);
+
+	INIT_LIST_HEAD(&stale);
+
+	spin_lock_irq(&migf->list_lock);
+	list_for_each_entry_safe(cand, next, &migf->avail_list, buf_elm) {
+		if (cand->dma_dir != dma_dir)
+			continue;
+
+		list_del_init(&cand->buf_elm);
+		if (cand->allocated_length >= length) {
+			picked = cand;
+			break;
+		}
+		/*
+		 * Prevent holding redundant buffers. Collect them on a
+		 * private list and free them once the spin lock
+		 * (&migf->list_lock) is dropped, as
+		 * mlx5vf_free_data_buffer might sleep.
+		 */
+		list_add(&cand->buf_elm, &stale);
+	}
+	spin_unlock_irq(&migf->list_lock);
+
+	if (!picked)
+		picked = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+
+	list_for_each_entry_safe(cand, next, &stale, buf_elm) {
+		list_del(&cand->buf_elm);
+		mlx5vf_free_data_buffer(cand);
+	}
+
+	return picked;
+}
+
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
struct mlx5vf_async_data *async_data = container_of(_work,
struct mlx5vf_async_data, work);
struct mlx5_vf_migration_file *migf = container_of(async_data,
struct mlx5_vf_migration_file, async_data);
- struct mlx5_core_dev *mdev = migf->mvdev->mdev;
mutex_lock(&migf->lock);
if (async_data->status) {
- migf->is_err = true;
+ mlx5vf_put_data_buffer(async_data->buf);
+ if (async_data->header_buf)
+ mlx5vf_put_data_buffer(async_data->header_buf);
+ if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
+ migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
+ else
+ migf->state = MLX5_MIGF_STATE_ERROR;
wake_up_interruptible(&migf->poll_wait);
}
mutex_unlock(&migf->lock);
-
- mlx5_core_destroy_mkey(mdev, async_data->mkey);
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
- mlx5_core_dealloc_pd(mdev, async_data->pdn);
kvfree(async_data->out);
+ complete(&migf->save_comp);
fput(migf->filp);
}
+/*
+ * Write a migration header carrying @image_size (little endian) into the
+ * first page of @header_buf, place the buffer at the file's current max_pos
+ * and queue it on buf_list.
+ *
+ * Returns 0, or -EINVAL when the buffer has no page at offset 0.
+ */
+static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
+			  size_t image_size)
+{
+	struct mlx5_vf_migration_header hdr = {
+		.image_size = cpu_to_le64(image_size),
+	};
+	struct mlx5_vf_migration_file *migf = header_buf->migf;
+	unsigned long irq_flags;
+	struct page *first_page;
+	u8 *dst;
+
+	first_page = mlx5vf_get_migration_page(header_buf, 0);
+	if (!first_page)
+		return -EINVAL;
+
+	dst = kmap_local_page(first_page);
+	memcpy(dst, &hdr, sizeof(hdr));
+	kunmap_local(dst);
+
+	header_buf->length = sizeof(hdr);
+	header_buf->header_image_size = image_size;
+	header_buf->start_pos = migf->max_pos;
+	migf->max_pos += header_buf->length;
+
+	spin_lock_irqsave(&migf->list_lock, irq_flags);
+	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+	spin_unlock_irqrestore(&migf->list_lock, irq_flags);
+
+	return 0;
+}
+
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
struct mlx5vf_async_data *async_data = container_of(context,
@@ -292,67 +506,96 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
struct mlx5_vf_migration_file, async_data);
if (!status) {
- WRITE_ONCE(migf->total_length,
- MLX5_GET(save_vhca_state_out, async_data->out,
- actual_image_size));
+ size_t image_size;
+ unsigned long flags;
+
+ image_size = MLX5_GET(save_vhca_state_out, async_data->out,
+ actual_image_size);
+ if (async_data->header_buf) {
+ status = add_buf_header(async_data->header_buf, image_size);
+ if (status)
+ goto err;
+ }
+ async_data->buf->length = image_size;
+ async_data->buf->start_pos = migf->max_pos;
+ migf->max_pos += async_data->buf->length;
+ spin_lock_irqsave(&migf->list_lock, flags);
+ list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
+ spin_unlock_irqrestore(&migf->list_lock, flags);
+ migf->state = async_data->last_chunk ?
+ MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
wake_up_interruptible(&migf->poll_wait);
}
+err:
/*
* The error and the cleanup flows can't run from an
* interrupt context
*/
+ if (status == -EREMOTEIO)
+ status = MLX5_GET(save_vhca_state_out, async_data->out, status);
async_data->status = status;
queue_work(migf->mvdev->cb_wq, &async_data->work);
}
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf)
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf, bool inc,
+ bool track)
{
u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
+ struct mlx5_vhca_data_buffer *header_buf = NULL;
struct mlx5vf_async_data *async_data;
- struct mlx5_core_dev *mdev;
- u32 pdn, mkey;
int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
- mdev = mvdev->mdev;
- err = mlx5_core_alloc_pd(mdev, &pdn);
+ err = wait_for_completion_interruptible(&migf->save_comp);
if (err)
return err;
- err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE,
- 0);
- if (err)
- goto err_dma_map;
-
- err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
- if (err)
- goto err_create_mkey;
+ if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
+ /*
+ * In case we had a PRE_COPY error, SAVE is triggered only for
+ * the final image, read device full image.
+ */
+ inc = false;
MLX5_SET(save_vhca_state_in, in, opcode,
MLX5_CMD_OP_SAVE_VHCA_STATE);
MLX5_SET(save_vhca_state_in, in, op_mod, 0);
MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
- MLX5_SET(save_vhca_state_in, in, mkey, mkey);
- MLX5_SET(save_vhca_state_in, in, size, migf->total_length);
+ MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
+ MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+ MLX5_SET(save_vhca_state_in, in, incremental, inc);
+ MLX5_SET(save_vhca_state_in, in, set_track, track);
async_data = &migf->async_data;
+ async_data->buf = buf;
+ async_data->last_chunk = !track;
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
goto err_out;
}
- /* no data exists till the callback comes back */
- migf->total_length = 0;
+ if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ header_buf = mlx5vf_get_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(header_buf)) {
+ err = PTR_ERR(header_buf);
+ goto err_free;
+ }
+ }
+
+ if (async_data->last_chunk)
+ migf->state = MLX5_MIGF_STATE_SAVE_LAST;
+
+ async_data->header_buf = header_buf;
get_file(migf->filp);
- async_data->mkey = mkey;
- async_data->pdn = pdn;
err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
async_data->out,
out_size, mlx5vf_save_callback,
@@ -363,68 +606,92 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
return 0;
err_exec:
+ if (header_buf)
+ mlx5vf_put_data_buffer(header_buf);
fput(migf->filp);
+err_free:
kvfree(async_data->out);
err_out:
- mlx5_core_destroy_mkey(mdev, mkey);
-err_create_mkey:
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_FROM_DEVICE, 0);
-err_dma_map:
- mlx5_core_dealloc_pd(mdev, pdn);
+ complete(&migf->save_comp);
return err;
}
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf)
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf)
{
- struct mlx5_core_dev *mdev;
- u32 out[MLX5_ST_SZ_DW(save_vhca_state_out)] = {};
- u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
- u32 pdn, mkey;
+ u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
+ u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
int err;
lockdep_assert_held(&mvdev->state_mutex);
if (mvdev->mdev_detach)
return -ENOTCONN;
- mutex_lock(&migf->lock);
- if (!migf->total_length) {
- err = -EINVAL;
- goto end;
+ if (!buf->dmaed) {
+ err = mlx5vf_dma_data_buffer(buf);
+ if (err)
+ return err;
}
- mdev = mvdev->mdev;
- err = mlx5_core_alloc_pd(mdev, &pdn);
- if (err)
- goto end;
-
- err = dma_map_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
- if (err)
- goto err_reg;
-
- err = _create_mkey(mdev, pdn, migf, NULL, &mkey);
- if (err)
- goto err_mkey;
-
MLX5_SET(load_vhca_state_in, in, opcode,
MLX5_CMD_OP_LOAD_VHCA_STATE);
MLX5_SET(load_vhca_state_in, in, op_mod, 0);
MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
- MLX5_SET(load_vhca_state_in, in, mkey, mkey);
- MLX5_SET(load_vhca_state_in, in, size, migf->total_length);
+ MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
+ MLX5_SET(load_vhca_state_in, in, size, buf->length);
+ return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
+}
- err = mlx5_cmd_exec_inout(mdev, load_vhca_state, in, out);
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
+{
+ int err;
- mlx5_core_destroy_mkey(mdev, mkey);
-err_mkey:
- dma_unmap_sgtable(mdev->device, &migf->table.sgt, DMA_TO_DEVICE, 0);
-err_reg:
- mlx5_core_dealloc_pd(mdev, pdn);
-end:
- mutex_unlock(&migf->lock);
+ lockdep_assert_held(&migf->mvdev->state_mutex);
+ if (migf->mvdev->mdev_detach)
+ return -ENOTCONN;
+
+ err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
return err;
}
+/*
+ * Deallocate the migration file's PD. Does nothing when the mlx5 device is
+ * detached. Must be called with the device state_mutex held.
+ */
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
+{
+	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+
+	lockdep_assert_held(&mvdev->state_mutex);
+	if (!mvdev->mdev_detach)
+		mlx5_core_dealloc_pd(mvdev->mdev, migf->pdn);
+}
+
+/*
+ * Free every data buffer owned by @migf (the dedicated buf and buf_header,
+ * plus everything on avail_list and buf_list) and then release its PD.
+ *
+ * Must be called with the device state_mutex held; warns if the mlx5 device
+ * is already detached.
+ */
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
+{
+	struct mlx5_vhca_data_buffer *cur, *nxt;
+
+	lockdep_assert_held(&migf->mvdev->state_mutex);
+	WARN_ON(migf->mvdev->mdev_detach);
+
+	if (migf->buf) {
+		mlx5vf_free_data_buffer(migf->buf);
+		migf->buf = NULL;
+	}
+
+	if (migf->buf_header) {
+		mlx5vf_free_data_buffer(migf->buf_header);
+		migf->buf_header = NULL;
+	}
+
+	/* fold the available buffers into buf_list so one walk frees all */
+	list_splice(&migf->avail_list, &migf->buf_list);
+
+	list_for_each_entry_safe(cur, nxt, &migf->buf_list, buf_elm) {
+		list_del(&cur->buf_elm);
+		mlx5vf_free_data_buffer(cur);
+	}
+
+	mlx5vf_cmd_dealloc_pd(migf);
+}
+
static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
u32 req_nodes)
{
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 921d5720a1e5..5483171d57ad 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -12,31 +12,74 @@
#include <linux/mlx5/cq.h>
#include <linux/mlx5/qp.h>
+#define MLX5VF_PRE_COPY_SUPP(mvdev) \
+ ((mvdev)->core_device.vdev.migration_flags & VFIO_MIGRATION_PRE_COPY)
+
+enum mlx5_vf_migf_state {
+ MLX5_MIGF_STATE_ERROR = 1,
+ MLX5_MIGF_STATE_PRE_COPY_ERROR,
+ MLX5_MIGF_STATE_PRE_COPY,
+ MLX5_MIGF_STATE_SAVE_LAST,
+ MLX5_MIGF_STATE_COMPLETE,
+};
+
+enum mlx5_vf_load_state {
+ MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER,
+ MLX5_VF_LOAD_STATE_READ_HEADER,
+ MLX5_VF_LOAD_STATE_PREP_IMAGE,
+ MLX5_VF_LOAD_STATE_READ_IMAGE,
+ MLX5_VF_LOAD_STATE_LOAD_IMAGE,
+};
+
+struct mlx5_vf_migration_header {
+ __le64 image_size;
+ /* For future use in case we may need to change the kernel protocol */
+ __le64 flags;
+};
+
+struct mlx5_vhca_data_buffer { /* one buffer of migration data, queued on a migration file's lists */
+ struct sg_append_table table; /* pages backing the buffer, appended by mlx5vf_add_migration_pages() */
+ loff_t start_pos; /* taken from migf->max_pos when the buffer is queued on buf_list */
+ u64 length; /* bytes of valid data: the saved image size, or the header size */
+ u64 allocated_length; /* bytes of pages allocated so far */
+ u64 header_image_size; /* image size recorded by add_buf_header() */
+ u32 mkey; /* created by mlx5vf_dma_data_buffer(), valid while dmaed is set */
+ enum dma_data_direction dma_dir; /* direction given at allocation; DMA_NONE skips the mapping there */
+ u8 dmaed:1; /* set once the table is DMA-mapped and the mkey exists */
+ struct list_head buf_elm; /* link on migf->buf_list or migf->avail_list */
+ struct mlx5_vf_migration_file *migf; /* migration file this buffer was allocated for */
+ /* Optimize mlx5vf_get_migration_page() for sequential access */
+ struct scatterlist *last_offset_sg;
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
struct mlx5vf_async_data {
struct mlx5_async_work cb_work;
struct work_struct work;
+ struct mlx5_vhca_data_buffer *buf;
+ struct mlx5_vhca_data_buffer *header_buf;
int status;
- u32 pdn;
- u32 mkey;
+ u8 last_chunk:1;
void *out;
};
struct mlx5_vf_migration_file {
struct file *filp;
struct mutex lock;
- u8 disabled:1;
- u8 is_err:1;
+ enum mlx5_vf_migf_state state;
- struct sg_append_table table;
- size_t total_length;
- size_t allocated_length;
-
- /* Optimize mlx5vf_get_migration_page() for sequential access */
- struct scatterlist *last_offset_sg;
- unsigned int sg_last_entry;
- unsigned long last_offset;
+ enum mlx5_vf_load_state load_state;
+ u32 pdn;
+ loff_t max_pos;
+ struct mlx5_vhca_data_buffer *buf;
+ struct mlx5_vhca_data_buffer *buf_header;
+ spinlock_t list_lock;
+ struct list_head buf_list;
+ struct list_head avail_list;
struct mlx5vf_pci_core_device *mvdev;
wait_queue_head_t poll_wait;
+ struct completion save_comp;
struct mlx5_async_ctx async_ctx;
struct mlx5vf_async_data async_data;
};
@@ -113,19 +156,42 @@ struct mlx5vf_pci_core_device {
struct mlx5_core_dev *mdev;
};
+enum {
+ MLX5VF_QUERY_INC = (1UL << 0),
+ MLX5VF_QUERY_FINAL = (1UL << 1),
+};
+
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
- size_t *state_size);
+ size_t *state_size, u8 query_flags);
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
const struct vfio_migration_ops *mig_ops,
const struct vfio_log_ops *log_ops);
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev);
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf);
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf, bool inc,
+ bool track);
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
- struct mlx5_vf_migration_file *migf);
+ struct mlx5_vf_migration_file *migf,
+ struct mlx5_vhca_data_buffer *buf);
+int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf);
+void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf);
+void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf);
+struct mlx5_vhca_data_buffer *
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir);
+void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf);
+struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
+ size_t length, enum dma_data_direction dma_dir);
+void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages);
+struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+ unsigned long offset);
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 32d1f38d351e..9feb89c6d939 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -32,8 +32,8 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
core_device);
}
-static struct page *
-mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
+struct page *
+mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
unsigned long offset)
{
unsigned long cur_offset = 0;
@@ -41,20 +41,20 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
unsigned int i;
/* All accesses are sequential */
- if (offset < migf->last_offset || !migf->last_offset_sg) {
- migf->last_offset = 0;
- migf->last_offset_sg = migf->table.sgt.sgl;
- migf->sg_last_entry = 0;
+ if (offset < buf->last_offset || !buf->last_offset_sg) {
+ buf->last_offset = 0;
+ buf->last_offset_sg = buf->table.sgt.sgl;
+ buf->sg_last_entry = 0;
}
- cur_offset = migf->last_offset;
+ cur_offset = buf->last_offset;
- for_each_sg(migf->last_offset_sg, sg,
- migf->table.sgt.orig_nents - migf->sg_last_entry, i) {
+ for_each_sg(buf->last_offset_sg, sg,
+ buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
if (offset < sg->length + cur_offset) {
- migf->last_offset_sg = sg;
- migf->sg_last_entry += i;
- migf->last_offset = cur_offset;
+ buf->last_offset_sg = sg;
+ buf->sg_last_entry += i;
+ buf->last_offset = cur_offset;
return nth_page(sg_page(sg),
(offset - cur_offset) / PAGE_SIZE);
}
@@ -63,8 +63,8 @@ mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
return NULL;
}
-static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
- unsigned int npages)
+int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
+ unsigned int npages)
{
unsigned int to_alloc = npages;
struct page **page_list;
@@ -85,13 +85,13 @@ static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
}
to_alloc -= filled;
ret = sg_alloc_append_table_from_pages(
- &migf->table, page_list, filled, 0,
+ &buf->table, page_list, filled, 0,
filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
GFP_KERNEL);
if (ret)
goto err;
- migf->allocated_length += filled * PAGE_SIZE;
+ buf->allocated_length += filled * PAGE_SIZE;
/* clean input for another bulk allocation */
memset(page_list, 0, filled * sizeof(*page_list));
to_fill = min_t(unsigned int, to_alloc,
@@ -108,16 +108,8 @@ err:
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
- struct sg_page_iter sg_iter;
-
mutex_lock(&migf->lock);
- /* Undo alloc_pages_bulk_array() */
- for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0)
- __free_page(sg_page_iter_page(&sg_iter));
- sg_free_append_table(&migf->table);
- migf->disabled = true;
- migf->total_length = 0;
- migf->allocated_length = 0;
+ migf->state = MLX5_MIGF_STATE_ERROR;
migf->filp->f_pos = 0;
mutex_unlock(&migf->lock);
}
@@ -132,10 +124,91 @@ static int mlx5vf_release_file(struct inode *inode, struct file *filp)
return 0;
}
+/*
+ * Look up the buffer that holds stream position @pos.
+ *
+ * Only the first entry of buf_list is considered. @end_of_data is set to
+ * true when the list is empty and to false otherwise. If the list is not
+ * empty but its first buffer does not cover @pos, the file is moved to
+ * MLX5_MIGF_STATE_ERROR.
+ *
+ * Returns the covering buffer, or NULL.
+ */
+static struct mlx5_vhca_data_buffer *
+mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
+			      bool *end_of_data)
+{
+	struct mlx5_vhca_data_buffer *head, *match = NULL;
+
+	spin_lock_irq(&migf->list_lock);
+	*end_of_data = list_empty(&migf->buf_list);
+	if (!*end_of_data) {
+		head = list_first_entry(&migf->buf_list,
+					struct mlx5_vhca_data_buffer, buf_elm);
+		if (pos >= head->start_pos &&
+		    pos < head->start_pos + head->length) {
+			match = head;
+		} else {
+			/*
+			 * As we use a stream based FD we may expect having
+			 * the data always on first chunk
+			 */
+			migf->state = MLX5_MIGF_STATE_ERROR;
+		}
+	}
+	spin_unlock_irq(&migf->list_lock);
+
+	return match;
+}
+
+/*
+ * Copy data from @vhca_buf to user space, starting at stream position *pos.
+ *
+ * At most *len bytes are copied, and never past the end of the buffer's
+ * data. @buf, @len and @pos are advanced by the amount copied. Once the
+ * position reaches the end of the buffer's data, the buffer is moved from
+ * its current list to the tail of avail_list.
+ *
+ * Returns the number of bytes copied, -EINVAL when a backing page cannot be
+ * found, or -EFAULT when the copy to user space fails.
+ */
+static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
+			       char __user **buf, size_t *len, loff_t *pos)
+{
+	struct mlx5_vf_migration_file *migf = vhca_buf->migf;
+	ssize_t copied = 0;
+	size_t remaining;
+
+	remaining = min_t(size_t,
+			  vhca_buf->start_pos + vhca_buf->length - *pos, *len);
+	while (remaining) {
+		unsigned long rel = *pos - vhca_buf->start_pos;
+		size_t in_page = rel % PAGE_SIZE;
+		struct page *page;
+		size_t chunk;
+		u8 *src;
+		int left;
+
+		/* look the page up by its page-aligned offset in the buffer */
+		page = mlx5vf_get_migration_page(vhca_buf, rel - in_page);
+		if (!page)
+			return -EINVAL;
+
+		chunk = min_t(size_t, remaining, PAGE_SIZE - in_page);
+		src = kmap_local_page(page);
+		left = copy_to_user(*buf, src + in_page, chunk);
+		kunmap_local(src);
+		if (left)
+			return -EFAULT;
+
+		*pos += chunk;
+		*len -= chunk;
+		*buf += chunk;
+		copied += chunk;
+		remaining -= chunk;
+	}
+
+	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
+		spin_lock_irq(&migf->list_lock);
+		list_del_init(&vhca_buf->buf_elm);
+		list_add_tail(&vhca_buf->buf_elm, &migf->avail_list);
+		spin_unlock_irq(&migf->list_lock);
+	}
+
+	return copied;
+}
+
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
loff_t *pos)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5_vhca_data_buffer *vhca_buf;
+ bool first_loop_call = true;
+ bool end_of_data;
ssize_t done = 0;
if (pos)
@@ -144,52 +217,56 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
if (!(filp->f_flags & O_NONBLOCK)) {
if (wait_event_interruptible(migf->poll_wait,
- READ_ONCE(migf->total_length) || migf->is_err))
+ !list_empty(&migf->buf_list) ||
+ migf->state == MLX5_MIGF_STATE_ERROR ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_COMPLETE))
return -ERESTARTSYS;
}
mutex_lock(&migf->lock);
- if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) {
- done = -EAGAIN;
- goto out_unlock;
- }
- if (*pos > migf->total_length) {
- done = -EINVAL;
- goto out_unlock;
- }
- if (migf->disabled || migf->is_err) {
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
done = -ENODEV;
goto out_unlock;
}
- len = min_t(size_t, migf->total_length - *pos, len);
while (len) {
- size_t page_offset;
- struct page *page;
- size_t page_len;
- u8 *from_buff;
- int ret;
+ ssize_t count;
+
+ vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
+ &end_of_data);
+ if (first_loop_call) {
+ first_loop_call = false;
+ /* Temporary end of file as part of PRE_COPY */
+ if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
+ migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
+ done = -ENOMSG;
+ goto out_unlock;
+ }
+
+ if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
+ if (filp->f_flags & O_NONBLOCK) {
+ done = -EAGAIN;
+ goto out_unlock;
+ }
+ }
+ }
+
+ if (end_of_data)
+ goto out_unlock;
- page_offset = (*pos) % PAGE_SIZE;
- page = mlx5vf_get_migration_page(migf, *pos - page_offset);
- if (!page) {
- if (done == 0)
- done = -EINVAL;
+ if (!vhca_buf) {
+ done = -EINVAL;
goto out_unlock;
}
- page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
- from_buff = kmap_local_page(page);
- ret = copy_to_user(buf, from_buff + page_offset, page_len);
- kunmap_local(from_buff);
- if (ret) {
- done = -EFAULT;
+ count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
+ if (count < 0) {
+ done = count;
goto out_unlock;
}
- *pos += page_len;
- len -= page_len;
- done += page_len;
- buf += page_len;
+ done += count;
}
out_unlock:
@@ -206,27 +283,188 @@ static __poll_t mlx5vf_save_poll(struct file *filp,
poll_wait(filp, &migf->poll_wait, wait);
mutex_lock(&migf->lock);
- if (migf->disabled || migf->is_err)
+ if (migf->state == MLX5_MIGF_STATE_ERROR)
pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
- else if (READ_ONCE(migf->total_length))
+ else if (!list_empty(&migf->buf_list) ||
+ migf->state == MLX5_MIGF_STATE_COMPLETE)
pollflags = EPOLLIN | EPOLLRDNORM;
mutex_unlock(&migf->lock);
return pollflags;
}
+/*
+ * The migration FD is already exposed, so user space may keep using it
+ * after an error has been hit on the kernel side.
+ * Put migf into MLX5_MIGF_STATE_ERROR and wake up any task sleeping on
+ * migf->poll_wait so that it re-evaluates the state.
+ */
+static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
+{
+ migf->state = MLX5_MIGF_STATE_ERROR;
+ wake_up_interruptible(&migf->poll_wait);
+}
+
+/*
+ * ioctl handler of the saving migration FD.
+ *
+ * Only VFIO_MIG_GET_PRECOPY_INFO is handled; any other command gets
+ * -ENOTTY. The reply tells user space how many bytes are still to be read
+ * from the current position (initial_bytes / dirty_bytes). When everything
+ * queued so far was consumed and the device reports additional dirty state,
+ * a new incremental SAVE is started so that data is ready for the next read.
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for a short
+ * argsz, a device that is not in PRE_COPY / PRE_COPY_P2P or a position that
+ * is not covered by the first queued buffer, -ENODEV when migf is in error,
+ * or the error of the query / buffer / save helpers.
+ */
+static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct mlx5_vf_migration_file *migf = filp->private_data;
+	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
+	void __user *uarg = (void __user *)arg;
+	struct vfio_precopy_info info = {};
+	struct mlx5_vhca_data_buffer *cur;
+	loff_t *pos = &filp->f_pos;
+	unsigned long minsz;
+	size_t pending = 0;
+	bool drained;
+	int err;
+
+	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+		return -ENOTTY;
+
+	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+	if (copy_from_user(&info, uarg, minsz))
+		return -EFAULT;
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	mutex_lock(&mvdev->state_mutex);
+	switch (mvdev->mig_state) {
+	case VFIO_DEVICE_STATE_PRE_COPY:
+		/*
+		 * Once the query returns it's guaranteed that there is no
+		 * active SAVE command, so the code below is safe as long as
+		 * it takes the proper locks.
+		 */
+		err = mlx5vf_cmd_query_vhca_migration_state(mvdev, &pending,
+							    MLX5VF_QUERY_INC);
+		if (err)
+			goto unlock_state;
+		break;
+	case VFIO_DEVICE_STATE_PRE_COPY_P2P:
+		/*
+		 * A SAVE command can't be issued while the device is
+		 * suspended, so there is no reason to query for extra bytes
+		 * that can't be read.
+		 */
+		break;
+	default:
+		err = -EINVAL;
+		goto unlock_state;
+	}
+
+	mutex_lock(&migf->lock);
+	if (migf->state == MLX5_MIGF_STATE_ERROR) {
+		err = -ENODEV;
+		goto unlock_migf;
+	}
+
+	cur = mlx5vf_get_data_buff_from_pos(migf, *pos, &drained);
+	if (!cur) {
+		if (!drained) {
+			err = -EINVAL;
+			goto unlock_migf;
+		}
+		info.dirty_bytes = pending;
+	} else if (cur->start_pos == 0) {
+		info.initial_bytes = cur->header_image_size - *pos;
+	} else if (cur->start_pos ==
+		   sizeof(struct mlx5_vf_migration_header)) {
+		/* First data buffer following the header */
+		info.initial_bytes = cur->start_pos + cur->length - *pos;
+	} else {
+		info.dirty_bytes = cur->start_pos + cur->length - *pos;
+	}
+	mutex_unlock(&migf->lock);
+
+	if (drained && pending) {
+		/*
+		 * The current state was fully transferred and the device has
+		 * new dirty state: save it now so it is ready to be read.
+		 */
+		cur = mlx5vf_get_data_buffer(migf, pending, DMA_FROM_DEVICE);
+		if (IS_ERR(cur)) {
+			err = PTR_ERR(cur);
+			mlx5vf_mark_err(migf);
+			goto unlock_state;
+		}
+
+		err = mlx5vf_cmd_save_vhca_state(mvdev, migf, cur, true, true);
+		if (err) {
+			mlx5vf_mark_err(migf);
+			mlx5vf_put_data_buffer(cur);
+			goto unlock_state;
+		}
+	}
+
+	mlx5vf_state_mutex_unlock(mvdev);
+	if (copy_to_user(uarg, &info, minsz))
+		return -EFAULT;
+	return 0;
+
+unlock_migf:
+	mutex_unlock(&migf->lock);
+unlock_state:
+	mlx5vf_state_mutex_unlock(mvdev);
+	return err;
+}
+
static const struct file_operations mlx5vf_save_fops = {
.owner = THIS_MODULE,
.read = mlx5vf_save_read,
.poll = mlx5vf_save_poll,
+ .unlocked_ioctl = mlx5vf_precopy_ioctl, /* serves VFIO_MIG_GET_PRECOPY_INFO only, -ENOTTY otherwise */
+ .compat_ioctl = compat_ptr_ioctl,
.release = mlx5vf_release_file,
.llseek = no_llseek,
};
+/*
+ * Save the final incremental device state into mvdev->saving_migf.
+ *
+ * Queries the size with MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL, obtains a
+ * data buffer of that size and issues the SAVE command on it.
+ *
+ * Returns 0 on success or -ENODEV when the migration file is already in
+ * error. On any later failure the migration file is marked in error via
+ * mlx5vf_mark_err() and the helper's error code is returned.
+ */
+static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
+{
+	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
+	struct mlx5_vhca_data_buffer *inc_buf;
+	size_t inc_len;
+	int err;
+
+	if (migf->state == MLX5_MIGF_STATE_ERROR)
+		return -ENODEV;
+
+	err = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_len,
+				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
+	if (err)
+		goto mark_err;
+
+	inc_buf = mlx5vf_get_data_buffer(migf, inc_len, DMA_FROM_DEVICE);
+	if (IS_ERR(inc_buf)) {
+		err = PTR_ERR(inc_buf);
+		goto mark_err;
+	}
+
+	err = mlx5vf_cmd_save_vhca_state(mvdev, migf, inc_buf, true, false);
+	if (!err)
+		return 0;
+
+	/* SAVE failed: give the buffer back before flagging the error */
+	mlx5vf_put_data_buffer(inc_buf);
+mark_err:
+	mlx5vf_mark_err(migf);
+	return err;
+}
+
static struct mlx5_vf_migration_file *
-mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
+mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
struct mlx5_vf_migration_file *migf;
+ struct mlx5_vhca_data_buffer *buf;
+ size_t length;
int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL);
@@ -236,43 +474,211 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
O_RDONLY);
if (IS_ERR(migf->filp)) {
- int err = PTR_ERR(migf->filp);
-
- kfree(migf);
- return ERR_PTR(err);
+ ret = PTR_ERR(migf->filp);
+ goto end;
}
+ migf->mvdev = mvdev;
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out_free;
+
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
init_waitqueue_head(&migf->poll_wait);
+ init_completion(&migf->save_comp);
+ /*
+ * save_comp is being used as a binary semaphore built from
+ * a completion. A normal mutex cannot be used because the lock is
+ * passed between kernel threads and lockdep can't model this.
+ */
+ complete(&migf->save_comp);
mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
- ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
- &migf->total_length);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
if (ret)
- goto out_free;
+ goto out_pd;
- ret = mlx5vf_add_migration_pages(
- migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE));
- if (ret)
- goto out_free;
+ buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_pd;
+ }
- migf->mvdev = mvdev;
- ret = mlx5vf_cmd_save_vhca_state(mvdev, migf);
+ ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
if (ret)
- goto out_free;
+ goto out_save;
return migf;
+out_save:
+ mlx5vf_free_data_buffer(buf);
+out_pd:
+ mlx5vf_cmd_dealloc_pd(migf);
out_free:
fput(migf->filp);
+end:
+ kfree(migf);
return ERR_PTR(ret);
}
+/*
+ * Copy user data into a single page of @vhca_buf.
+ *
+ * The target page and the offset inside it are derived from *pos relative
+ * to the buffer's start_pos. At most *len bytes, and never more than what
+ * is left in that page, are copied. On success *buf, *pos, *done, *len and
+ * vhca_buf->length are all advanced by the amount copied.
+ *
+ * Returns 0 on success, -EINVAL when no page is found for the offset, or
+ * -EFAULT when the copy from user space fails.
+ */
+static int
+mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
+			      const char __user **buf, size_t *len,
+			      loff_t *pos, ssize_t *done)
+{
+	unsigned long rel = *pos - vhca_buf->start_pos;
+	size_t in_page = rel % PAGE_SIZE;
+	struct page *page;
+	size_t chunk;
+	u8 *dst;
+	int left;
+
+	page = mlx5vf_get_migration_page(vhca_buf, rel - in_page);
+	if (!page)
+		return -EINVAL;
+
+	chunk = min_t(size_t, *len, PAGE_SIZE - in_page);
+	dst = kmap_local_page(page);
+	left = copy_from_user(dst + in_page, *buf, chunk);
+	kunmap_local(dst);
+	if (left)
+		return -EFAULT;
+
+	*buf += chunk;
+	*len -= chunk;
+	*pos += chunk;
+	*done += chunk;
+	vhca_buf->length += chunk;
+	return 0;
+}
+
+/*
+ * Consume a write on the resuming FD when the stream carries no header.
+ *
+ * Rejects a @requested_length above MAX_MIGRATION_SIZE with -ENOMEM, grows
+ * @vhca_buf by whole pages when its allocated_length is smaller than
+ * @requested_length, then appends the user data page by page until *len
+ * drops to zero.
+ *
+ * Returns 0 on success or the first error reported by the page allocation
+ * or by mlx5vf_append_page_to_mig_buf().
+ */
+static int
+mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
+				   loff_t requested_length,
+				   const char __user **buf, size_t *len,
+				   loff_t *pos, ssize_t *done)
+{
+	int err = 0;
+
+	if (requested_length > MAX_MIGRATION_SIZE)
+		return -ENOMEM;
+
+	if (requested_length > vhca_buf->allocated_length) {
+		err = mlx5vf_add_migration_pages(
+			vhca_buf,
+			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
+				     PAGE_SIZE));
+		if (err)
+			return err;
+	}
+
+	/* Stop at the first failing page; err stays 0 when all data fits */
+	while (*len && !err)
+		err = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
+						    done);
+
+	return err;
+}
+
+/*
+ * Consume image data that follows a header on the resuming FD.
+ *
+ * Copies at most what is still missing to reach @image_size (and no more
+ * than *len) into @vhca_buf. *len is reduced by the amount planned for this
+ * call once all of it was appended. When the buffer holds the whole image,
+ * the load state moves to MLX5_VF_LOAD_STATE_LOAD_IMAGE, migf->max_pos is
+ * advanced by @image_size and *has_work is set.
+ *
+ * Returns 0 on success or the error of mlx5vf_append_page_to_mig_buf().
+ */
+static ssize_t
+mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
+			 struct mlx5_vhca_data_buffer *vhca_buf,
+			 size_t image_size, const char __user **buf,
+			 size_t *len, loff_t *pos, ssize_t *done,
+			 bool *has_work)
+{
+	size_t wanted = min_t(size_t, *len, image_size - vhca_buf->length);
+	size_t left = wanted;
+	int err;
+
+	while (left) {
+		err = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &left, pos,
+						    done);
+		if (err)
+			return err;
+	}
+
+	*len -= wanted;
+	if (vhca_buf->length != image_size)
+		return 0;
+
+	/* Whole image received: let the caller go on and load it */
+	migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
+	migf->max_pos += image_size;
+	*has_work = true;
+	return 0;
+}
+
+/*
+ * Consume header bytes written to the resuming FD.
+ *
+ * Appends up to the missing part of struct mlx5_vf_migration_header to the
+ * first page of @vhca_buf and advances *buf, *pos, *done, *len and
+ * vhca_buf->length accordingly. Once the header is complete, the image size
+ * is read as a little-endian 64-bit value from the start of the header and
+ * the flags word is checked; then the load state moves to
+ * MLX5_VF_LOAD_STATE_PREP_IMAGE, migf->max_pos grows by the header length
+ * and *has_work is set.
+ *
+ * Returns 0 on success, -EINVAL when the buffer has no first page, -EFAULT
+ * on a failed user copy, -ENOMEM when the image size exceeds
+ * MAX_MIGRATION_SIZE, or -EOPNOTSUPP when any header flag is set.
+ */
+static int
+mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
+			  struct mlx5_vhca_data_buffer *vhca_buf,
+			  const char __user **buf,
+			  size_t *len, loff_t *pos,
+			  ssize_t *done, bool *has_work)
+{
+	const size_t hdr_size = sizeof(struct mlx5_vf_migration_header);
+	struct page *hdr_page;
+	size_t chunk;
+	u8 *hdr;
+	int err = 0;
+
+	chunk = min_t(size_t, *len, hdr_size - vhca_buf->length);
+	hdr_page = mlx5vf_get_migration_page(vhca_buf, 0);
+	if (!hdr_page)
+		return -EINVAL;
+
+	hdr = kmap_local_page(hdr_page);
+	if (copy_from_user(hdr + vhca_buf->length, *buf, chunk)) {
+		err = -EFAULT;
+		goto unmap;
+	}
+
+	*buf += chunk;
+	*pos += chunk;
+	*done += chunk;
+	*len -= chunk;
+	vhca_buf->length += chunk;
+
+	/* Header still incomplete: wait for the next write */
+	if (vhca_buf->length != hdr_size)
+		goto unmap;
+
+	vhca_buf->header_image_size = le64_to_cpup((__le64 *)hdr);
+	if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) {
+		err = -ENOMEM;
+		goto unmap;
+	}
+
+	if (le64_to_cpup((__le64 *)(hdr +
+		    offsetof(struct mlx5_vf_migration_header, flags)))) {
+		err = -EOPNOTSUPP;
+		goto unmap;
+	}
+
+	migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
+	migf->max_pos += vhca_buf->length;
+	*has_work = true;
+unmap:
+	kunmap_local(hdr);
+	return err;
+}
+
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
+ struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
+ struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
loff_t requested_length;
+ bool has_work = false;
ssize_t done = 0;
+ int ret = 0;
if (pos)
return -ESPIPE;
@@ -282,56 +688,83 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
check_add_overflow((loff_t)len, *pos, &requested_length))
return -EINVAL;
- if (requested_length > MAX_MIGRATION_SIZE)
- return -ENOMEM;
-
+ mutex_lock(&migf->mvdev->state_mutex);
mutex_lock(&migf->lock);
- if (migf->disabled) {
- done = -ENODEV;
+ if (migf->state == MLX5_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
goto out_unlock;
}
- if (migf->allocated_length < requested_length) {
- done = mlx5vf_add_migration_pages(
- migf,
- DIV_ROUND_UP(requested_length - migf->allocated_length,
- PAGE_SIZE));
- if (done)
- goto out_unlock;
- }
-
- while (len) {
- size_t page_offset;
- struct page *page;
- size_t page_len;
- u8 *to_buff;
- int ret;
-
- page_offset = (*pos) % PAGE_SIZE;
- page = mlx5vf_get_migration_page(migf, *pos - page_offset);
- if (!page) {
- if (done == 0)
- done = -EINVAL;
- goto out_unlock;
+ while (len || has_work) {
+ has_work = false;
+ switch (migf->load_state) {
+ case MLX5_VF_LOAD_STATE_READ_HEADER:
+ ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
+ &buf, &len, pos,
+ &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_PREP_IMAGE:
+ {
+ u64 size = vhca_buf_header->header_image_size;
+
+ if (vhca_buf->allocated_length < size) {
+ mlx5vf_free_data_buffer(vhca_buf);
+
+ migf->buf = mlx5vf_alloc_data_buffer(migf,
+ size, DMA_TO_DEVICE);
+ if (IS_ERR(migf->buf)) {
+ ret = PTR_ERR(migf->buf);
+ migf->buf = NULL;
+ goto out_unlock;
+ }
+
+ vhca_buf = migf->buf;
+ }
+
+ vhca_buf->start_pos = migf->max_pos;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
+ break;
}
+ case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
+ ret = mlx5vf_resume_read_image_no_header(vhca_buf,
+ requested_length,
+ &buf, &len, pos, &done);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_READ_IMAGE:
+ ret = mlx5vf_resume_read_image(migf, vhca_buf,
+ vhca_buf_header->header_image_size,
+ &buf, &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
+ ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
+ if (ret)
+ goto out_unlock;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+
+ /* prep header buf for next image */
+ vhca_buf_header->length = 0;
+ vhca_buf_header->header_image_size = 0;
+ /* prep data buf for next image */
+ vhca_buf->length = 0;
- page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
- to_buff = kmap_local_page(page);
- ret = copy_from_user(to_buff + page_offset, buf, page_len);
- kunmap_local(to_buff);
- if (ret) {
- done = -EFAULT;
- goto out_unlock;
+ break;
+ default:
+ break;
}
- *pos += page_len;
- len -= page_len;
- done += page_len;
- buf += page_len;
- migf->total_length += page_len;
}
+
out_unlock:
+ if (ret)
+ migf->state = MLX5_MIGF_STATE_ERROR;
mutex_unlock(&migf->lock);
- return done;
+ mlx5vf_state_mutex_unlock(migf->mvdev);
+ return ret ? ret : done;
}
static const struct file_operations mlx5vf_resume_fops = {
@@ -345,6 +778,8 @@ static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
struct mlx5_vf_migration_file *migf;
+ struct mlx5_vhca_data_buffer *buf;
+ int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL);
if (!migf)
@@ -353,20 +788,59 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
O_WRONLY);
if (IS_ERR(migf->filp)) {
- int err = PTR_ERR(migf->filp);
+ ret = PTR_ERR(migf->filp);
+ goto end;
+ }
- kfree(migf);
- return ERR_PTR(err);
+ migf->mvdev = mvdev;
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out_free;
+
+ buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_pd;
+ }
+
+ migf->buf = buf;
+ if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ buf = mlx5vf_alloc_data_buffer(migf,
+ sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_buf;
+ }
+
+ migf->buf_header = buf;
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
+ } else {
+ /* Initial state will be to read the image */
+ migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
}
+
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
return migf;
+out_buf:
+ mlx5vf_free_data_buffer(migf->buf);
+out_pd:
+ mlx5vf_cmd_dealloc_pd(migf);
+out_free:
+ fput(migf->filp);
+end:
+ kfree(migf);
+ return ERR_PTR(ret);
}
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
if (mvdev->resuming_migf) {
mlx5vf_disable_fd(mvdev->resuming_migf);
+ mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
fput(mvdev->resuming_migf->filp);
mvdev->resuming_migf = NULL;
}
@@ -374,6 +848,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
cancel_work_sync(&mvdev->saving_migf->async_data.work);
mlx5vf_disable_fd(mvdev->saving_migf);
+ mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
fput(mvdev->saving_migf->filp);
mvdev->saving_migf = NULL;
}
@@ -402,7 +877,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return NULL;
}
- if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
ret = mlx5vf_cmd_suspend_vhca(mvdev,
MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
if (ret)
@@ -410,7 +886,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return NULL;
}
- if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
ret = mlx5vf_cmd_resume_vhca(mvdev,
MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
if (ret)
@@ -421,7 +898,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
struct mlx5_vf_migration_file *migf;
- migf = mlx5vf_pci_save_device_data(mvdev);
+ migf = mlx5vf_pci_save_device_data(mvdev, false);
if (IS_ERR(migf))
return ERR_CAST(migf);
get_file(migf->filp);
@@ -429,7 +906,10 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
return migf->filp;
}
- if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) {
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
+ new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
mlx5vf_disable_fds(mvdev);
return NULL;
}
@@ -446,14 +926,39 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
}
if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
- ret = mlx5vf_cmd_load_vhca_state(mvdev,
- mvdev->resuming_migf);
- if (ret)
- return ERR_PTR(ret);
+ if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
+ ret = mlx5vf_cmd_load_vhca_state(mvdev,
+ mvdev->resuming_migf,
+ mvdev->resuming_migf->buf);
+ if (ret)
+ return ERR_PTR(ret);
+ }
mlx5vf_disable_fds(mvdev);
return NULL;
}
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct mlx5_vf_migration_file *migf;
+
+ migf = mlx5vf_pci_save_device_data(mvdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ mvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ ret = mlx5vf_cmd_suspend_vhca(mvdev,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
+ if (ret)
+ return ERR_PTR(ret);
+ ret = mlx5vf_pci_save_device_inc_data(mvdev);
+ return ret ? ERR_PTR(ret) : NULL;
+ }
+
/*
* vfio_mig_get_next_state() does not use arcs other than the above
*/
@@ -512,6 +1017,23 @@ mlx5vf_pci_set_device_state(struct vfio_device *vdev,
return res;
}
+/*
+ * Report the device state size through @stop_copy_length.
+ *
+ * The size is taken from mlx5vf_cmd_query_vhca_migration_state() called
+ * with no query flags while holding state_mutex. @stop_copy_length is left
+ * untouched when the query fails.
+ *
+ * Returns 0 on success or the error of the query.
+ */
+static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
+				    unsigned long *stop_copy_length)
+{
+	struct mlx5vf_pci_core_device *mvdev;
+	size_t queried_size;
+	int err;
+
+	mvdev = container_of(vdev, struct mlx5vf_pci_core_device,
+			     core_device.vdev);
+
+	mutex_lock(&mvdev->state_mutex);
+	err = mlx5vf_cmd_query_vhca_migration_state(mvdev, &queried_size, 0);
+	if (err == 0)
+		*stop_copy_length = queried_size;
+	mlx5vf_state_mutex_unlock(mvdev);
+
+	return err;
+}
+
static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state *curr_state)
{
@@ -577,6 +1099,7 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
.migration_set_state = mlx5vf_pci_set_device_state,
.migration_get_state = mlx5vf_pci_get_device_state,
+ .migration_get_data_size = mlx5vf_pci_get_data_size,
};
static const struct vfio_log_ops mlx5vf_pci_log_ops = {
@@ -679,18 +1202,7 @@ static struct pci_driver mlx5vf_pci_driver = {
.driver_managed_dma = true,
};
-static void __exit mlx5vf_pci_cleanup(void)
-{
- pci_unregister_driver(&mlx5vf_pci_driver);
-}
-
-static int __init mlx5vf_pci_init(void)
-{
- return pci_register_driver(&mlx5vf_pci_driver);
-}
-
-module_init(mlx5vf_pci_init);
-module_exit(mlx5vf_pci_cleanup);
+module_pci_driver(mlx5vf_pci_driver);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index e030c2120183..26a541cc64d1 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -27,6 +27,9 @@
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>
+#if IS_ENABLED(CONFIG_EEH)
+#include <asm/eeh.h>
+#endif
#include "vfio_pci_priv.h"
@@ -686,7 +689,9 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev)
vdev->sriov_pf_core_dev->vf_token->users--;
mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
}
- vfio_spapr_pci_eeh_release(vdev->pdev);
+#if IS_ENABLED(CONFIG_EEH)
+ eeh_dev_release(vdev->pdev);
+#endif
vfio_pci_core_disable(vdev);
mutex_lock(&vdev->igate);
@@ -705,7 +710,9 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
{
vfio_pci_probe_mmaps(vdev);
- vfio_spapr_pci_eeh_open(vdev->pdev);
+#if IS_ENABLED(CONFIG_EEH)
+ eeh_dev_open(vdev->pdev);
+#endif
if (vdev->sriov_pf_core_dev) {
mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
@@ -2109,7 +2116,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
mutex_destroy(&vdev->vma_lock);
kfree(vdev->region);
kfree(vdev->pm_save);
- vfio_free_device(core_vdev);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev);
@@ -2128,7 +2134,8 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
if (vdev->vdev.mig_ops) {
if (!(vdev->vdev.mig_ops->migration_get_state &&
- vdev->vdev.mig_ops->migration_set_state) ||
+ vdev->vdev.mig_ops->migration_set_state &&
+ vdev->vdev.mig_ops->migration_get_data_size) ||
!(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY))
return -EINVAL;
}
diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c
index 5a046098d0bd..83fe54015595 100644
--- a/drivers/vfio/platform/vfio_amba.c
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -95,7 +95,6 @@ static void vfio_amba_release_dev(struct vfio_device *core_vdev)
vfio_platform_release_common(vdev);
kfree(vdev->name);
- vfio_free_device(core_vdev);
}
static void vfio_amba_remove(struct amba_device *adev)
diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
index b87c3b708783..22a1efca32a8 100644
--- a/drivers/vfio/platform/vfio_platform.c
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -83,7 +83,6 @@ static void vfio_platform_release_dev(struct vfio_device *core_vdev)
container_of(core_vdev, struct vfio_platform_device, vdev);
vfio_platform_release_common(vdev);
- vfio_free_device(core_vdev);
}
static int vfio_platform_remove(struct platform_device *pdev)
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index 55dc4f43c31e..1a0a238ffa35 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -72,12 +72,11 @@ static int vfio_platform_acpi_call_reset(struct vfio_platform_device *vdev,
const char **extra_dbg)
{
#ifdef CONFIG_ACPI
- struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
struct device *dev = vdev->device;
acpi_handle handle = ACPI_HANDLE(dev);
acpi_status acpi_ret;
- acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, &buffer);
+ acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, NULL);
if (ACPI_FAILURE(acpi_ret)) {
if (extra_dbg)
*extra_dbg = acpi_format_exception(acpi_ret);
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index 2e05418fd18d..f8219a438bfb 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -232,6 +232,19 @@ static inline void vfio_iommufd_unbind(struct vfio_device *device)
}
#endif
+#if IS_ENABLED(CONFIG_VFIO_VIRQFD)
+int __init vfio_virqfd_init(void);
+void vfio_virqfd_exit(void);
+#else /* !IS_ENABLED(CONFIG_VFIO_VIRQFD): no-op stubs, init reports success */
+static inline int __init vfio_virqfd_init(void)
+{
+ return 0;
+}
+static inline void vfio_virqfd_exit(void)
+{
+}
+#endif /* IS_ENABLED(CONFIG_VFIO_VIRQFD) */
+
#ifdef CONFIG_VFIO_NOIOMMU
extern bool vfio_noiommu __read_mostly;
#else
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 169f07ac162d..60a50ce8701e 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2013 IBM Corp. All rights reserved.
* Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ * Copyright Gavin Shan, IBM Corporation 2014.
*
* Derived from original vfio_iommu_type1.c:
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
@@ -773,6 +774,57 @@ static long tce_iommu_create_default_window(struct tce_container *container)
return ret;
}
+/*
+ * Run one EEH PE operation, described by the struct vfio_eeh_pe_op that
+ * user space passed at @arg, on the PE that @group maps to.
+ *
+ * Returns -ENODEV when @group maps to no PE, -EFAULT when @arg cannot be
+ * read, -EINVAL for a too small argsz, non-zero flags or an unknown op;
+ * otherwise the return value of the selected eeh_pe_*() helper.
+ */
+static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group,
+				       unsigned long arg)
+{
+	struct eeh_pe *pe;
+	struct vfio_eeh_pe_op op;
+	unsigned long minsz;
+
+	pe = eeh_iommu_group_to_pe(group);
+	if (!pe)
+		return -ENODEV;
+
+	/* Fetch only the fixed part of the request, up to and including op */
+	minsz = offsetofend(struct vfio_eeh_pe_op, op);
+	if (copy_from_user(&op, (void __user *)arg, minsz))
+		return -EFAULT;
+	if (op.argsz < minsz || op.flags)
+		return -EINVAL;
+
+	switch (op.op) {
+	case VFIO_EEH_PE_DISABLE:
+		return eeh_pe_set_option(pe, EEH_OPT_DISABLE);
+	case VFIO_EEH_PE_ENABLE:
+		return eeh_pe_set_option(pe, EEH_OPT_ENABLE);
+	case VFIO_EEH_PE_UNFREEZE_IO:
+		return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
+	case VFIO_EEH_PE_UNFREEZE_DMA:
+		return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
+	case VFIO_EEH_PE_GET_STATE:
+		return eeh_pe_get_state(pe);
+	case VFIO_EEH_PE_RESET_DEACTIVATE:
+		return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
+	case VFIO_EEH_PE_RESET_HOT:
+		return eeh_pe_reset(pe, EEH_RESET_HOT, true);
+	case VFIO_EEH_PE_RESET_FUNDAMENTAL:
+		return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
+	case VFIO_EEH_PE_CONFIGURE:
+		return eeh_pe_configure(pe);
+	case VFIO_EEH_PE_INJECT_ERR:
+		/*
+		 * Error injection carries extra parameters, so the larger
+		 * structure (up to and including err.mask) is read again.
+		 */
+		minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
+		if (op.argsz < minsz)
+			return -EINVAL;
+		if (copy_from_user(&op, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		return eeh_pe_inject_err(pe, op.err.type, op.err.func,
+					 op.err.addr, op.err.mask);
+	default:
+		return -EINVAL;
+	}
+}
+
static long tce_iommu_ioctl(void *iommu_data,
unsigned int cmd, unsigned long arg)
{
@@ -785,14 +837,12 @@ static long tce_iommu_ioctl(void *iommu_data,
switch (arg) {
case VFIO_SPAPR_TCE_IOMMU:
case VFIO_SPAPR_TCE_v2_IOMMU:
- ret = 1;
- break;
+ return 1;
+ case VFIO_EEH:
+ return eeh_enabled();
default:
- ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
- break;
+ return 0;
}
-
- return (ret < 0) ? 0 : ret;
}
/*
@@ -1046,8 +1096,7 @@ static long tce_iommu_ioctl(void *iommu_data,
ret = 0;
list_for_each_entry(tcegrp, &container->group_list, next) {
- ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
- cmd, arg);
+ ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg);
if (ret)
return ret;
}
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index e21ff965141e..5177bb061b17 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -158,15 +158,15 @@ static void vfio_device_release(struct device *dev)
vfio_release_device_set(device);
ida_free(&vfio.device_ida, device->index);
- /*
- * kvfree() cannot be done here due to a life cycle mess in
- * vfio-ccw. Before the ccw part is fixed all drivers are
- * required to support @release and call vfio_free_device()
- * from there.
- */
- device->ops->release(device);
+ if (device->ops->release)
+ device->ops->release(device);
+
+ kvfree(device);
}
+static int vfio_init_device(struct vfio_device *device, struct device *dev,
+ const struct vfio_device_ops *ops);
+
/*
* Allocate and initialize vfio_device so it can be registered to vfio
* core.
@@ -205,11 +205,9 @@ EXPORT_SYMBOL_GPL(_vfio_alloc_device);
/*
* Initialize a vfio_device so it can be registered to vfio core.
- *
- * Only vfio-ccw driver should call this interface.
*/
-int vfio_init_device(struct vfio_device *device, struct device *dev,
- const struct vfio_device_ops *ops)
+static int vfio_init_device(struct vfio_device *device, struct device *dev,
+ const struct vfio_device_ops *ops)
{
int ret;
@@ -241,18 +239,6 @@ out_uninit:
ida_free(&vfio.device_ida, device->index);
return ret;
}
-EXPORT_SYMBOL_GPL(vfio_init_device);
-
-/*
- * The helper called by driver @release callback to free the device
- * structure. Drivers which don't have private data to clean can
- * simply use this helper as its @release.
- */
-void vfio_free_device(struct vfio_device *device)
-{
- kvfree(device);
-}
-EXPORT_SYMBOL_GPL(vfio_free_device);
static int __vfio_register_dev(struct vfio_device *device,
enum vfio_group_type type)
@@ -504,7 +490,7 @@ int vfio_mig_get_next_state(struct vfio_device *device,
enum vfio_device_mig_state new_fsm,
enum vfio_device_mig_state *next_fsm)
{
- enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
+ enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
/*
* The coding in this table requires the driver to implement the
* following FSM arcs:
@@ -519,30 +505,65 @@ int vfio_mig_get_next_state(struct vfio_device *device,
* RUNNING_P2P -> RUNNING
* RUNNING_P2P -> STOP
* STOP -> RUNNING_P2P
- * Without P2P the driver must implement:
+ *
+ * If precopy is supported then the driver must support these additional
+ * FSM arcs:
+ * RUNNING -> PRE_COPY
+ * PRE_COPY -> RUNNING
+ * PRE_COPY -> STOP_COPY
+ * However, if precopy and P2P are supported together then the driver
+ * must support these additional arcs beyond the P2P arcs above:
+ * PRE_COPY -> RUNNING
+ * PRE_COPY -> PRE_COPY_P2P
+ * PRE_COPY_P2P -> PRE_COPY
+ * PRE_COPY_P2P -> RUNNING_P2P
+ * PRE_COPY_P2P -> STOP_COPY
+ * RUNNING -> PRE_COPY
+ * RUNNING_P2P -> PRE_COPY_P2P
+ *
+ * Without P2P and precopy the driver must implement:
* RUNNING -> STOP
* STOP -> RUNNING
*
* The coding will step through multiple states for some combination
* transitions; if all optional features are supported, this means the
* following ones:
+ * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
+ * PRE_COPY -> RUNNING -> RUNNING_P2P
+ * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
+ * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
+ * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
+ * PRE_COPY_P2P -> RUNNING_P2P -> STOP
+ * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
* RESUMING -> STOP -> RUNNING_P2P
+ * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
* RESUMING -> STOP -> RUNNING_P2P -> RUNNING
+ * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
* RESUMING -> STOP -> STOP_COPY
+ * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
* RUNNING -> RUNNING_P2P -> STOP
* RUNNING -> RUNNING_P2P -> STOP -> RESUMING
* RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
+ * RUNNING_P2P -> RUNNING -> PRE_COPY
* RUNNING_P2P -> STOP -> RESUMING
* RUNNING_P2P -> STOP -> STOP_COPY
+ * STOP -> RUNNING_P2P -> PRE_COPY_P2P
* STOP -> RUNNING_P2P -> RUNNING
+ * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
* STOP_COPY -> STOP -> RESUMING
* STOP_COPY -> STOP -> RUNNING_P2P
* STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
+ *
+ * The following transitions are blocked:
+ * STOP_COPY -> PRE_COPY
+ * STOP_COPY -> PRE_COPY_P2P
*/
static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
[VFIO_DEVICE_STATE_STOP] = {
[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
@@ -551,14 +572,38 @@ int vfio_mig_get_next_state(struct vfio_device *device,
[VFIO_DEVICE_STATE_RUNNING] = {
[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
},
+ [VFIO_DEVICE_STATE_PRE_COPY] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+ },
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+ },
[VFIO_DEVICE_STATE_STOP_COPY] = {
[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
@@ -567,6 +612,8 @@ int vfio_mig_get_next_state(struct vfio_device *device,
[VFIO_DEVICE_STATE_RESUMING] = {
[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
@@ -575,6 +622,8 @@ int vfio_mig_get_next_state(struct vfio_device *device,
[VFIO_DEVICE_STATE_RUNNING_P2P] = {
[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
@@ -583,6 +632,8 @@ int vfio_mig_get_next_state(struct vfio_device *device,
[VFIO_DEVICE_STATE_ERROR] = {
[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
+ [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
@@ -593,6 +644,11 @@ int vfio_mig_get_next_state(struct vfio_device *device,
static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
+ [VFIO_DEVICE_STATE_PRE_COPY] =
+ VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
+ [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
+ VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY,
[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
[VFIO_DEVICE_STATE_RUNNING_P2P] =
@@ -704,6 +760,34 @@ out_copy:
return 0;
}
+static int /* handler for the VFIO_DEVICE_FEATURE_MIG_DATA_SIZE feature */
+vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
+ u32 flags, void __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_mig_data_size data_size = {}; /* zeroed before being copied to userspace */
+ unsigned long stop_copy_length;
+ int ret;
+
+ if (!device->mig_ops) /* device registered no migration ops */
+ return -ENOTTY;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+ sizeof(data_size));
+ if (ret != 1) /* any result other than 1 is handed back to the caller unchanged */
+ return ret;
+
+ ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length); /* NOTE(review): no NULL check on the callback - confirm registration rejects mig_ops without it */
+ if (ret) /* driver error is propagated as-is */
+ return ret;
+
+ data_size.stop_copy_length = stop_copy_length; /* place the driver's estimate in the uapi struct */
+ if (copy_to_user(arg, &data_size, sizeof(data_size)))
+ return -EFAULT;
+
+ return 0;
+}
+
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
u32 flags, void __user *arg,
size_t argsz)
@@ -931,6 +1015,10 @@ static int vfio_ioctl_device_feature(struct vfio_device *device,
return vfio_ioctl_device_feature_logging_report(
device, feature.flags, arg->data,
feature.argsz - minsz);
+ case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
+ return vfio_ioctl_device_feature_migration_data_size(
+ device, feature.flags, arg->data,
+ feature.argsz - minsz);
default:
if (unlikely(!device->ops->device_feature))
return -EINVAL;
@@ -1260,6 +1348,10 @@ static int __init vfio_init(void)
if (ret)
return ret;
+ ret = vfio_virqfd_init();
+ if (ret)
+ goto err_virqfd;
+
/* /sys/class/vfio-dev/vfioX */
vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
if (IS_ERR(vfio.device_class)) {
@@ -1271,6 +1363,8 @@ static int __init vfio_init(void)
return 0;
err_dev_class:
+ vfio_virqfd_exit();
+err_virqfd:
vfio_group_cleanup();
return ret;
}
@@ -1280,6 +1374,7 @@ static void __exit vfio_cleanup(void)
ida_destroy(&vfio.device_ida);
class_destroy(vfio.device_class);
vfio.device_class = NULL;
+ vfio_virqfd_exit();
vfio_group_cleanup();
xa_destroy(&vfio_device_set_xa);
}
diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
deleted file mode 100644
index 67f55ac1d459..000000000000
--- a/drivers/vfio/vfio_spapr_eeh.c
+++ /dev/null
@@ -1,107 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * EEH functionality support for VFIO devices. The feature is only
- * available on sPAPR compatible platforms.
- *
- * Copyright Gavin Shan, IBM Corporation 2014.
- */
-
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/vfio.h>
-#include <asm/eeh.h>
-
-#define DRIVER_VERSION "0.1"
-#define DRIVER_AUTHOR "Gavin Shan, IBM Corporation"
-#define DRIVER_DESC "VFIO IOMMU SPAPR EEH"
-
-/* We might build address mapping here for "fast" path later */
-void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
-{
- eeh_dev_open(pdev);
-}
-EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open);
-
-void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
-{
- eeh_dev_release(pdev);
-}
-EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release);
-
-long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
- unsigned int cmd, unsigned long arg)
-{
- struct eeh_pe *pe;
- struct vfio_eeh_pe_op op;
- unsigned long minsz;
- long ret = -EINVAL;
-
- switch (cmd) {
- case VFIO_CHECK_EXTENSION:
- if (arg == VFIO_EEH)
- ret = eeh_enabled() ? 1 : 0;
- else
- ret = 0;
- break;
- case VFIO_EEH_PE_OP:
- pe = eeh_iommu_group_to_pe(group);
- if (!pe)
- return -ENODEV;
-
- minsz = offsetofend(struct vfio_eeh_pe_op, op);
- if (copy_from_user(&op, (void __user *)arg, minsz))
- return -EFAULT;
- if (op.argsz < minsz || op.flags)
- return -EINVAL;
-
- switch (op.op) {
- case VFIO_EEH_PE_DISABLE:
- ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE);
- break;
- case VFIO_EEH_PE_ENABLE:
- ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE);
- break;
- case VFIO_EEH_PE_UNFREEZE_IO:
- ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
- break;
- case VFIO_EEH_PE_UNFREEZE_DMA:
- ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
- break;
- case VFIO_EEH_PE_GET_STATE:
- ret = eeh_pe_get_state(pe);
- break;
- case VFIO_EEH_PE_RESET_DEACTIVATE:
- ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
- break;
- case VFIO_EEH_PE_RESET_HOT:
- ret = eeh_pe_reset(pe, EEH_RESET_HOT, true);
- break;
- case VFIO_EEH_PE_RESET_FUNDAMENTAL:
- ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
- break;
- case VFIO_EEH_PE_CONFIGURE:
- ret = eeh_pe_configure(pe);
- break;
- case VFIO_EEH_PE_INJECT_ERR:
- minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
- if (op.argsz < minsz)
- return -EINVAL;
- if (copy_from_user(&op, (void __user *)arg, minsz))
- return -EFAULT;
-
- ret = eeh_pe_inject_err(pe, op.err.type, op.err.func,
- op.err.addr, op.err.mask);
- break;
- default:
- ret = -EINVAL;
- }
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl);
-
-MODULE_VERSION(DRIVER_VERSION);
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR(DRIVER_AUTHOR);
-MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 414e98d82b02..497a17b37865 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -12,15 +12,12 @@
#include <linux/file.h>
#include <linux/module.h>
#include <linux/slab.h>
-
-#define DRIVER_VERSION "0.1"
-#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
-#define DRIVER_DESC "IRQFD support for VFIO bus drivers"
+#include "vfio.h"
static struct workqueue_struct *vfio_irqfd_cleanup_wq;
static DEFINE_SPINLOCK(virqfd_lock);
-static int __init vfio_virqfd_init(void)
+int __init vfio_virqfd_init(void)
{
vfio_irqfd_cleanup_wq =
create_singlethread_workqueue("vfio-irqfd-cleanup");
@@ -30,7 +27,7 @@ static int __init vfio_virqfd_init(void)
return 0;
}
-static void __exit vfio_virqfd_exit(void)
+void vfio_virqfd_exit(void)
{
destroy_workqueue(vfio_irqfd_cleanup_wq);
}
@@ -216,11 +213,3 @@ void vfio_virqfd_disable(struct virqfd **pvirqfd)
flush_workqueue(vfio_irqfd_cleanup_wq);
}
EXPORT_SYMBOL_GPL(vfio_virqfd_disable);
-
-module_init(vfio_virqfd_init);
-module_exit(vfio_virqfd_exit);
-
-MODULE_VERSION(DRIVER_VERSION);
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR(DRIVER_AUTHOR);
-MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 152d2d7f8743..f3d1c62c98dd 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1891,7 +1891,12 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
u8 max_reformat_remove_size[0x8];
u8 max_reformat_remove_offset[0x8];
- u8 reserved_at_c0[0xe0];
+ u8 reserved_at_c0[0x8];
+ u8 migration_multi_load[0x1];
+ u8 migration_tracking_state[0x1];
+ u8 reserved_at_ca[0x16];
+
+ u8 reserved_at_e0[0xc0];
u8 reserved_at_1a0[0xb];
u8 log_min_mkey_entity_size[0x5];
@@ -12033,7 +12038,8 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits {
u8 reserved_at_20[0x10];
u8 op_mod[0x10];
- u8 reserved_at_40[0x10];
+ u8 incremental[0x1];
+ u8 reserved_at_41[0xf];
u8 vhca_id[0x10];
u8 reserved_at_60[0x20];
@@ -12059,7 +12065,9 @@ struct mlx5_ifc_save_vhca_state_in_bits {
u8 reserved_at_20[0x10];
u8 op_mod[0x10];
- u8 reserved_at_40[0x10];
+ u8 incremental[0x1];
+ u8 set_track[0x1];
+ u8 reserved_at_42[0xe];
u8 vhca_id[0x10];
u8 reserved_at_60[0x20];
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index a615542df1e0..35be78e9ae57 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -146,6 +146,9 @@ int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
* @migration_get_state: Optional callback to get the migration state for
* devices that support migration. It's mandatory for
* VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ * @migration_get_data_size: Optional callback to get the estimated data
+ * length that will be required to complete stop copy. It's mandatory for
+ * VFIO_DEVICE_FEATURE_MIGRATION migration support.
*/
struct vfio_migration_ops {
struct file *(*migration_set_state)(
@@ -153,6 +156,8 @@ struct vfio_migration_ops {
enum vfio_device_mig_state new_state);
int (*migration_get_state)(struct vfio_device *device,
enum vfio_device_mig_state *curr_state);
+ int (*migration_get_data_size)(struct vfio_device *device,
+ unsigned long *stop_copy_length);
};
/**
@@ -215,9 +220,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
dev, ops), \
struct dev_struct, member)
-int vfio_init_device(struct vfio_device *device, struct device *dev,
- const struct vfio_device_ops *ops);
-void vfio_free_device(struct vfio_device *device);
static inline void vfio_put_device(struct vfio_device *device)
{
put_device(&device->device);
@@ -271,29 +273,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
int num_irqs, int max_irq_type,
size_t *data_size);
-struct pci_dev;
-#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
-void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
-void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
-long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd,
- unsigned long arg);
-#else
-static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
-{
-}
-
-static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
-{
-}
-
-static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
- unsigned int cmd,
- unsigned long arg)
-{
- return -ENOTTY;
-}
-#endif /* CONFIG_VFIO_SPAPR_EEH */
-
/*
* IRQfd - generic
*/
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index d7d8e0922376..23105eb036fa 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -819,12 +819,20 @@ struct vfio_device_feature {
* VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P
* is supported in addition to the STOP_COPY states.
*
+ * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY means that
+ * PRE_COPY is supported in addition to the STOP_COPY states.
+ *
+ * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY
+ * means that RUNNING_P2P, PRE_COPY and PRE_COPY_P2P are supported
+ * in addition to the STOP_COPY states.
+ *
* Other combinations of flags have behavior to be defined in the future.
*/
struct vfio_device_feature_migration {
__aligned_u64 flags;
#define VFIO_MIGRATION_STOP_COPY (1 << 0)
#define VFIO_MIGRATION_P2P (1 << 1)
+#define VFIO_MIGRATION_PRE_COPY (1 << 2)
};
#define VFIO_DEVICE_FEATURE_MIGRATION 1
@@ -875,8 +883,13 @@ struct vfio_device_feature_mig_state {
* RESUMING - The device is stopped and is loading a new internal state
* ERROR - The device has failed and must be reset
*
- * And 1 optional state to support VFIO_MIGRATION_P2P:
+ * And optional states to support VFIO_MIGRATION_P2P:
* RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA
+ * And VFIO_MIGRATION_PRE_COPY:
+ * PRE_COPY - The device is running normally but tracking internal state
+ * changes
+ * And VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY:
+ * PRE_COPY_P2P - PRE_COPY, except the device cannot do peer to peer DMA
*
* The FSM takes actions on the arcs between FSM states. The driver implements
* the following behavior for the FSM arcs:
@@ -908,20 +921,48 @@ struct vfio_device_feature_mig_state {
*
* To abort a RESUMING session the device must be reset.
*
+ * PRE_COPY -> RUNNING
* RUNNING_P2P -> RUNNING
* While in RUNNING the device is fully operational, the device may generate
* interrupts, DMA, respond to MMIO, all vfio device regions are functional,
* and the device may advance its internal state.
*
+ * The PRE_COPY arc will terminate a data transfer session.
+ *
+ * PRE_COPY_P2P -> RUNNING_P2P
* RUNNING -> RUNNING_P2P
* STOP -> RUNNING_P2P
* While in RUNNING_P2P the device is partially running in the P2P quiescent
* state defined below.
*
+ * The PRE_COPY_P2P arc will terminate a data transfer session.
+ *
+ * RUNNING -> PRE_COPY
+ * RUNNING_P2P -> PRE_COPY_P2P
* STOP -> STOP_COPY
- * This arc begin the process of saving the device state and will return a
- * new data_fd.
+ * PRE_COPY, PRE_COPY_P2P and STOP_COPY form the "saving group" of states
+ * which share a data transfer session. Moving between these states alters
+ * what is streamed in the session, but does not terminate or otherwise affect
+ * the associated fd.
+ *
+ * These arcs begin the process of saving the device state and will return a
+ * new data_fd. The migration driver may perform actions such as enabling
+ * dirty logging of device state when entering PRE_COPY or PRE_COPY_P2P.
+ *
+ * Each arc does not change the device operation, the device remains
+ * RUNNING, P2P quiesced or in STOP. The STOP_COPY state is described below
+ * in PRE_COPY_P2P -> STOP_COPY.
*
+ * PRE_COPY -> PRE_COPY_P2P
+ * Entering PRE_COPY_P2P continues all the behaviors of PRE_COPY above.
+ * However, while in the PRE_COPY_P2P state, the device is partially running
+ * in the P2P quiescent state defined below, like RUNNING_P2P.
+ *
+ * PRE_COPY_P2P -> PRE_COPY
+ * This arc allows returning the device to a full RUNNING behavior while
+ * continuing all the behaviors of PRE_COPY.
+ *
+ * PRE_COPY_P2P -> STOP_COPY
* While in the STOP_COPY state the device has the same behavior as STOP
* with the addition that the data transfers session continues to stream the
* migration state. End of stream on the FD indicates the entire device
@@ -939,6 +980,13 @@ struct vfio_device_feature_mig_state {
* device state for this arc if required to prepare the device to receive the
* migration data.
*
+ * STOP_COPY -> PRE_COPY
+ * STOP_COPY -> PRE_COPY_P2P
+ * These arcs are not permitted and return an error if requested. Future
+ * revisions of this API may define behaviors for these arcs, in this case
+ * support will be discoverable by a new flag in
+ * VFIO_DEVICE_FEATURE_MIGRATION.
+ *
* any -> ERROR
* ERROR cannot be specified as a device state, however any transition request
* can be failed with an errno return and may then move the device_state into
@@ -950,7 +998,7 @@ struct vfio_device_feature_mig_state {
* The optional peer to peer (P2P) quiescent state is intended to be a quiescent
* state for the device for the purposes of managing multiple devices within a
* user context where peer-to-peer DMA between devices may be active. The
- * RUNNING_P2P states must prevent the device from initiating
+ * RUNNING_P2P and PRE_COPY_P2P states must prevent the device from initiating
* any new P2P DMA transactions. If the device can identify P2P transactions
* then it can stop only P2P DMA, otherwise it must stop all DMA. The migration
* driver must complete any such outstanding operations prior to completing the
@@ -963,6 +1011,8 @@ struct vfio_device_feature_mig_state {
* above FSM arcs. As there are multiple paths through the FSM arcs the path
* should be selected based on the following rules:
* - Select the shortest path.
+ * - The path cannot have saving group states as interior arcs, only
+ * starting/end states.
* Refer to vfio_mig_get_next_state() for the result of the algorithm.
*
* The automatic transit through the FSM arcs that make up the combination
@@ -976,6 +1026,9 @@ struct vfio_device_feature_mig_state {
* support them. The user can discover if these states are supported by using
* VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can
* avoid knowing about these optional states if the kernel driver supports them.
+ *
+ * Arcs touching PRE_COPY and PRE_COPY_P2P are removed if support for PRE_COPY
+ * is not present.
*/
enum vfio_device_mig_state {
VFIO_DEVICE_STATE_ERROR = 0,
@@ -984,8 +1037,70 @@ enum vfio_device_mig_state {
VFIO_DEVICE_STATE_STOP_COPY = 3,
VFIO_DEVICE_STATE_RESUMING = 4,
VFIO_DEVICE_STATE_RUNNING_P2P = 5,
+ VFIO_DEVICE_STATE_PRE_COPY = 6,
+ VFIO_DEVICE_STATE_PRE_COPY_P2P = 7,
+};
+
+/**
+ * VFIO_MIG_GET_PRECOPY_INFO - _IO(VFIO_TYPE, VFIO_BASE + 21)
+ *
+ * This ioctl is used on the migration data FD in the precopy phase of the
+ * migration data transfer. It returns an estimate of the current data sizes
+ * remaining to be transferred. It allows the user to judge when it is
+ * appropriate to leave PRE_COPY for STOP_COPY.
+ *
+ * This ioctl is valid only in the PRE_COPY states; the kernel driver should
+ * return -EINVAL in any other migration state.
+ *
+ * The vfio_precopy_info data structure returned by this ioctl provides
+ * estimates of data available from the device during the PRE_COPY states.
+ * This estimate is split into two categories, initial_bytes and
+ * dirty_bytes.
+ *
+ * The initial_bytes field indicates the amount of initial precopy
+ * data available from the device. This field should have a non-zero initial
+ * value and decrease as migration data is read from the device.
+ * It is recommended to leave PRE_COPY for STOP_COPY only after this field
+ * reaches zero. Leaving PRE_COPY earlier might make things slower.
+ *
+ * The dirty_bytes field tracks device state changes relative to data
+ * previously retrieved. This field starts at zero and may increase as
+ * the internal device state is modified or decrease as that modified
+ * state is read from the device.
+ *
+ * Userspace may use the combination of these fields to estimate the
+ * potential data size available during the PRE_COPY phases, as well as
+ * trends relative to the rate the device is dirtying its internal
+ * state, but these fields are not required to have any bearing relative
+ * to the data size available during the STOP_COPY phase.
+ *
+ * Drivers have a lot of flexibility in when and what they transfer during the
+ * PRE_COPY phase, and how they report this from VFIO_MIG_GET_PRECOPY_INFO.
+ *
+ * During pre-copy the migration data FD has a temporary "end of stream" that is
+ * reached when both initial_bytes and dirty_bytes are zero. For instance, this
+ * may indicate that the device is idle and not currently dirtying any internal
+ * state. When read() is done on this temporary end of stream the kernel driver
+ * should return ENOMSG from read(). Userspace can wait for more data (which may
+ * never come) by using poll.
+ *
+ * Once in STOP_COPY the migration data FD has a permanent end of stream
+ * signaled in the usual way by read() always returning 0 and poll always
+ * returning readable. ENOMSG may not be returned in STOP_COPY.
+ * Support for this ioctl is mandatory if a driver claims to support
+ * VFIO_MIGRATION_PRE_COPY.
+ *
+ * Return: 0 on success, -1 and errno set on failure.
+ */
+struct vfio_precopy_info {
+ __u32 argsz;
+ __u32 flags;
+ __aligned_u64 initial_bytes; /* estimate of initial precopy data still available from the device */
+ __aligned_u64 dirty_bytes; /* estimate of device state changed relative to data previously retrieved */
};
+#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21)
+
/*
* Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power
* state with the platform-based power management. Device use of lower power
@@ -1128,6 +1243,19 @@ struct vfio_device_feature_dma_logging_report {
#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8
+/*
+ * Upon VFIO_DEVICE_FEATURE_GET read back the estimated data length that will
+ * be required to complete stop copy.
+ *
+ * Note: Can be called in any device state.
+ */
+
+struct vfio_device_feature_mig_data_size {
+ __aligned_u64 stop_copy_length; /* estimated data length required to complete stop copy */
+};
+
+#define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index 117a8d799f71..e54eb752e1ba 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -594,7 +594,6 @@ static void mbochs_release_dev(struct vfio_device *vdev)
atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes);
kfree(mdev_state->pages);
kfree(mdev_state->vconfig);
- vfio_free_device(vdev);
}
static void mbochs_remove(struct mdev_device *mdev)
@@ -1431,7 +1430,7 @@ static int __init mbochs_dev_init(void)
ret = device_register(&mbochs_dev);
if (ret)
- goto err_class;
+ goto err_put;
ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver,
mbochs_mdev_types,
@@ -1442,8 +1441,9 @@ static int __init mbochs_dev_init(void)
return 0;
err_device:
- device_unregister(&mbochs_dev);
-err_class:
+ device_del(&mbochs_dev);
+err_put:
+ put_device(&mbochs_dev);
class_destroy(mbochs_class);
err_driver:
mdev_unregister_driver(&mbochs_driver);
diff --git a/samples/vfio-mdev/mdpy-fb.c b/samples/vfio-mdev/mdpy-fb.c
index 9ec93d90e8a5..4eb7aa11cfbb 100644
--- a/samples/vfio-mdev/mdpy-fb.c
+++ b/samples/vfio-mdev/mdpy-fb.c
@@ -109,7 +109,7 @@ static int mdpy_fb_probe(struct pci_dev *pdev,
ret = pci_request_regions(pdev, "mdpy-fb");
if (ret < 0)
- return ret;
+ goto err_disable_dev;
pci_read_config_dword(pdev, MDPY_FORMAT_OFFSET, &format);
pci_read_config_dword(pdev, MDPY_WIDTH_OFFSET, &width);
@@ -191,6 +191,9 @@ err_release_fb:
err_release_regions:
pci_release_regions(pdev);
+err_disable_dev:
+ pci_disable_device(pdev);
+
return ret;
}
@@ -199,7 +202,10 @@ static void mdpy_fb_remove(struct pci_dev *pdev)
struct fb_info *info = pci_get_drvdata(pdev);
unregister_framebuffer(info);
+ iounmap(info->screen_base);
framebuffer_release(info);
+ pci_release_regions(pdev);
+ pci_disable_device(pdev);
}
static struct pci_device_id mdpy_fb_pci_table[] = {
diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c
index 946e8cfde6fd..e8400fdab71d 100644
--- a/samples/vfio-mdev/mdpy.c
+++ b/samples/vfio-mdev/mdpy.c
@@ -283,7 +283,6 @@ static void mdpy_release_dev(struct vfio_device *vdev)
vfree(mdev_state->memblk);
kfree(mdev_state->vconfig);
- vfio_free_device(vdev);
}
static void mdpy_remove(struct mdev_device *mdev)
@@ -718,7 +717,7 @@ static int __init mdpy_dev_init(void)
ret = device_register(&mdpy_dev);
if (ret)
- goto err_class;
+ goto err_put;
ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver,
mdpy_mdev_types,
@@ -729,8 +728,9 @@ static int __init mdpy_dev_init(void)
return 0;
err_device:
- device_unregister(&mdpy_dev);
-err_class:
+ device_del(&mdpy_dev);
+err_put:
+ put_device(&mdpy_dev);
class_destroy(mdpy_class);
err_driver:
mdev_unregister_driver(&mdpy_driver);
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index e72085fc1376..e887de672c52 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -784,7 +784,6 @@ static void mtty_release_dev(struct vfio_device *vdev)
atomic_add(mdev_state->nr_ports, &mdev_avail_ports);
kfree(mdev_state->vconfig);
- vfio_free_device(vdev);
}
static void mtty_remove(struct mdev_device *mdev)
@@ -1331,7 +1330,7 @@ static int __init mtty_dev_init(void)
ret = device_register(&mtty_dev.dev);
if (ret)
- goto err_class;
+ goto err_put;
ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev,
&mtty_driver, mtty_mdev_types,
@@ -1341,8 +1340,9 @@ static int __init mtty_dev_init(void)
return 0;
err_device:
- device_unregister(&mtty_dev.dev);
-err_class:
+ device_del(&mtty_dev.dev);
+err_put:
+ put_device(&mtty_dev.dev);
class_destroy(mtty_dev.vd_class);
err_driver:
mdev_unregister_driver(&mtty_driver);