summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-27 21:57:03 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-27 21:57:03 +0100
commit4aca98a8a150f3531fea782c3040ada0ad1ce3b6 (patch)
tree9baefca94687e6434160dcbdc5389387274ddb49 /drivers
parentMerge tag 'riscv-for-linus-6.13-mw1' of git://git.kernel.org/pub/scm/linux/ke... (diff)
parentvfio/pci: Properly hide first-in-list PCIe extended capability (diff)
downloadlinux-4aca98a8a150f3531fea782c3040ada0ad1ce3b6.tar.xz
linux-4aca98a8a150f3531fea782c3040ada0ad1ce3b6.zip
Merge tag 'vfio-v6.13-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson: - Constify an unmodified structure used in linking vfio and kvm (Christophe JAILLET) - Add ID for an additional hardware SKU supported by the nvgrace-gpu vfio-pci variant driver (Ankit Agrawal) - Fix incorrect signed cast in QAT vfio-pci variant driver, negating test in check_add_overflow(), though still caught by later tests (Giovanni Cabiddu) - Additional debugfs attributes exposed in hisi_acc vfio-pci variant driver for migration debugging (Longfang Liu) - Migration support is added to the virtio vfio-pci variant driver, becoming the primary feature of the driver while retaining emulation of virtio legacy support as a secondary option (Yishai Hadas) - Fixes to a few unwind flows in the mlx5 vfio-pci driver discovered through reviews of the virtio variant driver (Yishai Hadas) - Fix an unlikely issue where a PCI device exposed to userspace with an unknown capability at the base of the extended capability chain can overflow an array index (Avihai Horon) * tag 'vfio-v6.13-rc1' of https://github.com/awilliam/linux-vfio: vfio/pci: Properly hide first-in-list PCIe extended capability vfio/mlx5: Fix unwind flows in mlx5vf_pci_save/resume_device_data() vfio/mlx5: Fix an unwind issue in mlx5vf_add_migration_pages() vfio/virtio: Enable live migration once VIRTIO_PCI was configured vfio/virtio: Add PRE_COPY support for live migration vfio/virtio: Add support for the basic live migration functionality virtio-pci: Introduce APIs to execute device parts admin commands virtio: Manage device and driver capabilities via the admin commands virtio: Extend the admin command to include the result size virtio_pci: Introduce device parts access commands Documentation: add debugfs description for hisi migration hisi_acc_vfio_pci: register debugfs for hisilicon migration driver hisi_acc_vfio_pci: create subfunction for data reading hisi_acc_vfio_pci: extract public functions for container_of vfio/qat: fix overflow check in qat_vf_resume_write() vfio/nvgrace-gpu: Add a new GH200 SKU to the devid table kvm/vfio: Constify struct kvm_device_ops
Diffstat (limited to 'drivers')
-rw-r--r--drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c266
-rw-r--r--drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h19
-rw-r--r--drivers/vfio/pci/mlx5/cmd.c6
-rw-r--r--drivers/vfio/pci/mlx5/main.c35
-rw-r--r--drivers/vfio/pci/nvgrace-gpu/main.c2
-rw-r--r--drivers/vfio/pci/qat/main.c2
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c16
-rw-r--r--drivers/vfio/pci/virtio/Kconfig42
-rw-r--r--drivers/vfio/pci/virtio/Makefile3
-rw-r--r--drivers/vfio/pci/virtio/common.h127
-rw-r--r--drivers/vfio/pci/virtio/legacy_io.c418
-rw-r--r--drivers/vfio/pci/virtio/main.c476
-rw-r--r--drivers/vfio/pci/virtio/migrate.c1337
-rw-r--r--drivers/virtio/virtio_pci_common.h19
-rw-r--r--drivers/virtio/virtio_pci_modern.c457
15 files changed, 2751 insertions, 474 deletions
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 0d632ba5d2a3..451c639299eb 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -486,31 +486,11 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return 0;
}
-static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
- struct hisi_acc_vf_migration_file *migf)
+static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data)
{
- struct acc_vf_data *vf_data = &migf->vf_data;
- struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
struct device *dev = &vf_qm->pdev->dev;
int ret;
- if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
- /* Update state and return with match data */
- vf_data->vf_qm_state = QM_NOT_READY;
- hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
- migf->total_length = QM_MATCH_SIZE;
- return 0;
- }
-
- vf_data->vf_qm_state = QM_READY;
- hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
-
- ret = vf_qm_cache_wb(vf_qm);
- if (ret) {
- dev_err(dev, "failed to writeback QM Cache!\n");
- return ret;
- }
-
ret = qm_get_regs(vf_qm, vf_data);
if (ret)
return -EINVAL;
@@ -536,6 +516,38 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return -EINVAL;
}
+ return 0;
+}
+
+static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *migf)
+{
+ struct acc_vf_data *vf_data = &migf->vf_data;
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ struct device *dev = &vf_qm->pdev->dev;
+ int ret;
+
+ if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
+ /* Update state and return with match data */
+ vf_data->vf_qm_state = QM_NOT_READY;
+ hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+ migf->total_length = QM_MATCH_SIZE;
+ return 0;
+ }
+
+ vf_data->vf_qm_state = QM_READY;
+ hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
+ ret = vf_qm_cache_wb(vf_qm);
+ if (ret) {
+ dev_err(dev, "failed to writeback QM Cache!\n");
+ return ret;
+ }
+
+ ret = vf_qm_read_data(vf_qm, vf_data);
+ if (ret)
+ return -EINVAL;
+
migf->total_length = sizeof(struct acc_vf_data);
return 0;
}
@@ -615,21 +627,43 @@ static void hisi_acc_vf_disable_fd(struct hisi_acc_vf_migration_file *migf)
mutex_unlock(&migf->lock);
}
+static void
+hisi_acc_debug_migf_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+ struct hisi_acc_vf_migration_file *src_migf)
+{
+ struct hisi_acc_vf_migration_file *dst_migf = hisi_acc_vdev->debug_migf;
+
+ if (!dst_migf)
+ return;
+
+ dst_migf->total_length = src_migf->total_length;
+ memcpy(&dst_migf->vf_data, &src_migf->vf_data,
+ sizeof(struct acc_vf_data));
+}
+
static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
if (hisi_acc_vdev->resuming_migf) {
+ hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->resuming_migf);
hisi_acc_vf_disable_fd(hisi_acc_vdev->resuming_migf);
fput(hisi_acc_vdev->resuming_migf->filp);
hisi_acc_vdev->resuming_migf = NULL;
}
if (hisi_acc_vdev->saving_migf) {
+ hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->saving_migf);
hisi_acc_vf_disable_fd(hisi_acc_vdev->saving_migf);
fput(hisi_acc_vdev->saving_migf->filp);
hisi_acc_vdev->saving_migf = NULL;
}
}
+static struct hisi_acc_vf_core_device *hisi_acc_get_vf_dev(struct vfio_device *vdev)
+{
+ return container_of(vdev, struct hisi_acc_vf_core_device,
+ core_device.vdev);
+}
+
static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev)
{
hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
@@ -1031,8 +1065,7 @@ static struct file *
hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state new_state)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
enum vfio_device_mig_state next_state;
struct file *res = NULL;
int ret;
@@ -1073,8 +1106,7 @@ static int
hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev,
enum vfio_device_mig_state *curr_state)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
mutex_lock(&hisi_acc_vdev->state_mutex);
*curr_state = hisi_acc_vdev->mig_state;
@@ -1276,10 +1308,132 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int
return vfio_pci_core_ioctl(core_vdev, cmd, arg);
}
+static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev)
+{
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ int ret;
+
+ lockdep_assert_held(&hisi_acc_vdev->open_mutex);
+ /*
+ * When the device is not opened, the io_base is not mapped.
+ * The driver cannot perform device read and write operations.
+ */
+ if (!hisi_acc_vdev->dev_opened) {
+ seq_puts(seq, "device not opened!\n");
+ return -EINVAL;
+ }
+
+ ret = qm_wait_dev_not_ready(vf_qm);
+ if (ret) {
+ seq_puts(seq, "VF device not ready!\n");
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static int hisi_acc_vf_debug_cmd(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ u64 value;
+ int ret;
+
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ ret = hisi_acc_vf_debug_check(seq, vdev);
+ if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ return ret;
+ }
+
+ value = readl(vf_qm->io_base + QM_MB_CMD_SEND_BASE);
+ if (value == QM_MB_CMD_NOT_READY) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ seq_puts(seq, "mailbox cmd channel not ready!\n");
+ return -EINVAL;
+ }
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ seq_puts(seq, "mailbox cmd channel ready!\n");
+
+ return 0;
+}
+
+static int hisi_acc_vf_dev_read(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ size_t vf_data_sz = offsetofend(struct acc_vf_data, padding);
+ struct acc_vf_data *vf_data;
+ int ret;
+
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ ret = hisi_acc_vf_debug_check(seq, vdev);
+ if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+ return ret;
+ }
+
+ mutex_lock(&hisi_acc_vdev->state_mutex);
+ vf_data = kzalloc(sizeof(*vf_data), GFP_KERNEL);
+ if (!vf_data) {
+ ret = -ENOMEM;
+ goto mutex_release;
+ }
+
+ vf_data->vf_qm_state = hisi_acc_vdev->vf_qm_state;
+ ret = vf_qm_read_data(&hisi_acc_vdev->vf_qm, vf_data);
+ if (ret)
+ goto migf_err;
+
+ seq_hex_dump(seq, "Dev Data:", DUMP_PREFIX_OFFSET, 16, 1,
+ (const void *)vf_data, vf_data_sz, false);
+
+ seq_printf(seq,
+ "guest driver load: %u\n"
+ "data size: %lu\n",
+ hisi_acc_vdev->vf_qm_state,
+ sizeof(struct acc_vf_data));
+
+migf_err:
+ kfree(vf_data);
+mutex_release:
+ mutex_unlock(&hisi_acc_vdev->state_mutex);
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
+
+ return ret;
+}
+
+static int hisi_acc_vf_migf_read(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev);
+ struct vfio_device *vdev = &core_device->vdev;
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev);
+ size_t vf_data_sz = offsetofend(struct acc_vf_data, padding);
+ struct hisi_acc_vf_migration_file *debug_migf = hisi_acc_vdev->debug_migf;
+
+ /* Check whether the live migration operation has been performed */
+ if (debug_migf->total_length < QM_MATCH_SIZE) {
+ seq_puts(seq, "device not migrated!\n");
+ return -EAGAIN;
+ }
+
+ seq_hex_dump(seq, "Mig Data:", DUMP_PREFIX_OFFSET, 16, 1,
+ (const void *)&debug_migf->vf_data, vf_data_sz, false);
+ seq_printf(seq, "migrate data length: %lu\n", debug_migf->total_length);
+
+ return 0;
+}
+
static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device;
int ret;
@@ -1288,12 +1442,16 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
return ret;
if (core_vdev->mig_ops) {
+ mutex_lock(&hisi_acc_vdev->open_mutex);
ret = hisi_acc_vf_qm_init(hisi_acc_vdev);
if (ret) {
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_disable(vdev);
return ret;
}
hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ hisi_acc_vdev->dev_opened = true;
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
}
vfio_pci_core_finish_enable(vdev);
@@ -1302,11 +1460,13 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev)
static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ mutex_lock(&hisi_acc_vdev->open_mutex);
+ hisi_acc_vdev->dev_opened = false;
iounmap(vf_qm->io_base);
+ mutex_unlock(&hisi_acc_vdev->open_mutex);
vfio_pci_core_close_device(core_vdev);
}
@@ -1318,8 +1478,7 @@ static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = {
static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
{
- struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev,
- struct hisi_acc_vf_core_device, core_device.vdev);
+ struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
struct hisi_qm *pf_qm = hisi_acc_get_pf_qm(pdev);
@@ -1327,6 +1486,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
hisi_acc_vdev->pf_qm = pf_qm;
hisi_acc_vdev->vf_dev = pdev;
mutex_init(&hisi_acc_vdev->state_mutex);
+ mutex_init(&hisi_acc_vdev->open_mutex);
core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY;
core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops;
@@ -1372,6 +1532,47 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
+static void hisi_acc_vfio_debug_init(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ struct vfio_device *vdev = &hisi_acc_vdev->core_device.vdev;
+ struct hisi_acc_vf_migration_file *migf;
+ struct dentry *vfio_dev_migration;
+ struct dentry *vfio_hisi_acc;
+ struct device *dev = vdev->dev;
+
+ if (!debugfs_initialized() ||
+ !IS_ENABLED(CONFIG_VFIO_DEBUGFS))
+ return;
+
+ if (vdev->ops != &hisi_acc_vfio_pci_migrn_ops)
+ return;
+
+ vfio_dev_migration = debugfs_lookup("migration", vdev->debug_root);
+ if (!vfio_dev_migration) {
+ dev_err(dev, "failed to lookup migration debugfs file!\n");
+ return;
+ }
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL);
+ if (!migf)
+ return;
+ hisi_acc_vdev->debug_migf = migf;
+
+ vfio_hisi_acc = debugfs_create_dir("hisi_acc", vfio_dev_migration);
+ debugfs_create_devm_seqfile(dev, "dev_data", vfio_hisi_acc,
+ hisi_acc_vf_dev_read);
+ debugfs_create_devm_seqfile(dev, "migf_data", vfio_hisi_acc,
+ hisi_acc_vf_migf_read);
+ debugfs_create_devm_seqfile(dev, "cmd_state", vfio_hisi_acc,
+ hisi_acc_vf_debug_cmd);
+}
+
+static void hisi_acc_vf_debugfs_exit(struct hisi_acc_vf_core_device *hisi_acc_vdev)
+{
+ kfree(hisi_acc_vdev->debug_migf);
+ hisi_acc_vdev->debug_migf = NULL;
+}
+
static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct hisi_acc_vf_core_device *hisi_acc_vdev;
@@ -1398,6 +1599,8 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device
ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device);
if (ret)
goto out_put_vdev;
+
+ hisi_acc_vfio_debug_init(hisi_acc_vdev);
return 0;
out_put_vdev:
@@ -1410,6 +1613,7 @@ static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev)
struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev);
vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device);
+ hisi_acc_vf_debugfs_exit(hisi_acc_vdev);
vfio_put_device(&hisi_acc_vdev->core_device.vdev);
}
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index 5bab46602fad..245d7537b2bc 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -32,6 +32,7 @@
#define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0)
#define QM_SQC_VFT_NUM_SHIFT_V2 45
#define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0)
+#define QM_MB_CMD_NOT_READY 0xffffffff
/* RW regs */
#define QM_REGS_MAX_LEN 7
@@ -99,6 +100,13 @@ struct hisi_acc_vf_migration_file {
struct hisi_acc_vf_core_device {
struct vfio_pci_core_device core_device;
u8 match_done;
+ /*
+ * io_base is only valid when dev_opened is true,
+ * which is protected by open_mutex.
+ */
+ bool dev_opened;
+ /* Ensure the accuracy of dev_opened operation */
+ struct mutex open_mutex;
/* For migration state */
struct mutex state_mutex;
@@ -107,9 +115,20 @@ struct hisi_acc_vf_core_device {
struct pci_dev *vf_dev;
struct hisi_qm *pf_qm;
struct hisi_qm vf_qm;
+ /*
+ * vf_qm_state represents the QM_VF_STATE register value.
+ * It is set by Guest driver for the ACC VF dev indicating
+ * the driver has loaded and configured the dev correctly.
+ */
u32 vf_qm_state;
int vf_id;
struct hisi_acc_vf_migration_file *resuming_migf;
struct hisi_acc_vf_migration_file *saving_migf;
+
+ /*
+ * It holds migration data corresponding to the last migration
+ * and is used by the debugfs interface to report it.
+ */
+ struct hisi_acc_vf_migration_file *debug_migf;
};
#endif /* HISI_ACC_VFIO_PCI_H */
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 41a4b0cf4297..7527e277c898 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -423,6 +423,7 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
unsigned long filled;
unsigned int to_fill;
int ret;
+ int i;
to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
@@ -443,7 +444,7 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
GFP_KERNEL_ACCOUNT);
if (ret)
- goto err;
+ goto err_append;
buf->allocated_length += filled * PAGE_SIZE;
/* clean input for another bulk allocation */
memset(page_list, 0, filled * sizeof(*page_list));
@@ -454,6 +455,9 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
kvfree(page_list);
return 0;
+err_append:
+ for (i = filled - 1; i >= 0; i--)
+ __free_page(page_list[i]);
err:
kvfree(page_list);
return ret;
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 242c23eef452..8833e60d42f5 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -640,14 +640,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
O_RDONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
- goto end;
+ kfree(migf);
+ return ERR_PTR(ret);
}
migf->mvdev = mvdev;
- ret = mlx5vf_cmd_alloc_pd(migf);
- if (ret)
- goto out_free;
-
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
init_waitqueue_head(&migf->poll_wait);
@@ -663,6 +660,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
+
+ ret = mlx5vf_cmd_alloc_pd(migf);
+ if (ret)
+ goto out;
+
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
if (ret)
goto out_pd;
@@ -692,10 +694,8 @@ out_save:
mlx5vf_free_data_buffer(buf);
out_pd:
mlx5fv_cmd_clean_migf_resources(migf);
-out_free:
+out:
fput(migf->filp);
-end:
- kfree(migf);
return ERR_PTR(ret);
}
@@ -1016,13 +1016,19 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
O_WRONLY);
if (IS_ERR(migf->filp)) {
ret = PTR_ERR(migf->filp);
- goto end;
+ kfree(migf);
+ return ERR_PTR(ret);
}
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
migf->mvdev = mvdev;
ret = mlx5vf_cmd_alloc_pd(migf);
if (ret)
- goto out_free;
+ goto out;
buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
if (IS_ERR(buf)) {
@@ -1041,20 +1047,13 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
migf->buf_header[0] = buf;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
- stream_open(migf->filp->f_inode, migf->filp);
- mutex_init(&migf->lock);
- INIT_LIST_HEAD(&migf->buf_list);
- INIT_LIST_HEAD(&migf->avail_list);
- spin_lock_init(&migf->list_lock);
return migf;
out_buf:
mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
mlx5vf_cmd_dealloc_pd(migf);
-out_free:
+out:
fput(migf->filp);
-end:
- kfree(migf);
return ERR_PTR(ret);
}
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index a7fd018aa548..a467085038f0 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -866,6 +866,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
/* GH200 480GB */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
+ /* GH200 SKU */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
{}
};
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
index be3644ced17b..c78cb6de9390 100644
--- a/drivers/vfio/pci/qat/main.c
+++ b/drivers/vfio/pci/qat/main.c
@@ -304,7 +304,7 @@ static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf,
offs = &filp->f_pos;
if (*offs < 0 ||
- check_add_overflow((loff_t)len, *offs, &end))
+ check_add_overflow(len, *offs, &end))
return -EOVERFLOW;
if (end > mig_dev->state_size)
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 97422aafaa7b..ea2745c1ac5e 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -313,6 +313,10 @@ static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos,
return count;
}
+static struct perm_bits direct_ro_perms = {
+ .readfn = vfio_direct_config_read,
+};
+
/* Default capability regions to read-only, no-virtualization */
static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
@@ -1897,9 +1901,17 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user
cap_start = *ppos;
} else {
if (*ppos >= PCI_CFG_SPACE_SIZE) {
- WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
+ /*
+ * We can get a cap_id that exceeds PCI_EXT_CAP_ID_MAX
+ * if we're hiding an unknown capability at the start
+ * of the extended capability list. Use default, ro
+ * access, which will virtualize the id and next values.
+ */
+ if (cap_id > PCI_EXT_CAP_ID_MAX)
+ perm = &direct_ro_perms;
+ else
+ perm = &ecap_perms[cap_id];
- perm = &ecap_perms[cap_id];
cap_start = vfio_find_cap_start(vdev, *ppos);
} else {
WARN_ON(cap_id > PCI_CAP_ID_MAX);
diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig
index bd80eca4a196..2770f7eb702c 100644
--- a/drivers/vfio/pci/virtio/Kconfig
+++ b/drivers/vfio/pci/virtio/Kconfig
@@ -1,15 +1,31 @@
# SPDX-License-Identifier: GPL-2.0-only
config VIRTIO_VFIO_PCI
- tristate "VFIO support for VIRTIO NET PCI devices"
- depends on VIRTIO_PCI && VIRTIO_PCI_ADMIN_LEGACY
- select VFIO_PCI_CORE
- help
- This provides support for exposing VIRTIO NET VF devices which support
- legacy IO access, using the VFIO framework that can work with a legacy
- virtio driver in the guest.
- Based on PCIe spec, VFs do not support I/O Space.
- As of that this driver emulates I/O BAR in software to let a VF be
- seen as a transitional device by its users and let it work with
- a legacy driver.
-
- If you don't know what to do here, say N.
+ tristate "VFIO support for VIRTIO NET PCI VF devices"
+ depends on VIRTIO_PCI
+ select VFIO_PCI_CORE
+ help
+ This provides migration support for VIRTIO NET PCI VF devices
+ using the VFIO framework. Migration support requires the
+ SR-IOV PF device to support specific VIRTIO extensions,
+ otherwise this driver provides no additional functionality
+ beyond vfio-pci.
+
+ Migration support in this driver relies on dirty page tracking
+ provided by the IOMMU hardware and exposed through IOMMUFD, any
+ other use cases are dis-recommended.
+
+ If you don't know what to do here, say N.
+
+config VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ bool "Legacy I/O support for VIRTIO NET PCI VF devices"
+ depends on VIRTIO_VFIO_PCI && VIRTIO_PCI_ADMIN_LEGACY
+ default y
+ help
+ This extends the virtio-vfio-pci driver to support legacy I/O
+ access, allowing use of legacy virtio drivers with VIRTIO NET
+ PCI VF devices. Legacy I/O support requires the SR-IOV PF
+ device to support and enable specific VIRTIO extensions,
+ otherwise this driver provides no additional functionality
+ beyond vfio-pci.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/virtio/Makefile b/drivers/vfio/pci/virtio/Makefile
index 7171105baf33..d9b0bb40d6b3 100644
--- a/drivers/vfio/pci/virtio/Makefile
+++ b/drivers/vfio/pci/virtio/Makefile
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio-vfio-pci.o
-virtio-vfio-pci-y := main.o
+virtio-vfio-pci-y := main.o migrate.o
+virtio-vfio-pci-$(CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY) += legacy_io.o
diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h
new file mode 100644
index 000000000000..c7d7e27af386
--- /dev/null
+++ b/drivers/vfio/pci/virtio/common.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef VIRTIO_VFIO_COMMON_H
+#define VIRTIO_VFIO_COMMON_H
+
+#include <linux/kernel.h>
+#include <linux/virtio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+
+enum virtiovf_migf_state {
+ VIRTIOVF_MIGF_STATE_ERROR = 1,
+ VIRTIOVF_MIGF_STATE_PRECOPY = 2,
+ VIRTIOVF_MIGF_STATE_COMPLETE = 3,
+};
+
+enum virtiovf_load_state {
+ VIRTIOVF_LOAD_STATE_READ_HEADER,
+ VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA,
+ VIRTIOVF_LOAD_STATE_READ_HEADER_DATA,
+ VIRTIOVF_LOAD_STATE_PREP_CHUNK,
+ VIRTIOVF_LOAD_STATE_READ_CHUNK,
+ VIRTIOVF_LOAD_STATE_LOAD_CHUNK,
+};
+
+struct virtiovf_data_buffer {
+ struct sg_append_table table;
+ loff_t start_pos;
+ u64 length;
+ u64 allocated_length;
+ struct list_head buf_elm;
+ u8 include_header_object:1;
+ struct virtiovf_migration_file *migf;
+ /* Optimize virtiovf_get_migration_page() for sequential access */
+ struct scatterlist *last_offset_sg;
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
+enum virtiovf_migf_header_flags {
+ VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0,
+ VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0,
+};
+
+enum virtiovf_migf_header_tag {
+ VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA = 0,
+};
+
+struct virtiovf_migration_header {
+ __le64 record_size;
+ /* For future use in case we may need to change the kernel protocol */
+ __le32 flags; /* Use virtiovf_migf_header_flags */
+ __le32 tag; /* Use virtiovf_migf_header_tag */
+ __u8 data[]; /* Its size is given in the record_size */
+};
+
+struct virtiovf_migration_file {
+ struct file *filp;
+ /* synchronize access to the file state */
+ struct mutex lock;
+ loff_t max_pos;
+ u64 pre_copy_initial_bytes;
+ struct ratelimit_state pre_copy_rl_state;
+ u64 record_size;
+ u32 record_tag;
+ u8 has_obj_id:1;
+ u32 obj_id;
+ enum virtiovf_migf_state state;
+ enum virtiovf_load_state load_state;
+ /* synchronize access to the lists */
+ spinlock_t list_lock;
+ struct list_head buf_list;
+ struct list_head avail_list;
+ struct virtiovf_data_buffer *buf;
+ struct virtiovf_data_buffer *buf_header;
+ struct virtiovf_pci_core_device *virtvdev;
+};
+
+struct virtiovf_pci_core_device {
+ struct vfio_pci_core_device core_device;
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ u8 *bar0_virtual_buf;
+ /* synchronize access to the virtual buf */
+ struct mutex bar_mutex;
+ void __iomem *notify_addr;
+ u64 notify_offset;
+ __le32 pci_base_addr_0;
+ __le16 pci_cmd;
+ u8 bar0_virtual_buf_size;
+ u8 notify_bar;
+#endif
+
+ /* LM related */
+ u8 migrate_cap:1;
+ u8 deferred_reset:1;
+ /* protect migration state */
+ struct mutex state_mutex;
+ enum vfio_device_mig_state mig_state;
+ /* protect the reset_done flow */
+ spinlock_t reset_lock;
+ struct virtiovf_migration_file *resuming_migf;
+ struct virtiovf_migration_file *saving_migf;
+};
+
+void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_migration_reset_done(struct pci_dev *pdev);
+
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg);
+int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg);
+ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count,
+ loff_t *ppos);
+ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
+ size_t count, loff_t *ppos);
+bool virtiovf_support_legacy_io(struct pci_dev *pdev);
+int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev);
+void virtiovf_legacy_io_reset_done(struct pci_dev *pdev);
+#endif
+
+#endif /* VIRTIO_VFIO_COMMON_H */
diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c
new file mode 100644
index 000000000000..20382ee15fac
--- /dev/null
+++ b/drivers/vfio/pci/virtio/legacy_io.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_pci_admin.h>
+
+#include "common.h"
+
+static int
+virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev,
+ loff_t pos, char __user *buf,
+ size_t count, bool read)
+{
+ bool msix_enabled =
+ (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX);
+ struct pci_dev *pdev = virtvdev->core_device.pdev;
+ u8 *bar0_buf = virtvdev->bar0_virtual_buf;
+ bool common;
+ u8 offset;
+ int ret;
+
+ common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled);
+ /* offset within the relevant configuration area */
+ offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled);
+ mutex_lock(&virtvdev->bar_mutex);
+ if (read) {
+ if (common)
+ ret = virtio_pci_admin_legacy_common_io_read(pdev, offset,
+ count, bar0_buf + pos);
+ else
+ ret = virtio_pci_admin_legacy_device_io_read(pdev, offset,
+ count, bar0_buf + pos);
+ if (ret)
+ goto out;
+ if (copy_to_user(buf, bar0_buf + pos, count))
+ ret = -EFAULT;
+ } else {
+ if (copy_from_user(bar0_buf + pos, buf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (common)
+ ret = virtio_pci_admin_legacy_common_io_write(pdev, offset,
+ count, bar0_buf + pos);
+ else
+ ret = virtio_pci_admin_legacy_device_io_write(pdev, offset,
+ count, bar0_buf + pos);
+ }
+out:
+ mutex_unlock(&virtvdev->bar_mutex);
+ return ret;
+}
+
+static int
+virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev,
+ loff_t pos, char __user *buf,
+ size_t count, bool read)
+{
+ struct vfio_pci_core_device *core_device = &virtvdev->core_device;
+ struct pci_dev *pdev = core_device->pdev;
+ u16 queue_notify;
+ int ret;
+
+ if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO))
+ return -EIO;
+
+ if (pos + count > virtvdev->bar0_virtual_buf_size)
+ return -EINVAL;
+
+ ret = pm_runtime_resume_and_get(&pdev->dev);
+ if (ret) {
+ pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret);
+ return -EIO;
+ }
+
+ switch (pos) {
+ case VIRTIO_PCI_QUEUE_NOTIFY:
+ if (count != sizeof(queue_notify)) {
+ ret = -EINVAL;
+ goto end;
+ }
+ if (read) {
+ ret = vfio_pci_core_ioread16(core_device, true, &queue_notify,
+ virtvdev->notify_addr);
+ if (ret)
+ goto end;
+ if (copy_to_user(buf, &queue_notify,
+ sizeof(queue_notify))) {
+ ret = -EFAULT;
+ goto end;
+ }
+ } else {
+ if (copy_from_user(&queue_notify, buf, count)) {
+ ret = -EFAULT;
+ goto end;
+ }
+ ret = vfio_pci_core_iowrite16(core_device, true, queue_notify,
+ virtvdev->notify_addr);
+ }
+ break;
+ default:
+ ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count,
+ read);
+ }
+
+end:
+ pm_runtime_put(&pdev->dev);
+ return ret ? ret : count;
+}
+
+static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
+ char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+ __le32 val32;
+ __le16 val16;
+ u8 val8;
+ int ret;
+
+ ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
+ if (ret < 0)
+ return ret;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count))
+ return -EFAULT;
+ }
+
+ if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) &&
+ vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ val16 |= cpu_to_le16(PCI_COMMAND_IO);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID,
+ sizeof(val8), &copy_offset,
+ &copy_count, &register_offset)) {
+ /* Transional needs to have revision 0 */
+ val8 = 0;
+ if (copy_to_user(buf + copy_offset, &val8, copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
+ sizeof(val32), &copy_offset,
+ &copy_count, &register_offset)) {
+ u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1);
+ u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0);
+
+ val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO);
+ if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ /*
+ * Transitional devices use the PCI subsystem device id as
+ * virtio device id, same as legacy driver always did.
+ */
+ val16 = cpu_to_le16(VIRTIO_ID_NET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID,
+ sizeof(val16), &copy_offset,
+ &copy_count, &register_offset)) {
+ val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET);
+ if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ return count;
+}
+
+ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count)
+ return 0;
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return virtiovf_pci_read_config(core_vdev, buf, count, ppos);
+
+ if (index == VFIO_PCI_BAR0_REGION_INDEX)
+ return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true);
+
+ return vfio_pci_core_read(core_vdev, buf, count, ppos);
+}
+
+static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev,
+ const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ size_t register_offset;
+ loff_t copy_offset;
+ size_t copy_count;
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
+ sizeof(virtvdev->pci_cmd),
+ &copy_offset, &copy_count,
+ &register_offset)) {
+ if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset,
+ buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
+ sizeof(virtvdev->pci_base_addr_0),
+ &copy_offset, &copy_count,
+ &register_offset)) {
+ if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset,
+ buf + copy_offset,
+ copy_count))
+ return -EFAULT;
+ }
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (!count)
+ return 0;
+
+ if (index == VFIO_PCI_CONFIG_REGION_INDEX)
+ return virtiovf_pci_write_config(core_vdev, buf, count, ppos);
+
+ if (index == VFIO_PCI_BAR0_REGION_INDEX)
+ return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false);
+
+ return vfio_pci_core_write(core_vdev, buf, count, ppos);
+}
+
+int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
+ unsigned int cmd, unsigned long arg)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ unsigned long minsz = offsetofend(struct vfio_region_info, offset);
+ void __user *uarg = (void __user *)arg;
+ struct vfio_region_info info = {};
+
+ if (copy_from_user(&info, uarg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ switch (info.index) {
+ case VFIO_PCI_BAR0_REGION_INDEX:
+ info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+ info.size = virtvdev->bar0_virtual_buf_size;
+ info.flags = VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE;
+ return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
+ default:
+ return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ }
+}
+
+long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case VFIO_DEVICE_GET_REGION_INFO:
+ return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg);
+ default:
+ return vfio_pci_core_ioctl(core_vdev, cmd, arg);
+ }
+}
+
+static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct vfio_pci_core_device *core_device = &virtvdev->core_device;
+ int ret;
+
+ /*
+ * Setup the BAR where the 'notify' exists to be used by vfio as well
+ * This will let us mmap it only once and use it when needed.
+ */
+ ret = vfio_pci_core_setup_barmap(core_device,
+ virtvdev->notify_bar);
+ if (ret)
+ return ret;
+
+ virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] +
+ virtvdev->notify_offset;
+ return 0;
+}
+
+int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->bar0_virtual_buf)
+ return 0;
+
+ /*
+ * Upon close_device() the vfio_pci_core_disable() is called
+ * and will close all the previous mmaps, so it seems that the
+ * valid life cycle for the 'notify' addr is per open/close.
+ */
+ return virtiovf_set_notify_addr(virtvdev);
+}
+
+static int virtiovf_get_device_config_size(unsigned short device)
+{
+ /* Network card */
+ return offsetofend(struct virtio_net_config, status);
+}
+
+static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev)
+{
+ u64 offset;
+ int ret;
+ u8 bar;
+
+ ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev,
+ VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM,
+ &bar, &offset);
+ if (ret)
+ return ret;
+
+ virtvdev->notify_bar = bar;
+ virtvdev->notify_offset = offset;
+ return 0;
+}
+
+static bool virtiovf_bar0_exists(struct pci_dev *pdev)
+{
+ struct resource *res = pdev->resource;
+
+ return res->flags;
+}
+
+bool virtiovf_support_legacy_io(struct pci_dev *pdev)
+{
+ return virtio_pci_admin_has_legacy_io(pdev) && !virtiovf_bar0_exists(pdev);
+}
+
+int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct pci_dev *pdev = virtvdev->core_device.pdev;
+ int ret;
+
+ ret = virtiovf_read_notify_info(virtvdev);
+ if (ret)
+ return ret;
+
+ virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) +
+ virtiovf_get_device_config_size(pdev->device);
+ BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size));
+ virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size,
+ GFP_KERNEL);
+ if (!virtvdev->bar0_virtual_buf)
+ return -ENOMEM;
+ mutex_init(&virtvdev->bar_mutex);
+ return 0;
+}
+
+void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+{
+ kfree(virtvdev->bar0_virtual_buf);
+}
+
+void virtiovf_legacy_io_reset_done(struct pci_dev *pdev)
+{
+ struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
+
+ virtvdev->pci_cmd = 0;
+}
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index b5d3a8c5bbc9..d534d48c4163 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -16,347 +16,12 @@
#include <linux/virtio_net.h>
#include <linux/virtio_pci_admin.h>
-struct virtiovf_pci_core_device {
- struct vfio_pci_core_device core_device;
- u8 *bar0_virtual_buf;
- /* synchronize access to the virtual buf */
- struct mutex bar_mutex;
- void __iomem *notify_addr;
- u64 notify_offset;
- __le32 pci_base_addr_0;
- __le16 pci_cmd;
- u8 bar0_virtual_buf_size;
- u8 notify_bar;
-};
-
-static int
-virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev,
- loff_t pos, char __user *buf,
- size_t count, bool read)
-{
- bool msix_enabled =
- (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX);
- struct pci_dev *pdev = virtvdev->core_device.pdev;
- u8 *bar0_buf = virtvdev->bar0_virtual_buf;
- bool common;
- u8 offset;
- int ret;
-
- common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled);
- /* offset within the relevant configuration area */
- offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled);
- mutex_lock(&virtvdev->bar_mutex);
- if (read) {
- if (common)
- ret = virtio_pci_admin_legacy_common_io_read(pdev, offset,
- count, bar0_buf + pos);
- else
- ret = virtio_pci_admin_legacy_device_io_read(pdev, offset,
- count, bar0_buf + pos);
- if (ret)
- goto out;
- if (copy_to_user(buf, bar0_buf + pos, count))
- ret = -EFAULT;
- } else {
- if (copy_from_user(bar0_buf + pos, buf, count)) {
- ret = -EFAULT;
- goto out;
- }
-
- if (common)
- ret = virtio_pci_admin_legacy_common_io_write(pdev, offset,
- count, bar0_buf + pos);
- else
- ret = virtio_pci_admin_legacy_device_io_write(pdev, offset,
- count, bar0_buf + pos);
- }
-out:
- mutex_unlock(&virtvdev->bar_mutex);
- return ret;
-}
-
-static int
-virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev,
- loff_t pos, char __user *buf,
- size_t count, bool read)
-{
- struct vfio_pci_core_device *core_device = &virtvdev->core_device;
- struct pci_dev *pdev = core_device->pdev;
- u16 queue_notify;
- int ret;
-
- if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO))
- return -EIO;
-
- if (pos + count > virtvdev->bar0_virtual_buf_size)
- return -EINVAL;
-
- ret = pm_runtime_resume_and_get(&pdev->dev);
- if (ret) {
- pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret);
- return -EIO;
- }
-
- switch (pos) {
- case VIRTIO_PCI_QUEUE_NOTIFY:
- if (count != sizeof(queue_notify)) {
- ret = -EINVAL;
- goto end;
- }
- if (read) {
- ret = vfio_pci_core_ioread16(core_device, true, &queue_notify,
- virtvdev->notify_addr);
- if (ret)
- goto end;
- if (copy_to_user(buf, &queue_notify,
- sizeof(queue_notify))) {
- ret = -EFAULT;
- goto end;
- }
- } else {
- if (copy_from_user(&queue_notify, buf, count)) {
- ret = -EFAULT;
- goto end;
- }
- ret = vfio_pci_core_iowrite16(core_device, true, queue_notify,
- virtvdev->notify_addr);
- }
- break;
- default:
- ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count,
- read);
- }
-
-end:
- pm_runtime_put(&pdev->dev);
- return ret ? ret : count;
-}
-
-static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev,
- char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- size_t register_offset;
- loff_t copy_offset;
- size_t copy_count;
- __le32 val32;
- __le16 val16;
- u8 val8;
- int ret;
-
- ret = vfio_pci_core_read(core_vdev, buf, count, ppos);
- if (ret < 0)
- return ret;
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID,
- sizeof(val16), &copy_offset,
- &copy_count, &register_offset)) {
- val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count))
- return -EFAULT;
- }
-
- if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) &&
- vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
- sizeof(val16), &copy_offset,
- &copy_count, &register_offset)) {
- if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset,
- copy_count))
- return -EFAULT;
- val16 |= cpu_to_le16(PCI_COMMAND_IO);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID,
- sizeof(val8), &copy_offset,
- &copy_count, &register_offset)) {
- /* Transional needs to have revision 0 */
- val8 = 0;
- if (copy_to_user(buf + copy_offset, &val8, copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
- sizeof(val32), &copy_offset,
- &copy_count, &register_offset)) {
- u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1);
- u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0);
-
- val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO);
- if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID,
- sizeof(val16), &copy_offset,
- &copy_count, &register_offset)) {
- /*
- * Transitional devices use the PCI subsystem device id as
- * virtio device id, same as legacy driver always did.
- */
- val16 = cpu_to_le16(VIRTIO_ID_NET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID,
- sizeof(val16), &copy_offset,
- &copy_count, &register_offset)) {
- val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET);
- if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset,
- copy_count))
- return -EFAULT;
- }
-
- return count;
-}
-
-static ssize_t
-virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-
- if (!count)
- return 0;
-
- if (index == VFIO_PCI_CONFIG_REGION_INDEX)
- return virtiovf_pci_read_config(core_vdev, buf, count, ppos);
-
- if (index == VFIO_PCI_BAR0_REGION_INDEX)
- return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true);
-
- return vfio_pci_core_read(core_vdev, buf, count, ppos);
-}
-
-static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev,
- const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- size_t register_offset;
- loff_t copy_offset;
- size_t copy_count;
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND,
- sizeof(virtvdev->pci_cmd),
- &copy_offset, &copy_count,
- &register_offset)) {
- if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset,
- buf + copy_offset,
- copy_count))
- return -EFAULT;
- }
-
- if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0,
- sizeof(virtvdev->pci_base_addr_0),
- &copy_offset, &copy_count,
- &register_offset)) {
- if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset,
- buf + copy_offset,
- copy_count))
- return -EFAULT;
- }
-
- return vfio_pci_core_write(core_vdev, buf, count, ppos);
-}
-
-static ssize_t
-virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
-
- if (!count)
- return 0;
-
- if (index == VFIO_PCI_CONFIG_REGION_INDEX)
- return virtiovf_pci_write_config(core_vdev, buf, count, ppos);
-
- if (index == VFIO_PCI_BAR0_REGION_INDEX)
- return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false);
-
- return vfio_pci_core_write(core_vdev, buf, count, ppos);
-}
-
-static int
-virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev,
- unsigned int cmd, unsigned long arg)
-{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- unsigned long minsz = offsetofend(struct vfio_region_info, offset);
- void __user *uarg = (void __user *)arg;
- struct vfio_region_info info = {};
-
- if (copy_from_user(&info, uarg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- switch (info.index) {
- case VFIO_PCI_BAR0_REGION_INDEX:
- info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
- info.size = virtvdev->bar0_virtual_buf_size;
- info.flags = VFIO_REGION_INFO_FLAG_READ |
- VFIO_REGION_INFO_FLAG_WRITE;
- return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
-}
-
-static long
-virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
- unsigned long arg)
-{
- switch (cmd) {
- case VFIO_DEVICE_GET_REGION_INFO:
- return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg);
- default:
- return vfio_pci_core_ioctl(core_vdev, cmd, arg);
- }
-}
-
-static int
-virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev)
-{
- struct vfio_pci_core_device *core_device = &virtvdev->core_device;
- int ret;
-
- /*
- * Setup the BAR where the 'notify' exists to be used by vfio as well
- * This will let us mmap it only once and use it when needed.
- */
- ret = vfio_pci_core_setup_barmap(core_device,
- virtvdev->notify_bar);
- if (ret)
- return ret;
-
- virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] +
- virtvdev->notify_offset;
- return 0;
-}
+#include "common.h"
static int virtiovf_pci_open_device(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
struct vfio_pci_core_device *vdev = &virtvdev->core_device;
int ret;
@@ -364,88 +29,84 @@ static int virtiovf_pci_open_device(struct vfio_device *core_vdev)
if (ret)
return ret;
- if (virtvdev->bar0_virtual_buf) {
- /*
- * Upon close_device() the vfio_pci_core_disable() is called
- * and will close all the previous mmaps, so it seems that the
- * valid life cycle for the 'notify' addr is per open/close.
- */
- ret = virtiovf_set_notify_addr(virtvdev);
- if (ret) {
- vfio_pci_core_disable(vdev);
- return ret;
- }
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ ret = virtiovf_open_legacy_io(virtvdev);
+ if (ret) {
+ vfio_pci_core_disable(vdev);
+ return ret;
}
+#endif
+ virtiovf_open_migration(virtvdev);
vfio_pci_core_finish_enable(vdev);
return 0;
}
-static int virtiovf_get_device_config_size(unsigned short device)
+static void virtiovf_pci_close_device(struct vfio_device *core_vdev)
{
- /* Network card */
- return offsetofend(struct virtio_net_config, status);
-}
-
-static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev)
-{
- u64 offset;
- int ret;
- u8 bar;
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
- ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev,
- VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM,
- &bar, &offset);
- if (ret)
- return ret;
-
- virtvdev->notify_bar = bar;
- virtvdev->notify_offset = offset;
- return 0;
+ virtiovf_close_migration(virtvdev);
+ vfio_pci_core_close_device(core_vdev);
}
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
static int virtiovf_pci_init_device(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
- struct pci_dev *pdev;
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
int ret;
ret = vfio_pci_core_init_dev(core_vdev);
if (ret)
return ret;
- pdev = virtvdev->core_device.pdev;
- ret = virtiovf_read_notify_info(virtvdev);
- if (ret)
- return ret;
-
- virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) +
- virtiovf_get_device_config_size(pdev->device);
- BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size));
- virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size,
- GFP_KERNEL);
- if (!virtvdev->bar0_virtual_buf)
- return -ENOMEM;
- mutex_init(&virtvdev->bar_mutex);
- return 0;
+ /*
+ * The vfio_device_ops.init() callback is set to virtiovf_pci_init_device()
+ * only when legacy I/O is supported. Now, let's initialize it.
+ */
+ return virtiovf_init_legacy_io(virtvdev);
}
+#endif
static void virtiovf_pci_core_release_dev(struct vfio_device *core_vdev)
{
- struct virtiovf_pci_core_device *virtvdev = container_of(
- core_vdev, struct virtiovf_pci_core_device, core_device.vdev);
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev,
+ struct virtiovf_pci_core_device, core_device.vdev);
- kfree(virtvdev->bar0_virtual_buf);
+ virtiovf_release_legacy_io(virtvdev);
+#endif
vfio_pci_core_release_dev(core_vdev);
}
-static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = {
- .name = "virtio-vfio-pci-trans",
+static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = {
+ .name = "virtio-vfio-pci-lm",
+ .init = vfio_pci_core_init_dev,
+ .release = virtiovf_pci_core_release_dev,
+ .open_device = virtiovf_pci_open_device,
+ .close_device = virtiovf_pci_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
+};
+
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = {
+ .name = "virtio-vfio-pci-trans-lm",
.init = virtiovf_pci_init_device,
.release = virtiovf_pci_core_release_dev,
.open_device = virtiovf_pci_open_device,
- .close_device = vfio_pci_core_close_device,
+ .close_device = virtiovf_pci_close_device,
.ioctl = virtiovf_vfio_pci_core_ioctl,
.device_feature = vfio_pci_core_ioctl_feature,
.read = virtiovf_pci_core_read,
@@ -458,6 +119,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = {
.attach_ioas = vfio_iommufd_physical_attach_ioas,
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
+#endif
static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.name = "virtio-vfio-pci",
@@ -478,29 +140,34 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
-static bool virtiovf_bar0_exists(struct pci_dev *pdev)
-{
- struct resource *res = pdev->resource;
-
- return res->flags;
-}
-
static int virtiovf_pci_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
const struct vfio_device_ops *ops = &virtiovf_vfio_pci_ops;
struct virtiovf_pci_core_device *virtvdev;
+ bool sup_legacy_io = false;
+ bool sup_lm = false;
int ret;
- if (pdev->is_virtfn && virtio_pci_admin_has_legacy_io(pdev) &&
- !virtiovf_bar0_exists(pdev))
- ops = &virtiovf_vfio_pci_tran_ops;
+ if (pdev->is_virtfn) {
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ sup_legacy_io = virtiovf_support_legacy_io(pdev);
+ if (sup_legacy_io)
+ ops = &virtiovf_vfio_pci_tran_lm_ops;
+#endif
+ sup_lm = virtio_pci_admin_has_dev_parts(pdev);
+ if (sup_lm && !sup_legacy_io)
+ ops = &virtiovf_vfio_pci_lm_ops;
+ }
virtvdev = vfio_alloc_device(virtiovf_pci_core_device, core_device.vdev,
&pdev->dev, ops);
if (IS_ERR(virtvdev))
return PTR_ERR(virtvdev);
+ if (sup_lm)
+ virtiovf_set_migratable(virtvdev);
+
dev_set_drvdata(&pdev->dev, &virtvdev->core_device);
ret = vfio_pci_core_register_device(&virtvdev->core_device);
if (ret)
@@ -529,9 +196,10 @@ MODULE_DEVICE_TABLE(pci, virtiovf_pci_table);
static void virtiovf_pci_aer_reset_done(struct pci_dev *pdev)
{
- struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
-
- virtvdev->pci_cmd = 0;
+#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
+ virtiovf_legacy_io_reset_done(pdev);
+#endif
+ virtiovf_migration_reset_done(pdev);
}
static const struct pci_error_handlers virtiovf_err_handlers = {
diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c
new file mode 100644
index 000000000000..ee54f4c17857
--- /dev/null
+++ b/drivers/vfio/pci/virtio/migrate.c
@@ -0,0 +1,1337 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/virtio_pci.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_pci_admin.h>
+#include <linux/anon_inodes.h>
+
+#include "common.h"
+
+/* Device specification max parts size */
+#define MAX_LOAD_SIZE (BIT_ULL(BITS_PER_TYPE \
+ (((struct virtio_admin_cmd_dev_parts_metadata_result *)0)->parts_size.size)) - 1)
+
+/* Initial target buffer size */
+#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M
+
+static int
+virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
+ u32 ctx_size);
+
+static struct page *
+virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
+ unsigned long offset)
+{
+ unsigned long cur_offset = 0;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* All accesses are sequential */
+ if (offset < buf->last_offset || !buf->last_offset_sg) {
+ buf->last_offset = 0;
+ buf->last_offset_sg = buf->table.sgt.sgl;
+ buf->sg_last_entry = 0;
+ }
+
+ cur_offset = buf->last_offset;
+
+ for_each_sg(buf->last_offset_sg, sg,
+ buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
+ if (offset < sg->length + cur_offset) {
+ buf->last_offset_sg = sg;
+ buf->sg_last_entry += i;
+ buf->last_offset = cur_offset;
+ return nth_page(sg_page(sg),
+ (offset - cur_offset) / PAGE_SIZE);
+ }
+ cur_offset += sg->length;
+ }
+ return NULL;
+}
+
+static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf,
+ unsigned int npages)
+{
+ unsigned int to_alloc = npages;
+ struct page **page_list;
+ unsigned long filled;
+ unsigned int to_fill;
+ int ret;
+ int i;
+
+ to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
+ page_list = kvcalloc(to_fill, sizeof(*page_list), GFP_KERNEL_ACCOUNT);
+ if (!page_list)
+ return -ENOMEM;
+
+ do {
+ filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
+ page_list);
+ if (!filled) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ to_alloc -= filled;
+ ret = sg_alloc_append_table_from_pages(&buf->table, page_list,
+ filled, 0, filled << PAGE_SHIFT, UINT_MAX,
+ SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT);
+
+ if (ret)
+ goto err_append;
+ buf->allocated_length += filled * PAGE_SIZE;
+ /* clean input for another bulk allocation */
+ memset(page_list, 0, filled * sizeof(*page_list));
+ to_fill = min_t(unsigned int, to_alloc,
+ PAGE_SIZE / sizeof(*page_list));
+ } while (to_alloc > 0);
+
+ kvfree(page_list);
+ return 0;
+
+err_append:
+ for (i = filled - 1; i >= 0; i--)
+ __free_page(page_list[i]);
+err:
+ kvfree(page_list);
+ return ret;
+}
+
+static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf)
+{
+ struct sg_page_iter sg_iter;
+
+ /* Undo alloc_pages_bulk_array() */
+ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
+ __free_page(sg_page_iter_page(&sg_iter));
+ sg_free_append_table(&buf->table);
+ kfree(buf);
+}
+
+static struct virtiovf_data_buffer *
+virtiovf_alloc_data_buffer(struct virtiovf_migration_file *migf, size_t length)
+{
+ struct virtiovf_data_buffer *buf;
+ int ret;
+
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ ret = virtiovf_add_migration_pages(buf,
+ DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (ret)
+ goto end;
+
+ buf->migf = migf;
+ return buf;
+end:
+ virtiovf_free_data_buffer(buf);
+ return ERR_PTR(ret);
+}
+
+static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf)
+{
+ spin_lock_irq(&buf->migf->list_lock);
+ list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
+ spin_unlock_irq(&buf->migf->list_lock);
+}
+
+static int
+virtiovf_pci_alloc_obj_id(struct virtiovf_pci_core_device *virtvdev, u8 type,
+ u32 *obj_id)
+{
+ return virtio_pci_admin_obj_create(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, type, obj_id);
+}
+
+static void
+virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
+{
+ virtio_pci_admin_obj_destroy(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
+}
+
+static struct virtiovf_data_buffer *
+virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
+{
+ struct virtiovf_data_buffer *buf, *temp_buf;
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+
+ spin_lock_irq(&migf->list_lock);
+ list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
+ list_del_init(&buf->buf_elm);
+ if (buf->allocated_length >= length) {
+ spin_unlock_irq(&migf->list_lock);
+ goto found;
+ }
+ /*
+ * Prevent holding redundant buffers. Put in a free
+ * list and call at the end not under the spin lock
+ * (&migf->list_lock) to minimize its scope usage.
+ */
+ list_add(&buf->buf_elm, &free_list);
+ }
+ spin_unlock_irq(&migf->list_lock);
+ buf = virtiovf_alloc_data_buffer(migf, length);
+
+found:
+ while ((temp_buf = list_first_entry_or_null(&free_list,
+ struct virtiovf_data_buffer, buf_elm))) {
+ list_del(&temp_buf->buf_elm);
+ virtiovf_free_data_buffer(temp_buf);
+ }
+
+ return buf;
+}
+
+static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
+{
+ struct virtiovf_data_buffer *entry;
+
+ if (migf->buf) {
+ virtiovf_free_data_buffer(migf->buf);
+ migf->buf = NULL;
+ }
+
+ if (migf->buf_header) {
+ virtiovf_free_data_buffer(migf->buf_header);
+ migf->buf_header = NULL;
+ }
+
+ list_splice(&migf->avail_list, &migf->buf_list);
+
+ while ((entry = list_first_entry_or_null(&migf->buf_list,
+ struct virtiovf_data_buffer, buf_elm))) {
+ list_del(&entry->buf_elm);
+ virtiovf_free_data_buffer(entry);
+ }
+
+ if (migf->has_obj_id)
+ virtiovf_pci_free_obj_id(migf->virtvdev, migf->obj_id);
+}
+
+static void virtiovf_disable_fd(struct virtiovf_migration_file *migf)
+{
+ mutex_lock(&migf->lock);
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ migf->filp->f_pos = 0;
+ mutex_unlock(&migf->lock);
+}
+
+static void virtiovf_disable_fds(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (virtvdev->resuming_migf) {
+ virtiovf_disable_fd(virtvdev->resuming_migf);
+ virtiovf_clean_migf_resources(virtvdev->resuming_migf);
+ fput(virtvdev->resuming_migf->filp);
+ virtvdev->resuming_migf = NULL;
+ }
+ if (virtvdev->saving_migf) {
+ virtiovf_disable_fd(virtvdev->saving_migf);
+ virtiovf_clean_migf_resources(virtvdev->saving_migf);
+ fput(virtvdev->saving_migf->filp);
+ virtvdev->saving_migf = NULL;
+ }
+}
+
+/*
+ * This function is called in all state_mutex unlock cases to
+ * handle a 'deferred_reset' if exists.
+ */
+static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvdev)
+{
+again:
+ spin_lock(&virtvdev->reset_lock);
+ if (virtvdev->deferred_reset) {
+ virtvdev->deferred_reset = false;
+ spin_unlock(&virtvdev->reset_lock);
+ virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+ virtiovf_disable_fds(virtvdev);
+ goto again;
+ }
+ mutex_unlock(&virtvdev->state_mutex);
+ spin_unlock(&virtvdev->reset_lock);
+}
+
+void virtiovf_migration_reset_done(struct pci_dev *pdev)
+{
+ struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);
+
+ if (!virtvdev->migrate_cap)
+ return;
+
+ /*
+ * As the higher VFIO layers are holding locks across reset and using
+ * those same locks with the mm_lock we need to prevent ABBA deadlock
+ * with the state_mutex and mm_lock.
+ * In case the state_mutex was taken already we defer the cleanup work
+ * to the unlock flow of the other running context.
+ */
+ spin_lock(&virtvdev->reset_lock);
+ virtvdev->deferred_reset = true;
+ if (!mutex_trylock(&virtvdev->state_mutex)) {
+ spin_unlock(&virtvdev->reset_lock);
+ return;
+ }
+ spin_unlock(&virtvdev->reset_lock);
+ virtiovf_state_mutex_unlock(virtvdev);
+}
+
+static int virtiovf_release_file(struct inode *inode, struct file *filp)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+
+ virtiovf_disable_fd(migf);
+ mutex_destroy(&migf->lock);
+ kfree(migf);
+ return 0;
+}
+
+static struct virtiovf_data_buffer *
+virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf,
+ loff_t pos, bool *end_of_data)
+{
+ struct virtiovf_data_buffer *buf;
+ bool found = false;
+
+ *end_of_data = false;
+ spin_lock_irq(&migf->list_lock);
+ if (list_empty(&migf->buf_list)) {
+ *end_of_data = true;
+ goto end;
+ }
+
+ buf = list_first_entry(&migf->buf_list, struct virtiovf_data_buffer,
+ buf_elm);
+ if (pos >= buf->start_pos &&
+ pos < buf->start_pos + buf->length) {
+ found = true;
+ goto end;
+ }
+
+ /*
+ * As we use a stream based FD we may expect having the data always
+ * on first chunk
+ */
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+
+end:
+ spin_unlock_irq(&migf->list_lock);
+ return found ? buf : NULL;
+}
+
+static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf,
+ char __user **buf, size_t *len, loff_t *pos)
+{
+ unsigned long offset;
+ ssize_t done = 0;
+ size_t copy_len;
+
+ copy_len = min_t(size_t,
+ vhca_buf->start_pos + vhca_buf->length - *pos, *len);
+ while (copy_len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+ page_offset = offset % PAGE_SIZE;
+ offset -= page_offset;
+ page = virtiovf_get_migration_page(vhca_buf, offset);
+ if (!page)
+ return -EINVAL;
+ page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ ret = copy_to_user(*buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (ret)
+ return -EFAULT;
+ *pos += page_len;
+ *len -= page_len;
+ *buf += page_len;
+ done += page_len;
+ copy_len -= page_len;
+ }
+
+ if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
+ spin_lock_irq(&vhca_buf->migf->list_lock);
+ list_del_init(&vhca_buf->buf_elm);
+ list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
+ spin_unlock_irq(&vhca_buf->migf->list_lock);
+ }
+
+ return done;
+}
+
+static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_data_buffer *vhca_buf;
+ bool first_loop_call = true;
+ bool end_of_data;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len) {
+ ssize_t count;
+
+ vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
+ if (first_loop_call) {
+ first_loop_call = false;
+ /* Temporary end of file as part of PRE_COPY */
+ if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
+ done = -ENOMSG;
+ goto out_unlock;
+ }
+ if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
+ if (end_of_data)
+ goto out_unlock;
+
+ if (!vhca_buf) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ count = virtiovf_buf_read(vhca_buf, &buf, &len, pos);
+ if (count < 0) {
+ done = count;
+ goto out_unlock;
+ }
+ done += count;
+ }
+
+out_unlock:
+ mutex_unlock(&migf->lock);
+ return done;
+}
+
+static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
+ struct vfio_precopy_info info = {};
+ loff_t *pos = &filp->f_pos;
+ bool end_of_data = false;
+ unsigned long minsz;
+ u32 ctx_size = 0;
+ int ret;
+
+ if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ mutex_lock(&virtvdev->state_mutex);
+ if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
+ virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
+ ret = -EINVAL;
+ goto err_state_unlock;
+ }
+
+ /*
+ * The virtio specification does not include a PRE_COPY concept.
+ * Since we can expect the data to remain the same for a certain period,
+ * we use a rate limiter mechanism before making a call to the device.
+ */
+ if (__ratelimit(&migf->pre_copy_rl_state)) {
+
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto err_state_unlock;
+ }
+
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ ret = -ENODEV;
+ goto err_migf_unlock;
+ }
+
+ if (migf->pre_copy_initial_bytes > *pos) {
+ info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
+ } else {
+ info.dirty_bytes = migf->max_pos - *pos;
+ if (!info.dirty_bytes)
+ end_of_data = true;
+ info.dirty_bytes += ctx_size;
+ }
+
+ if (!end_of_data || !ctx_size) {
+ mutex_unlock(&migf->lock);
+ goto done;
+ }
+
+ mutex_unlock(&migf->lock);
+ /*
+ * We finished transferring the current state and the device has a
+ * dirty state, read a new state.
+ */
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ /*
+ * The machine is running, and context size could be grow, so no reason to mark
+ * the device state as VIRTIOVF_MIGF_STATE_ERROR.
+ */
+ goto err_state_unlock;
+
+done:
+ virtiovf_state_mutex_unlock(virtvdev);
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+ return 0;
+
+err_migf_unlock:
+ mutex_unlock(&migf->lock);
+err_state_unlock:
+ virtiovf_state_mutex_unlock(virtvdev);
+ return ret;
+}
+
+static const struct file_operations virtiovf_save_fops = {
+ .owner = THIS_MODULE,
+ .read = virtiovf_save_read,
+ .unlocked_ioctl = virtiovf_precopy_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+ .release = virtiovf_release_file,
+};
+
+static int
+virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf,
+ u32 data_size)
+{
+ struct virtiovf_migration_file *migf = header_buf->migf;
+ struct virtiovf_migration_header header = {};
+ struct page *page;
+ u8 *to_buff;
+
+ header.record_size = cpu_to_le64(data_size);
+ header.flags = cpu_to_le32(VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY);
+ header.tag = cpu_to_le32(VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA);
+ page = virtiovf_get_migration_page(header_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &header, sizeof(header));
+ kunmap_local(to_buff);
+ header_buf->length = sizeof(header);
+ header_buf->start_pos = header_buf->migf->max_pos;
+ migf->max_pos += header_buf->length;
+ spin_lock_irq(&migf->list_lock);
+ list_add_tail(&header_buf->buf_elm, &migf->buf_list);
+ spin_unlock_irq(&migf->list_lock);
+ return 0;
+}
+
+static int
+virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
+ u32 ctx_size)
+{
+ struct virtiovf_data_buffer *header_buf;
+ struct virtiovf_data_buffer *buf;
+ bool unmark_end = false;
+ struct scatterlist *sg;
+ unsigned int i;
+ u32 res_size;
+ int nent;
+ int ret;
+
+ buf = virtiovf_get_data_buffer(migf, ctx_size);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ /* Find the total count of SG entries which satisfies the size */
+ nent = sg_nents_for_len(buf->table.sgt.sgl, ctx_size);
+ if (nent <= 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Iterate to that SG entry and mark it as last (if it's not already)
+ * to let underlay layers iterate only till that entry.
+ */
+ for_each_sg(buf->table.sgt.sgl, sg, nent - 1, i)
+ ;
+
+ if (!sg_is_last(sg)) {
+ unmark_end = true;
+ sg_mark_end(sg);
+ }
+
+ ret = virtio_pci_admin_dev_parts_get(migf->virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS,
+ migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL,
+ buf->table.sgt.sgl, &res_size);
+ /* Restore the original SG mark end */
+ if (unmark_end)
+ sg_unmark_end(sg);
+ if (ret)
+ goto out;
+
+ buf->length = res_size;
+ header_buf = virtiovf_get_data_buffer(migf,
+ sizeof(struct virtiovf_migration_header));
+ if (IS_ERR(header_buf)) {
+ ret = PTR_ERR(header_buf);
+ goto out;
+ }
+
+ ret = virtiovf_add_buf_header(header_buf, res_size);
+ if (ret)
+ goto out_header;
+
+ buf->start_pos = buf->migf->max_pos;
+ migf->max_pos += buf->length;
+ spin_lock(&migf->list_lock);
+ list_add_tail(&buf->buf_elm, &migf->buf_list);
+ spin_unlock_irq(&migf->list_lock);
+ return 0;
+
+out_header:
+ virtiovf_put_data_buffer(header_buf);
+out:
+ virtiovf_put_data_buffer(buf);
+ return ret;
+}
+
+static int
+virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct virtiovf_migration_file *migf = virtvdev->saving_migf;
+ u32 ctx_size;
+ int ret;
+
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
+ return -ENODEV;
+
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto err;
+
+ if (!ctx_size) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ goto err;
+
+ migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+ return 0;
+
+err:
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ return ret;
+}
+
+static struct virtiovf_migration_file *
+virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
+ bool pre_copy)
+{
+ struct virtiovf_migration_file *migf;
+ u32 ctx_size;
+ u32 obj_id;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_save_fops, migf,
+ O_RDONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+ migf->virtvdev = virtvdev;
+
+ lockdep_assert_held(&virtvdev->state_mutex);
+ ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
+ &obj_id);
+ if (ret)
+ goto out;
+
+ migf->obj_id = obj_id;
+ /* Mark as having a valid obj id which can be even 0 */
+ migf->has_obj_id = true;
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &ctx_size);
+ if (ret)
+ goto out_clean;
+
+ if (!ctx_size) {
+ ret = -EINVAL;
+ goto out_clean;
+ }
+
+ ret = virtiovf_read_device_context_chunk(migf, ctx_size);
+ if (ret)
+ goto out_clean;
+
+ if (pre_copy) {
+ migf->pre_copy_initial_bytes = migf->max_pos;
+ /* Arbitrarily set the pre-copy rate limit to 1-second intervals */
+ ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);
+ /* Prevent any rate messages upon its usage */
+ ratelimit_set_flags(&migf->pre_copy_rl_state,
+ RATELIMIT_MSG_ON_RELEASE);
+ migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
+ } else {
+ migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
+ }
+
+ return migf;
+
+out_clean:
+ virtiovf_clean_migf_resources(migf);
+out:
+ fput(migf->filp);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Set the required object header at the beginning of the buffer.
+ * The actual device parts data will be written post of the header offset.
+ */
+static int virtiovf_set_obj_cmd_header(struct virtiovf_data_buffer *vhca_buf)
+{
+ struct virtio_admin_cmd_resource_obj_cmd_hdr obj_hdr = {};
+ struct page *page;
+ u8 *to_buff;
+
+ obj_hdr.type = cpu_to_le16(VIRTIO_RESOURCE_OBJ_DEV_PARTS);
+ obj_hdr.id = cpu_to_le32(vhca_buf->migf->obj_id);
+ page = virtiovf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ memcpy(to_buff, &obj_hdr, sizeof(obj_hdr));
+ kunmap_local(to_buff);
+
+ /* Mark the buffer as including the header object data */
+ vhca_buf->include_header_object = 1;
+ return 0;
+}
+
+static int
+virtiovf_append_page_to_mig_buf(struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ unsigned long offset;
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int ret;
+
+ offset = *pos - vhca_buf->start_pos;
+
+ if (vhca_buf->include_header_object)
+ /* The buffer holds the object header, update the offset accordingly */
+ offset += sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);
+
+ page_offset = offset % PAGE_SIZE;
+
+ page = virtiovf_get_migration_page(vhca_buf, offset - page_offset);
+ if (!page)
+ return -EINVAL;
+
+ page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + page_offset, *buf, page_len);
+ kunmap_local(to_buff);
+ if (ret)
+ return -EFAULT;
+
+ *pos += page_len;
+ *done += page_len;
+ *buf += page_len;
+ *len -= page_len;
+ vhca_buf->length += page_len;
+ return 0;
+}
+
+static ssize_t
+virtiovf_resume_read_chunk(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ size_t chunk_size, const char __user **buf,
+ size_t *len, loff_t *pos, ssize_t *done,
+ bool *has_work)
+{
+ size_t copy_len, to_copy;
+ int ret;
+
+ to_copy = min_t(size_t, *len, chunk_size - vhca_buf->length);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
+ pos, done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == chunk_size) {
+ migf->load_state = VIRTIOVF_LOAD_STATE_LOAD_CHUNK;
+ migf->max_pos += chunk_size;
+ *has_work = true;
+ }
+
+ return 0;
+}
+
+static int
+virtiovf_resume_read_header_data(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf, size_t *len,
+ loff_t *pos, ssize_t *done)
+{
+ size_t copy_len, to_copy;
+ size_t required_data;
+ int ret;
+
+ required_data = migf->record_size - vhca_buf->length;
+ to_copy = min_t(size_t, *len, required_data);
+ copy_len = to_copy;
+ while (to_copy) {
+ ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
+ pos, done);
+ if (ret)
+ return ret;
+ }
+
+ *len -= copy_len;
+ if (vhca_buf->length == migf->record_size) {
+ switch (migf->record_tag) {
+ default:
+ /* Optional tag */
+ break;
+ }
+
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+ migf->max_pos += migf->record_size;
+ vhca_buf->length = 0;
+ }
+
+ return 0;
+}
+
+static int
+virtiovf_resume_read_header(struct virtiovf_migration_file *migf,
+ struct virtiovf_data_buffer *vhca_buf,
+ const char __user **buf,
+ size_t *len, loff_t *pos,
+ ssize_t *done, bool *has_work)
+{
+ struct page *page;
+ size_t copy_len;
+ u8 *to_buff;
+ int ret;
+
+ copy_len = min_t(size_t, *len,
+ sizeof(struct virtiovf_migration_header) - vhca_buf->length);
+ page = virtiovf_get_migration_page(vhca_buf, 0);
+ if (!page)
+ return -EINVAL;
+ to_buff = kmap_local_page(page);
+ ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto end;
+ }
+
+ *buf += copy_len;
+ *pos += copy_len;
+ *done += copy_len;
+ *len -= copy_len;
+ vhca_buf->length += copy_len;
+ if (vhca_buf->length == sizeof(struct virtiovf_migration_header)) {
+ u64 record_size;
+ u32 flags;
+
+ record_size = le64_to_cpup((__le64 *)to_buff);
+ if (record_size > MAX_LOAD_SIZE) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ migf->record_size = record_size;
+ flags = le32_to_cpup((__le32 *)(to_buff +
+ offsetof(struct virtiovf_migration_header, flags)));
+ migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
+ offsetof(struct virtiovf_migration_header, tag)));
+ switch (migf->record_tag) {
+ case VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA:
+ migf->load_state = VIRTIOVF_LOAD_STATE_PREP_CHUNK;
+ break;
+ default:
+ if (!(flags & VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
+ ret = -EOPNOTSUPP;
+ goto end;
+ }
+ /* We may read and skip this optional record data */
+ migf->load_state = VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA;
+ }
+
+ migf->max_pos += vhca_buf->length;
+ vhca_buf->length = 0;
+ *has_work = true;
+ }
+end:
+ kunmap_local(to_buff);
+ return ret;
+}
+
+static ssize_t virtiovf_resume_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct virtiovf_migration_file *migf = filp->private_data;
+ struct virtiovf_data_buffer *vhca_buf = migf->buf;
+ struct virtiovf_data_buffer *vhca_buf_header = migf->buf_header;
+ unsigned int orig_length;
+ bool has_work = false;
+ ssize_t done = 0;
+ int ret = 0;
+
+ if (pos)
+ return -ESPIPE;
+
+ pos = &filp->f_pos;
+ if (*pos < vhca_buf->start_pos)
+ return -EINVAL;
+
+ mutex_lock(&migf->virtvdev->state_mutex);
+ mutex_lock(&migf->lock);
+ if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
+ done = -ENODEV;
+ goto out_unlock;
+ }
+
+ while (len || has_work) {
+ has_work = false;
+ switch (migf->load_state) {
+ case VIRTIOVF_LOAD_STATE_READ_HEADER:
+ ret = virtiovf_resume_read_header(migf, vhca_buf_header, &buf,
+ &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA:
+ if (vhca_buf_header->allocated_length < migf->record_size) {
+ virtiovf_free_data_buffer(vhca_buf_header);
+
+ migf->buf_header = virtiovf_alloc_data_buffer(migf,
+ migf->record_size);
+ if (IS_ERR(migf->buf_header)) {
+ ret = PTR_ERR(migf->buf_header);
+ migf->buf_header = NULL;
+ goto out_unlock;
+ }
+
+ vhca_buf_header = migf->buf_header;
+ }
+
+ vhca_buf_header->start_pos = migf->max_pos;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER_DATA;
+ break;
+ case VIRTIOVF_LOAD_STATE_READ_HEADER_DATA:
+ ret = virtiovf_resume_read_header_data(migf, vhca_buf_header,
+ &buf, &len, pos, &done);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_PREP_CHUNK:
+ {
+ u32 cmd_size = migf->record_size +
+ sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);
+
+ /*
+ * The DMA map/unmap is managed in virtio layer, we just need to extend
+ * the SG pages to hold the extra required chunk data.
+ */
+ if (vhca_buf->allocated_length < cmd_size) {
+ ret = virtiovf_add_migration_pages(vhca_buf,
+ DIV_ROUND_UP_ULL(cmd_size - vhca_buf->allocated_length,
+ PAGE_SIZE));
+ if (ret)
+ goto out_unlock;
+ }
+
+ vhca_buf->start_pos = migf->max_pos;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_CHUNK;
+ break;
+ }
+ case VIRTIOVF_LOAD_STATE_READ_CHUNK:
+ ret = virtiovf_resume_read_chunk(migf, vhca_buf, migf->record_size,
+ &buf, &len, pos, &done, &has_work);
+ if (ret)
+ goto out_unlock;
+ break;
+ case VIRTIOVF_LOAD_STATE_LOAD_CHUNK:
+ /* Mark the last SG entry and set its length */
+ sg_mark_end(vhca_buf->last_offset_sg);
+ orig_length = vhca_buf->last_offset_sg->length;
+ /* Length should include the resource object command header */
+ vhca_buf->last_offset_sg->length = vhca_buf->length +
+ sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr) -
+ vhca_buf->last_offset;
+ ret = virtio_pci_admin_dev_parts_set(migf->virtvdev->core_device.pdev,
+ vhca_buf->table.sgt.sgl);
+ /* Restore the original SG data */
+ vhca_buf->last_offset_sg->length = orig_length;
+ sg_unmark_end(vhca_buf->last_offset_sg);
+ if (ret)
+ goto out_unlock;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+ /* be ready for reading the next chunk */
+ vhca_buf->length = 0;
+ break;
+ default:
+ break;
+ }
+ }
+
+out_unlock:
+ if (ret)
+ migf->state = VIRTIOVF_MIGF_STATE_ERROR;
+ mutex_unlock(&migf->lock);
+ virtiovf_state_mutex_unlock(migf->virtvdev);
+ return ret ? ret : done;
+}
+
+static const struct file_operations virtiovf_resume_fops = {
+ .owner = THIS_MODULE,
+ .write = virtiovf_resume_write,
+ .release = virtiovf_release_file,
+};
+
+static struct virtiovf_migration_file *
+virtiovf_pci_resume_device_data(struct virtiovf_pci_core_device *virtvdev)
+{
+ struct virtiovf_migration_file *migf;
+ struct virtiovf_data_buffer *buf;
+ u32 obj_id;
+ int ret;
+
+ migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
+ if (!migf)
+ return ERR_PTR(-ENOMEM);
+
+ migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_resume_fops, migf,
+ O_WRONLY);
+ if (IS_ERR(migf->filp)) {
+ ret = PTR_ERR(migf->filp);
+ kfree(migf);
+ return ERR_PTR(ret);
+ }
+
+ stream_open(migf->filp->f_inode, migf->filp);
+ mutex_init(&migf->lock);
+ INIT_LIST_HEAD(&migf->buf_list);
+ INIT_LIST_HEAD(&migf->avail_list);
+ spin_lock_init(&migf->list_lock);
+
+ buf = virtiovf_alloc_data_buffer(migf, VIRTIOVF_TARGET_INITIAL_BUF_SIZE);
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out;
+ }
+
+ migf->buf = buf;
+
+ buf = virtiovf_alloc_data_buffer(migf,
+ sizeof(struct virtiovf_migration_header));
+ if (IS_ERR(buf)) {
+ ret = PTR_ERR(buf);
+ goto out_clean;
+ }
+
+ migf->buf_header = buf;
+ migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
+
+ migf->virtvdev = virtvdev;
+ ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET,
+ &obj_id);
+ if (ret)
+ goto out_clean;
+
+ migf->obj_id = obj_id;
+ /* Mark as having a valid obj id which can be even 0 */
+ migf->has_obj_id = true;
+ ret = virtiovf_set_obj_cmd_header(migf->buf);
+ if (ret)
+ goto out_clean;
+
+ return migf;
+
+out_clean:
+ virtiovf_clean_migf_resources(migf);
+out:
+ fput(migf->filp);
+ return ERR_PTR(ret);
+}
+
+static struct file *
+virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
+ u32 new)
+{
+ u32 cur = virtvdev->mig_state;
+ int ret;
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
+ /* NOP */
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ /* NOP */
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
+ BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
+ ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
+ if (ret)
+ return ERR_PTR(ret);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_save_device_data(virtvdev, false);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
+ (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+ virtiovf_disable_fds(virtvdev);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_resume_device_data(virtvdev);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->resuming_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
+ virtiovf_disable_fds(virtvdev);
+ return NULL;
+ }
+
+ if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
+ (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
+ new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
+ struct virtiovf_migration_file *migf;
+
+ migf = virtiovf_pci_save_device_data(virtvdev, true);
+ if (IS_ERR(migf))
+ return ERR_CAST(migf);
+ get_file(migf->filp);
+ virtvdev->saving_migf = migf;
+ return migf->filp;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
+ ret = virtiovf_pci_save_device_final_data(virtvdev);
+ return ret ? ERR_PTR(ret) : NULL;
+ }
+
+ /*
+ * vfio_mig_get_next_state() does not use arcs other than the above
+ */
+ WARN_ON(true);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct file *
+virtiovf_pci_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ enum vfio_device_mig_state next_state;
+ struct file *res = NULL;
+ int ret;
+
+ mutex_lock(&virtvdev->state_mutex);
+ while (new_state != virtvdev->mig_state) {
+ ret = vfio_mig_get_next_state(vdev, virtvdev->mig_state,
+ new_state, &next_state);
+ if (ret) {
+ res = ERR_PTR(ret);
+ break;
+ }
+ res = virtiovf_pci_step_device_state_locked(virtvdev, next_state);
+ if (IS_ERR(res))
+ break;
+ virtvdev->mig_state = next_state;
+ if (WARN_ON(res && new_state != virtvdev->mig_state)) {
+ fput(res);
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ virtiovf_state_mutex_unlock(virtvdev);
+ return res;
+}
+
+static int virtiovf_pci_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *curr_state)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+
+ mutex_lock(&virtvdev->state_mutex);
+ *curr_state = virtvdev->mig_state;
+ virtiovf_state_mutex_unlock(virtvdev);
+ return 0;
+}
+
+static int virtiovf_pci_get_data_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ struct virtiovf_pci_core_device *virtvdev = container_of(
+ vdev, struct virtiovf_pci_core_device, core_device.vdev);
+ bool obj_id_exists;
+ u32 res_size;
+ u32 obj_id;
+ int ret;
+
+ mutex_lock(&virtvdev->state_mutex);
+ obj_id_exists = virtvdev->saving_migf && virtvdev->saving_migf->has_obj_id;
+ if (!obj_id_exists) {
+ ret = virtiovf_pci_alloc_obj_id(virtvdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
+ &obj_id);
+ if (ret)
+ goto end;
+ } else {
+ obj_id = virtvdev->saving_migf->obj_id;
+ }
+
+ ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
+ VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
+ VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
+ &res_size);
+ if (!ret)
+ *stop_copy_length = res_size;
+
+ /*
+ * We can't leave this obj_id alive if didn't exist before, otherwise, it might
+ * stay alive, even without an active migration flow (e.g. migration was cancelled)
+ */
+ if (!obj_id_exists)
+ virtiovf_pci_free_obj_id(virtvdev, obj_id);
+end:
+ virtiovf_state_mutex_unlock(virtvdev);
+ return ret;
+}
+
+static const struct vfio_migration_ops virtvdev_pci_mig_ops = {
+ .migration_set_state = virtiovf_pci_set_device_state,
+ .migration_get_state = virtiovf_pci_get_device_state,
+ .migration_get_data_size = virtiovf_pci_get_data_size,
+};
+
+void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
+{
+ virtvdev->migrate_cap = 1;
+ mutex_init(&virtvdev->state_mutex);
+ spin_lock_init(&virtvdev->reset_lock);
+ virtvdev->core_device.vdev.migration_flags =
+ VFIO_MIGRATION_STOP_COPY |
+ VFIO_MIGRATION_P2P |
+ VFIO_MIGRATION_PRE_COPY;
+ virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
+}
+
+void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->migrate_cap)
+ return;
+
+ virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
+}
+
+void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev)
+{
+ if (!virtvdev->migrate_cap)
+ return;
+
+ virtiovf_disable_fds(virtvdev);
+}
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 8beecf23ec85..8cd01de27baf 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -48,6 +48,9 @@ struct virtio_pci_admin_vq {
/* Protects virtqueue access. */
spinlock_t lock;
u64 supported_cmds;
+ u64 supported_caps;
+ u8 max_dev_parts_objects;
+ struct ida dev_parts_ida;
/* Name of the admin queue: avq.$vq_index. */
char name[10];
u16 vq_index;
@@ -167,15 +170,27 @@ struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev);
BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ) | \
BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO))
+#define VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP \
+ (BIT_ULL(VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY) | \
+ BIT_ULL(VIRTIO_ADMIN_CMD_DRIVER_CAP_SET) | \
+ BIT_ULL(VIRTIO_ADMIN_CMD_DEVICE_CAP_GET) | \
+ BIT_ULL(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_CREATE) | \
+ BIT_ULL(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_DESTROY) | \
+ BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_GET) | \
+ BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_GET) | \
+ BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_SET) | \
+ BIT_ULL(VIRTIO_ADMIN_CMD_DEV_MODE_SET))
+
/* Unlike modern drivers which support hardware virtio devices, legacy drivers
* assume software-based devices: e.g. they don't use proper memory barriers
* on ARM, use big endian on PPC, etc. X86 drivers are mostly ok though, more
* or less by chance. For now, only support legacy IO on X86.
*/
#ifdef CONFIG_VIRTIO_PCI_ADMIN_LEGACY
-#define VIRTIO_ADMIN_CMD_BITMAP VIRTIO_LEGACY_ADMIN_CMD_BITMAP
+#define VIRTIO_ADMIN_CMD_BITMAP (VIRTIO_LEGACY_ADMIN_CMD_BITMAP | \
+ VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP)
#else
-#define VIRTIO_ADMIN_CMD_BITMAP 0
+#define VIRTIO_ADMIN_CMD_BITMAP VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP
#endif
bool vp_is_avq(struct virtio_device *vdev, unsigned int index);
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 4fbcbc7a9ae1..5eaade757860 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -15,6 +15,7 @@
*/
#include <linux/delay.h>
+#include <linux/virtio_pci_admin.h>
#define VIRTIO_PCI_NO_LEGACY
#define VIRTIO_RING_NO_LEGACY
#include "virtio_pci_common.h"
@@ -54,8 +55,10 @@ void vp_modern_avq_done(struct virtqueue *vq)
spin_lock_irqsave(&admin_vq->lock, flags);
do {
virtqueue_disable_cb(vq);
- while ((cmd = virtqueue_get_buf(vq, &len)))
+ while ((cmd = virtqueue_get_buf(vq, &len))) {
+ cmd->result_sg_size = len;
complete(&cmd->completion);
+ }
} while (!virtqueue_enable_cb(vq));
spin_unlock_irqrestore(&admin_vq->lock, flags);
}
@@ -218,12 +221,117 @@ end:
kfree(data);
}
+static void
+virtio_pci_admin_cmd_dev_parts_objects_enable(struct virtio_device *virtio_dev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(virtio_dev);
+ struct virtio_admin_cmd_cap_get_data *get_data;
+ struct virtio_admin_cmd_cap_set_data *set_data;
+ struct virtio_dev_parts_cap *result;
+ struct virtio_admin_cmd cmd = {};
+ struct scatterlist result_sg;
+ struct scatterlist data_sg;
+ u8 resource_objects_limit;
+ u16 set_data_size;
+ int ret;
+
+ get_data = kzalloc(sizeof(*get_data), GFP_KERNEL);
+ if (!get_data)
+ return;
+
+ result = kzalloc(sizeof(*result), GFP_KERNEL);
+ if (!result)
+ goto end;
+
+ get_data->id = cpu_to_le16(VIRTIO_DEV_PARTS_CAP);
+ sg_init_one(&data_sg, get_data, sizeof(*get_data));
+ sg_init_one(&result_sg, result, sizeof(*result));
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEVICE_CAP_GET);
+ cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+ cmd.data_sg = &data_sg;
+ cmd.result_sg = &result_sg;
+ ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+ if (ret)
+ goto err_get;
+
+ set_data_size = sizeof(*set_data) + sizeof(*result);
+ set_data = kzalloc(set_data_size, GFP_KERNEL);
+ if (!set_data)
+ goto err_get;
+
+ set_data->id = cpu_to_le16(VIRTIO_DEV_PARTS_CAP);
+
+ /* Set the limit to the minimum value between the GET and SET values
+ * supported by the device. Since the obj_id for VIRTIO_DEV_PARTS_CAP
+ * is a globally unique value per PF, there is no possibility of
+ * overlap between GET and SET operations.
+ */
+ resource_objects_limit = min(result->get_parts_resource_objects_limit,
+ result->set_parts_resource_objects_limit);
+ result->get_parts_resource_objects_limit = resource_objects_limit;
+ result->set_parts_resource_objects_limit = resource_objects_limit;
+ memcpy(set_data->cap_specific_data, result, sizeof(*result));
+ sg_init_one(&data_sg, set_data, set_data_size);
+ cmd.data_sg = &data_sg;
+ cmd.result_sg = NULL;
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DRIVER_CAP_SET);
+ ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+ if (ret)
+ goto err_set;
+
+ /* Allocate IDR to manage the dev caps objects */
+ ida_init(&vp_dev->admin_vq.dev_parts_ida);
+ vp_dev->admin_vq.max_dev_parts_objects = resource_objects_limit;
+
+err_set:
+ kfree(set_data);
+err_get:
+ kfree(result);
+end:
+ kfree(get_data);
+}
+
+static void virtio_pci_admin_cmd_cap_init(struct virtio_device *virtio_dev)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(virtio_dev);
+ struct virtio_admin_cmd_query_cap_id_result *data;
+ struct virtio_admin_cmd cmd = {};
+ struct scatterlist result_sg;
+ int ret;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return;
+
+ sg_init_one(&result_sg, data, sizeof(*data));
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY);
+ cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+ cmd.result_sg = &result_sg;
+
+ ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+ if (ret)
+ goto end;
+
+ /* Max number of caps fits into a single u64 */
+ BUILD_BUG_ON(sizeof(data->supported_caps) > sizeof(u64));
+
+ vp_dev->admin_vq.supported_caps = le64_to_cpu(data->supported_caps[0]);
+
+ if (!(vp_dev->admin_vq.supported_caps & (1 << VIRTIO_DEV_PARTS_CAP)))
+ goto end;
+
+ virtio_pci_admin_cmd_dev_parts_objects_enable(virtio_dev);
+end:
+ kfree(data);
+}
+
static void vp_modern_avq_activate(struct virtio_device *vdev)
{
if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
return;
virtio_pci_admin_cmd_list_init(vdev);
+ virtio_pci_admin_cmd_cap_init(vdev);
}
static void vp_modern_avq_cleanup(struct virtio_device *vdev)
@@ -758,6 +866,353 @@ static bool vp_get_shm_region(struct virtio_device *vdev,
return true;
}
+/*
+ * virtio_pci_admin_has_dev_parts - Checks whether the device parts
+ * functionality is supported
+ * @pdev: VF pci_dev
+ *
+ * Returns true on success.
+ */
+bool virtio_pci_admin_has_dev_parts(struct pci_dev *pdev)
+{
+ struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+ struct virtio_pci_device *vp_dev;
+
+ if (!virtio_dev)
+ return false;
+
+ if (!virtio_has_feature(virtio_dev, VIRTIO_F_ADMIN_VQ))
+ return false;
+
+ vp_dev = to_vp_device(virtio_dev);
+
+ if (!((vp_dev->admin_vq.supported_cmds & VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP) ==
+ VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP))
+ return false;
+
+ return vp_dev->admin_vq.max_dev_parts_objects;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_has_dev_parts);
+
+/*
+ * virtio_pci_admin_mode_set - Sets the mode of a member device
+ * @pdev: VF pci_dev
+ * @flags: device mode's flags
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_mode_set(struct pci_dev *pdev, u8 flags)
+{
+ struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+ struct virtio_admin_cmd_dev_mode_set_data *data;
+ struct virtio_admin_cmd cmd = {};
+ struct scatterlist data_sg;
+ int vf_id;
+ int ret;
+
+ if (!virtio_dev)
+ return -ENODEV;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->flags = flags;
+ sg_init_one(&data_sg, data, sizeof(*data));
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_MODE_SET);
+ cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+ cmd.group_member_id = cpu_to_le64(vf_id + 1);
+ cmd.data_sg = &data_sg;
+ ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_mode_set);
+
+/*
+ * virtio_pci_admin_obj_create - Creates an object for a given type and operation,
+ * following the max objects that can be created for that request.
+ * @pdev: VF pci_dev
+ * @obj_type: Object type
+ * @operation_type: Operation type
+ * @obj_id: Output unique object id
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_obj_create(struct pci_dev *pdev, u16 obj_type, u8 operation_type,
+ u32 *obj_id)
+{
+ struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+ u16 data_size = sizeof(struct virtio_admin_cmd_resource_obj_create_data);
+ struct virtio_admin_cmd_resource_obj_create_data *obj_create_data;
+ struct virtio_resource_obj_dev_parts obj_dev_parts = {};
+ struct virtio_pci_admin_vq *avq;
+ struct virtio_admin_cmd cmd = {};
+ struct scatterlist data_sg;
+ void *data;
+ int id = -1;
+ int vf_id;
+ int ret;
+
+ if (!virtio_dev)
+ return -ENODEV;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ if (obj_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS)
+ return -EOPNOTSUPP;
+
+ if (operation_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET &&
+ operation_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET)
+ return -EINVAL;
+
+ avq = &to_vp_device(virtio_dev)->admin_vq;
+ if (!avq->max_dev_parts_objects)
+ return -EOPNOTSUPP;
+
+ id = ida_alloc_range(&avq->dev_parts_ida, 0,
+ avq->max_dev_parts_objects - 1, GFP_KERNEL);
+ if (id < 0)
+ return id;
+
+ *obj_id = id;
+ data_size += sizeof(obj_dev_parts);
+ data = kzalloc(data_size, GFP_KERNEL);
+ if (!data) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ obj_create_data = data;
+ obj_create_data->hdr.type = cpu_to_le16(obj_type);
+ obj_create_data->hdr.id = cpu_to_le32(*obj_id);
+ obj_dev_parts.type = operation_type;
+ memcpy(obj_create_data->resource_obj_specific_data, &obj_dev_parts,
+ sizeof(obj_dev_parts));
+ sg_init_one(&data_sg, data, data_size);
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_CREATE);
+ cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+ cmd.group_member_id = cpu_to_le64(vf_id + 1);
+ cmd.data_sg = &data_sg;
+ ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+
+ kfree(data);
+end:
+ if (ret)
+ ida_free(&avq->dev_parts_ida, id);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_obj_create);
+
+/*
+ * virtio_pci_admin_obj_destroy - Destroys an object of a given type and id
+ * @pdev: VF pci_dev
+ * @obj_type: Object type
+ * @id: Object id
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_obj_destroy(struct pci_dev *pdev, u16 obj_type, u32 id)
+{
+ struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+ struct virtio_admin_cmd_resource_obj_cmd_hdr *data;
+ struct virtio_pci_device *vp_dev;
+ struct virtio_admin_cmd cmd = {};
+ struct scatterlist data_sg;
+ int vf_id;
+ int ret;
+
+ if (!virtio_dev)
+ return -ENODEV;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ if (obj_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS)
+ return -EINVAL;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->type = cpu_to_le16(obj_type);
+ data->id = cpu_to_le32(id);
+ sg_init_one(&data_sg, data, sizeof(*data));
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_DESTROY);
+ cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+ cmd.group_member_id = cpu_to_le64(vf_id + 1);
+ cmd.data_sg = &data_sg;
+ ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+ if (!ret) {
+ vp_dev = to_vp_device(virtio_dev);
+ ida_free(&vp_dev->admin_vq.dev_parts_ida, id);
+ }
+
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_obj_destroy);
+
+/*
+ * virtio_pci_admin_dev_parts_metadata_get - Gets the metadata of the device parts
+ * identified by the below attributes.
+ * @pdev: VF pci_dev
+ * @obj_type: Object type
+ * @id: Object id
+ * @metadata_type: Metadata type
+ * @out: Upon success holds the output for 'metadata type size'
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_dev_parts_metadata_get(struct pci_dev *pdev, u16 obj_type,
+ u32 id, u8 metadata_type, u32 *out)
+{
+ struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+ struct virtio_admin_cmd_dev_parts_metadata_result *result;
+ struct virtio_admin_cmd_dev_parts_metadata_data *data;
+ struct scatterlist data_sg, result_sg;
+ struct virtio_admin_cmd cmd = {};
+ int vf_id;
+ int ret;
+
+ if (!virtio_dev)
+ return -ENODEV;
+
+ if (metadata_type != VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE)
+ return -EOPNOTSUPP;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ result = kzalloc(sizeof(*result), GFP_KERNEL);
+ if (!result) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ data->hdr.type = cpu_to_le16(obj_type);
+ data->hdr.id = cpu_to_le32(id);
+ data->type = metadata_type;
+ sg_init_one(&data_sg, data, sizeof(*data));
+ sg_init_one(&result_sg, result, sizeof(*result));
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_GET);
+ cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+ cmd.group_member_id = cpu_to_le64(vf_id + 1);
+ cmd.data_sg = &data_sg;
+ cmd.result_sg = &result_sg;
+ ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+ if (!ret)
+ *out = le32_to_cpu(result->parts_size.size);
+
+ kfree(result);
+end:
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_metadata_get);
+
+/*
+ * virtio_pci_admin_dev_parts_get - Gets the device parts identified by the below attributes.
+ * @pdev: VF pci_dev
+ * @obj_type: Object type
+ * @id: Object id
+ * @get_type: Get type
+ * @res_sg: Upon success holds the output result data
+ * @res_size: Upon success holds the output result size
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_dev_parts_get(struct pci_dev *pdev, u16 obj_type, u32 id,
+ u8 get_type, struct scatterlist *res_sg,
+ u32 *res_size)
+{
+ struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+ struct virtio_admin_cmd_dev_parts_get_data *data;
+ struct scatterlist data_sg;
+ struct virtio_admin_cmd cmd = {};
+ int vf_id;
+ int ret;
+
+ if (!virtio_dev)
+ return -ENODEV;
+
+ if (get_type != VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL)
+ return -EOPNOTSUPP;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->hdr.type = cpu_to_le16(obj_type);
+ data->hdr.id = cpu_to_le32(id);
+ data->type = get_type;
+ sg_init_one(&data_sg, data, sizeof(*data));
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_GET);
+ cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+ cmd.group_member_id = cpu_to_le64(vf_id + 1);
+ cmd.data_sg = &data_sg;
+ cmd.result_sg = res_sg;
+ ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+ if (!ret)
+ *res_size = cmd.result_sg_size;
+
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_get);
+
+/*
+ * virtio_pci_admin_dev_parts_set - Sets the device parts identified by the below attributes.
+ * @pdev: VF pci_dev
+ * @data_sg: The device parts data, its layout follows struct virtio_admin_cmd_dev_parts_set_data
+ *
+ * Note: caller must serialize access for the given device.
+ * Returns 0 on success, or negative on failure.
+ */
+int virtio_pci_admin_dev_parts_set(struct pci_dev *pdev, struct scatterlist *data_sg)
+{
+ struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev);
+ struct virtio_admin_cmd cmd = {};
+ int vf_id;
+
+ if (!virtio_dev)
+ return -ENODEV;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_SET);
+ cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV);
+ cmd.group_member_id = cpu_to_le64(vf_id + 1);
+ cmd.data_sg = data_sg;
+ return vp_modern_admin_cmd_exec(virtio_dev, &cmd);
+}
+EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_set);
+
static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
.get = NULL,
.set = NULL,