From 1e361632da12ac00cb86c25a857ba251fdf2de95 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 19 Mar 2020 18:07:37 -0500 Subject: libnvdimm/label: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200319230737.GA16452@embeddedor.com Signed-off-by: Dan Williams --- drivers/nvdimm/label.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvdimm') diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h index 4c7b775c2811..956b6d1bd8cc 100644 --- a/drivers/nvdimm/label.h +++ b/drivers/nvdimm/label.h @@ -62,7 +62,7 @@ struct nd_namespace_index { __le16 major; __le16 minor; __le64 checksum; - u8 free[0]; + u8 free[]; }; /** -- cgit v1.2.3 From 9106137c6f0d0d959a855ad6885c6b3cb010ff98 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 19 Mar 2020 18:09:37 -0500 Subject: libnvdimm/region: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200319230937.GA16648@embeddedor.com Signed-off-by: Dan Williams --- drivers/nvdimm/nd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvdimm') diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index c9f6a5b5253a..2cf77bc859c1 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -39,7 +39,7 @@ struct nd_region_data { int ns_count; int ns_active; unsigned int hints_shift; - void __iomem *flush_wpq[0]; + void __iomem *flush_wpq[]; }; static inline void __iomem *ndrd_get_flush_wpq(struct nd_region_data *ndrd, @@ -156,7 +156,7 @@ struct nd_region { struct nd_interleave_set *nd_set; struct nd_percpu_lane __percpu *lane; int (*flush)(struct nd_region *nd_region, struct bio *bio); - struct nd_mapping mapping[0]; + struct nd_mapping mapping[]; }; struct nd_blk_region { -- cgit v1.2.3 From 338f6dac8585beaf4d913de8847e430808fb7596 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 24 Mar 2020 09:18:21 +0530 Subject: libnvdimm: Update persistence domain value for of_pmem and papr_scm device Currently, kernel shows the below values "persistence_domain":"cpu_cache" "persistence_domain":"memory_controller" "persistence_domain":"unknown" "cpu_cache" indicates no extra instructions is needed to ensure the persistence of data in the pmem media on power failure. "memory_controller" indicates cpu cache flush instructions are required to flush the data. Platform provides mechanisms to automatically flush outstanding write data from memory controler to pmem on system power loss. Based on the above use memory_controller for non volatile regions on ppc64. Signed-off-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/20200324034821.60869-1-aneesh.kumar@linux.ibm.com Signed-off-by: Dan Williams --- arch/powerpc/platforms/pseries/papr_scm.c | 4 +++- drivers/nvdimm/of_pmem.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/nvdimm') diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 0b4467e378e5..922a4fc3b61b 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -361,8 +361,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p) if (p->is_volatile) p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc); - else + else { + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); p->region = nvdimm_pmem_region_create(p->bus, &ndr_desc); + } if (!p->region) { dev_err(dev, "Error registering region %pR from %pOF\n", ndr_desc.res, p->dn); diff --git a/drivers/nvdimm/of_pmem.c b/drivers/nvdimm/of_pmem.c index 8224d1431ea9..6826a274a1f1 100644 --- a/drivers/nvdimm/of_pmem.c +++ b/drivers/nvdimm/of_pmem.c @@ -62,8 +62,10 @@ static int of_pmem_region_probe(struct platform_device *pdev) if (is_volatile) region = nvdimm_volatile_region_create(bus, &ndr_desc); - else + else { + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); region = nvdimm_pmem_region_create(bus, &ndr_desc); + } if (!region) dev_warn(&pdev->dev, "Unable to register region %pR from %pOF\n", -- cgit v1.2.3 From 5d64efe79703c4de1f017af9f7643ead1dc85432 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 28 Feb 2020 11:34:51 -0500 Subject: pmem: Add functions for reading/writing page to/from pmem This splits pmem_do_bvec() into pmem_do_read() and pmem_do_write(). pmem_do_write() will be used by pmem zero_page_range() as well. Hence sharing the same code. Suggested-by: Christoph Hellwig Signed-off-by: Vivek Goyal Reviewed-by: Christoph Hellwig Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20200228163456.1587-2-vgoyal@redhat.com Signed-off-by: Dan Williams --- drivers/nvdimm/pmem.c | 86 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 36 deletions(-) (limited to 'drivers/nvdimm') diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4eae441f86c9..075b11682192 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -136,9 +136,25 @@ static blk_status_t read_pmem(struct page *page, unsigned int off, return BLK_STS_OK; } -static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, - unsigned int len, unsigned int off, unsigned int op, - sector_t sector) +static blk_status_t pmem_do_read(struct pmem_device *pmem, + struct page *page, unsigned int page_off, + sector_t sector, unsigned int len) +{ + blk_status_t rc; + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; + void *pmem_addr = pmem->virt_addr + pmem_off; + + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) + return BLK_STS_IOERR; + + rc = read_pmem(page, page_off, pmem_addr, len); + flush_dcache_page(page); + return rc; +} + +static blk_status_t pmem_do_write(struct pmem_device *pmem, + struct page *page, unsigned int page_off, + sector_t sector, unsigned int len) { blk_status_t rc = BLK_STS_OK; bool bad_pmem = false; @@ -148,34 +164,25 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) bad_pmem = true; - if (!op_is_write(op)) { - if (unlikely(bad_pmem)) - rc = BLK_STS_IOERR; - else { - rc = read_pmem(page, off, pmem_addr, len); - flush_dcache_page(page); - } - } else { - /* - * Note that we write the data both before and after - * clearing poison. The write before clear poison - * handles situations where the latest written data is - * preserved and the clear poison operation simply marks - * the address range as valid without changing the data. - * In this case application software can assume that an - * interrupted write will either return the new good - * data or an error. - * - * However, if pmem_clear_poison() leaves the data in an - * indeterminate state we need to perform the write - * after clear poison. - */ - flush_dcache_page(page); - write_pmem(pmem_addr, page, off, len); - if (unlikely(bad_pmem)) { - rc = pmem_clear_poison(pmem, pmem_off, len); - write_pmem(pmem_addr, page, off, len); - } + /* + * Note that we write the data both before and after + * clearing poison. The write before clear poison + * handles situations where the latest written data is + * preserved and the clear poison operation simply marks + * the address range as valid without changing the data. + * In this case application software can assume that an + * interrupted write will either return the new good + * data or an error. + * + * However, if pmem_clear_poison() leaves the data in an + * indeterminate state we need to perform the write + * after clear poison. + */ + flush_dcache_page(page); + write_pmem(pmem_addr, page, page_off, len); + if (unlikely(bad_pmem)) { + rc = pmem_clear_poison(pmem, pmem_off, len); + write_pmem(pmem_addr, page, page_off, len); } return rc; @@ -197,8 +204,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) do_acct = nd_iostat_start(bio, &start); bio_for_each_segment(bvec, bio, iter) { - rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, - bvec.bv_offset, bio_op(bio), iter.bi_sector); + if (op_is_write(bio_op(bio))) + rc = pmem_do_write(pmem, bvec.bv_page, bvec.bv_offset, + iter.bi_sector, bvec.bv_len); + else + rc = pmem_do_read(pmem, bvec.bv_page, bvec.bv_offset, + iter.bi_sector, bvec.bv_len); if (rc) { bio->bi_status = rc; break; @@ -223,9 +234,12 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct pmem_device *pmem = bdev->bd_queue->queuedata; blk_status_t rc; - rc = pmem_do_bvec(pmem, page, hpage_nr_pages(page) * PAGE_SIZE, - 0, op, sector); - + if (op_is_write(op)) + rc = pmem_do_write(pmem, page, 0, sector, + hpage_nr_pages(page) * PAGE_SIZE); + else + rc = pmem_do_read(pmem, page, 0, sector, + hpage_nr_pages(page) * PAGE_SIZE); /* * The ->rw_page interface is subtle and tricky. The core * retries on any error, so we can only invoke page_endio() in -- cgit v1.2.3 From f605a263e0690177ecc180417eacf2b5507dd177 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 28 Feb 2020 11:34:52 -0500 Subject: dax, pmem: Add a dax operation zero_page_range Add a dax operation zero_page_range, to zero a page. This will also clear any known poison in the page being zeroed. As of now, zeroing of one page is allowed in a single call. There are no callers which are trying to zero more than a page in a single call. Once we grow the callers which zero more than a page in single call, we can add that support. Primary reason for not doing that yet is that this will add little complexity in dm implementation where a range might be spanning multiple underlying targets and one will have to split the range into multiple sub ranges and call zero_page_range() on individual targets. Suggested-by: Christoph Hellwig Signed-off-by: Vivek Goyal Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20200228163456.1587-3-vgoyal@redhat.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 20 ++++++++++++++++++++ drivers/nvdimm/pmem.c | 11 +++++++++++ include/linux/dax.h | 4 ++++ 3 files changed, 35 insertions(+) (limited to 'drivers/nvdimm') diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0aa4b6bc5101..e498daf3c0d7 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -344,6 +344,26 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, } EXPORT_SYMBOL_GPL(dax_copy_to_iter); +int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages) +{ + if (!dax_alive(dax_dev)) + return -ENXIO; + + if (!dax_dev->ops->zero_page_range) + return -EOPNOTSUPP; + /* + * There are no callers that want to zero more than one page as of now. + * Once users are there, this check can be removed after the + * device mapper code has been updated to split ranges across targets. + */ + if (nr_pages != 1) + return -EIO; + + return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages); +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 075b11682192..5b774ddd0efb 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -282,6 +282,16 @@ static const struct block_device_operations pmem_fops = { .revalidate_disk = nvdimm_revalidate_disk, }; +static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return blk_status_to_errno(pmem_do_write(pmem, ZERO_PAGE(0), 0, + PFN_PHYS(pgoff) >> SECTOR_SHIFT, + PAGE_SIZE)); +} + static long pmem_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -313,6 +323,7 @@ static const struct dax_operations pmem_dax_ops = { .dax_supported = generic_fsdax_supported, .copy_from_iter = pmem_copy_from_iter, .copy_to_iter = pmem_copy_to_iter, + .zero_page_range = pmem_dax_zero_page_range, }; static const struct attribute_group *pmem_attribute_groups[] = { diff --git a/include/linux/dax.h b/include/linux/dax.h index 328c2dbb4409..71735c430c05 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -34,6 +34,8 @@ struct dax_operations { /* copy_to_iter: required operation for fs-dax direct-i/o */ size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t, struct iov_iter *); + /* zero_page_range: required operation. Zero page range */ + int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); }; extern struct attribute_group dax_attribute_group; @@ -199,6 +201,8 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); +int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, -- cgit v1.2.3 From 4e4ced93794acb42adb19484132966defba8f3a6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 1 Apr 2020 12:11:25 -0400 Subject: dax: Move mandatory ->zero_page_range() check in alloc_dax() zero_page_range() dax operation is mandatory for dax devices. Right now that check happens in dax_zero_page_range() function. Dan thinks that's too late and its better to do the check earlier in alloc_dax(). I also modified alloc_dax() to return pointer with error code in it in case of failure. Right now it returns NULL and caller assumes failure happened due to -ENOMEM. But with this ->zero_page_range() check, I need to return -EINVAL instead. Signed-off-by: Vivek Goyal Link: https://lore.kernel.org/r/20200401161125.GB9398@redhat.com Signed-off-by: Dan Williams --- drivers/dax/bus.c | 4 +++- drivers/dax/super.c | 14 +++++++++----- drivers/md/dm.c | 2 +- drivers/nvdimm/pmem.c | 4 ++-- drivers/s390/block/dcssblk.c | 5 +++-- 5 files changed, 18 insertions(+), 11 deletions(-) (limited to 'drivers/nvdimm') diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 46e46047a1f7..df238c8b6ef2 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -421,8 +421,10 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region *dax_region, int id, * device outside of mmap of the resulting character device. */ dax_dev = alloc_dax(dev_dax, NULL, NULL, DAXDEV_F_SYNC); - if (!dax_dev) + if (IS_ERR(dax_dev)) { + rc = PTR_ERR(dax_dev); goto err; + } /* a device_dax instance is dead while the driver is not attached */ kill_dax(dax_dev); diff --git a/drivers/dax/super.c b/drivers/dax/super.c index e498daf3c0d7..8e32345be0f7 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -349,9 +349,6 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, { if (!dax_alive(dax_dev)) return -ENXIO; - - if (!dax_dev->ops->zero_page_range) - return -EOPNOTSUPP; /* * There are no callers that want to zero more than one page as of now. * Once users are there, this check can be removed after the @@ -571,9 +568,16 @@ struct dax_device *alloc_dax(void *private, const char *__host, dev_t devt; int minor; + if (ops && !ops->zero_page_range) { + pr_debug("%s: error: device does not provide dax" + " operation zero_page_range()\n", + __host ? __host : "Unknown"); + return ERR_PTR(-EINVAL); + } + host = kstrdup(__host, GFP_KERNEL); if (__host && !host) - return NULL; + return ERR_PTR(-ENOMEM); minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); if (minor < 0) @@ -596,7 +600,7 @@ struct dax_device *alloc_dax(void *private, const char *__host, ida_simple_remove(&dax_minor_ida, minor); err_minor: kfree(host); - return NULL; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL_GPL(alloc_dax); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index aa72d9e757c1..6504f0a358e5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2005,7 +2005,7 @@ static struct mapped_device *alloc_dev(int minor) if (IS_ENABLED(CONFIG_DAX_DRIVER)) { md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops, 0); - if (!md->dax_dev) + if (IS_ERR(md->dax_dev)) goto bad; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b774ddd0efb..715cb0696525 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -487,9 +487,9 @@ static int pmem_attach_disk(struct device *dev, if (is_nvdimm_sync(nd_region)) flags = DAXDEV_F_SYNC; dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags); - if (!dax_dev) { + if (IS_ERR(dax_dev)) { put_disk(disk); - return -ENOMEM; + return PTR_ERR(dax_dev); } dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); pmem->dax_dev = dax_dev; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index ab3e2898816c..9b05cf1a25c4 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -695,8 +695,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name, &dcssblk_dax_ops, DAXDEV_F_SYNC); - if (!dev_info->dax_dev) { - rc = -ENOMEM; + if (IS_ERR(dev_info->dax_dev)) { + rc = PTR_ERR(dev_info->dax_dev); + dev_info->dax_dev = NULL; goto put_dev; } -- cgit v1.2.3