Diffstat (limited to 'drivers/infiniband')
72 files changed, 2607 insertions, 1124 deletions
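The bnxt_re portion of this diff moves user-space doorbell and shared-page mappings away from open-coded io_remap_pfn_range()/vm_pgoff handling and onto the core rdma_user_mmap_entry machinery. What follows is a minimal, illustrative sketch of that pattern, not code from the patch: the sketch_* names are hypothetical, while the rdma_user_mmap_* helpers are the same core API the diff itself calls.

/*
 * Illustrative sketch only: register a PAGE_SIZE region with the RDMA
 * core, hand its mmap token to user space, and service .mmap/.mmap_free.
 * The sketch_* identifiers are hypothetical; rdma_user_mmap_* are the
 * in-kernel helpers used verbatim in the bnxt_re hunks below.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>

struct sketch_mmap_entry {
	struct rdma_user_mmap_entry rdma_entry;
	u64 mem_offset;		/* physical offset the entry exposes */
};

/* Register one page and return the offset user space passes to mmap() */
static struct sketch_mmap_entry *
sketch_entry_insert(struct ib_ucontext *uctx, u64 mem_offset, u64 *mmap_offset)
{
	struct sketch_mmap_entry *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;

	entry->mem_offset = mem_offset;
	if (rdma_user_mmap_entry_insert(uctx, &entry->rdma_entry, PAGE_SIZE)) {
		kfree(entry);
		return NULL;
	}
	*mmap_offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
	return entry;
}

/* .mmap handler: look the entry up by vm_pgoff and map it uncached */
static int sketch_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma)
{
	struct rdma_user_mmap_entry *rdma_entry;
	struct sketch_mmap_entry *entry;
	u64 pfn;
	int ret;

	rdma_entry = rdma_user_mmap_entry_get(uctx, vma);
	if (!rdma_entry)
		return -EINVAL;

	entry = container_of(rdma_entry, struct sketch_mmap_entry, rdma_entry);
	pfn = entry->mem_offset >> PAGE_SHIFT;
	ret = rdma_user_mmap_io(uctx, vma, pfn, PAGE_SIZE,
				pgprot_noncached(vma->vm_page_prot),
				rdma_entry);

	rdma_user_mmap_entry_put(rdma_entry);
	return ret;
}

/* .mmap_free handler: the core calls this once the last reference is gone */
static void sketch_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
{
	kfree(container_of(rdma_entry, struct sketch_mmap_entry, rdma_entry));
}

Because cleanup is driven by the entry's reference count rather than ad hoc driver bookkeeping, the same pattern is what lets this series add an mmap_free callback and a (no-op) disassociate_ucontext handler to bnxt_re.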
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 93a1c48d0c32..1ee87c3aaeab 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3295,7 +3295,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; - route->path_rec->rate = iboe_get_rate(ndev); + route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate @@ -4805,8 +4805,7 @@ static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, event->param.ud.qkey = id_priv->qkey; out: - if (ndev) - dev_put(ndev); + dev_put(ndev); } static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) @@ -4964,7 +4963,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, if (!ndev) return -ENODEV; - ib.rec.rate = iboe_get_rate(ndev); + ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4796f6a8828c..e836c9c477f6 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1850,8 +1850,13 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, attr->path_mtu = cmd->base.path_mtu; if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE) attr->path_mig_state = cmd->base.path_mig_state; - if (cmd->base.attr_mask & IB_QP_QKEY) + if (cmd->base.attr_mask & IB_QP_QKEY) { + if (cmd->base.qkey & IB_QP_SET_QKEY && !capable(CAP_NET_RAW)) { + ret = -EPERM; + goto release_qp; + } attr->qkey = cmd->base.qkey; + } if (cmd->base.attr_mask & IB_QP_RQ_PSN) attr->rq_psn = cmd->base.rq_psn; if (cmd->base.attr_mask & IB_QP_SQ_PSN) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index fbace69672ca..7c9c79c13941 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -222,8 +222,12 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, spin_lock_irq(&ev_queue->lock); while (list_empty(&ev_queue->event_list)) { - spin_unlock_irq(&ev_queue->lock); + if (ev_queue->is_closed) { + spin_unlock_irq(&ev_queue->lock); + return -EIO; + } + spin_unlock_irq(&ev_queue->lock); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; @@ -233,12 +237,6 @@ static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, return -ERESTARTSYS; spin_lock_irq(&ev_queue->lock); - - /* If device was disassociated and no event exists set an error */ - if (list_empty(&ev_queue->event_list) && ev_queue->is_closed) { - spin_unlock_irq(&ev_queue->lock); - return -EIO; - } } event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list); diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 5a2baf49ecaa..ea81b2497511 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -39,6 +39,7 @@ #ifndef __BNXT_RE_H__ #define __BNXT_RE_H__ +#include <rdma/uverbs_ioctl.h> #include "hw_counters.h" #define ROCE_DRV_MODULE_NAME "bnxt_re" @@ -135,8 +136,6 @@ struct bnxt_re_dev { struct delayed_work worker; u8 cur_prio_map; - u16 active_speed; - u8 active_width; /* FP Notification Queue (CQ & SRQ) */ struct tasklet_struct nq_task; @@ -181,10 +180,14 @@ struct bnxt_re_dev { #define 
BNXT_RE_ROCEV2_IPV4_PACKET 2 #define BNXT_RE_ROCEV2_IPV6_PACKET 3 +#define BNXT_RE_CHECK_RC(x) ((x) && ((x) != -ETIMEDOUT)) + static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) { if (rdev) return &rdev->ibdev.dev; return NULL; } + +extern const struct uapi_definition bnxt_re_uapi_defs[]; #endif diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index e86afecfbe46..abef0b8baa7c 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -61,6 +61,15 @@ #include "bnxt_re.h" #include "ib_verbs.h" + +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_std_types.h> + +#include <rdma/ib_user_ioctl_cmds.h> + +#define UVERBS_MODULE_NAME bnxt_re +#include <rdma/uverbs_named_ioctl.h> + #include <rdma/bnxt_re-abi.h> static int __from_ib_access_flags(int iflags) @@ -199,6 +208,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + int rc; memset(port_attr, 0, sizeof(*port_attr)); @@ -228,10 +238,10 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, port_attr->sm_sl = 0; port_attr->subnet_timeout = 0; port_attr->init_type_reply = 0; - port_attr->active_speed = rdev->active_speed; - port_attr->active_width = rdev->active_width; + rc = ib_get_eth_speed(&rdev->ibdev, port_num, &port_attr->active_speed, + &port_attr->active_width); - return 0; + return rc; } int bnxt_re_get_port_immutable(struct ib_device *ibdev, u32 port_num, @@ -533,12 +543,57 @@ fail: return rc; } +static struct bnxt_re_user_mmap_entry* +bnxt_re_mmap_entry_insert(struct bnxt_re_ucontext *uctx, u64 mem_offset, + enum bnxt_re_mmap_flag mmap_flag, u64 *offset) +{ + struct bnxt_re_user_mmap_entry *entry; + int ret; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->mem_offset = mem_offset; + entry->mmap_flag = mmap_flag; + entry->uctx = uctx; + + switch (mmap_flag) { + case BNXT_RE_MMAP_SH_PAGE: + ret = rdma_user_mmap_entry_insert_exact(&uctx->ib_uctx, + &entry->rdma_entry, PAGE_SIZE, 0); + break; + case BNXT_RE_MMAP_UC_DB: + case BNXT_RE_MMAP_WC_DB: + ret = rdma_user_mmap_entry_insert(&uctx->ib_uctx, + &entry->rdma_entry, PAGE_SIZE); + break; + default: + ret = -EINVAL; + break; + } + + if (ret) { + kfree(entry); + return NULL; + } + if (offset) + *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return entry; +} + /* Protection Domains */ int bnxt_re_dealloc_pd(struct ib_pd *ib_pd, struct ib_udata *udata) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; + if (udata) { + rdma_user_mmap_entry_remove(pd->pd_db_mmap); + pd->pd_db_mmap = NULL; + } + bnxt_re_destroy_fence_mr(pd); if (pd->qplib_pd.id) { @@ -557,7 +612,8 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) struct bnxt_re_ucontext *ucntx = rdma_udata_to_drv_context( udata, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd); - int rc; + struct bnxt_re_user_mmap_entry *entry = NULL; + int rc = 0; pd->rdev = rdev; if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) { @@ -567,15 +623,15 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) } if (udata) { - struct bnxt_re_pd_resp resp; + struct bnxt_re_pd_resp resp = {}; if (!ucntx->dpi.dbr) { /* Allocate DPI in alloc_pd to avoid failing of * ibv_devinfo and family of application when 
DPIs * are depleted. */ - if (bnxt_qplib_alloc_dpi(&rdev->qplib_res.dpi_tbl, - &ucntx->dpi, ucntx)) { + if (bnxt_qplib_alloc_dpi(&rdev->qplib_res, + &ucntx->dpi, ucntx, BNXT_QPLIB_DPI_TYPE_UC)) { rc = -ENOMEM; goto dbfail; } @@ -584,12 +640,21 @@ int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) resp.pdid = pd->qplib_pd.id; /* Still allow mapping this DBR to the new user PD. */ resp.dpi = ucntx->dpi.dpi; - resp.dbr = (u64)ucntx->dpi.umdbr; - rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + entry = bnxt_re_mmap_entry_insert(ucntx, (u64)ucntx->dpi.umdbr, + BNXT_RE_MMAP_UC_DB, &resp.dbr); + + if (!entry) { + rc = -ENOMEM; + goto dbfail; + } + + pd->pd_db_mmap = &entry->rdma_entry; + + rc = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen)); if (rc) { - ibdev_err(&rdev->ibdev, - "Failed to copy user response\n"); + rdma_user_mmap_entry_remove(pd->pd_db_mmap); + rc = -EFAULT; goto dbfail; } } @@ -613,12 +678,20 @@ int bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags) { struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah); struct bnxt_re_dev *rdev = ah->rdev; + bool block = true; + int rc = 0; - bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah, - !(flags & RDMA_DESTROY_AH_SLEEPABLE)); + block = !(flags & RDMA_DESTROY_AH_SLEEPABLE); + rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah, block); + if (BNXT_RE_CHECK_RC(rc)) { + if (rc == -ETIMEDOUT) + rc = 0; + else + goto fail; + } atomic_dec(&rdev->ah_count); - - return 0; +fail: + return rc; } static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype) @@ -3341,9 +3414,7 @@ static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *gsi_qp, udwr.remote_qkey = gsi_sqp->qplib_qp.qkey; /* post data received in the send queue */ - rc = bnxt_re_post_send_shadow_qp(rdev, gsi_sqp, swr); - - return 0; + return bnxt_re_post_send_shadow_qp(rdev, gsi_sqp, swr); } static void bnxt_re_process_res_rawqp1_wc(struct ib_wc *wc, @@ -3956,6 +4027,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) container_of(ctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_user_mmap_entry *entry; struct bnxt_re_uctx_resp resp = {}; u32 chip_met_rev_num = 0; int rc; @@ -3994,6 +4066,16 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; resp.mode = rdev->chip_ctx->modes.wqe_mode; + if (rdev->chip_ctx->modes.db_push) + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED; + + entry = bnxt_re_mmap_entry_insert(uctx, 0, BNXT_RE_MMAP_SH_PAGE, NULL); + if (!entry) { + rc = -ENOMEM; + goto cfail; + } + uctx->shpage_mmap = &entry->rdma_entry; + rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (rc) { ibdev_err(ibdev, "Failed to copy user context"); @@ -4017,6 +4099,8 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) struct bnxt_re_dev *rdev = uctx->rdev; + rdma_user_mmap_entry_remove(uctx->shpage_mmap); + uctx->shpage_mmap = NULL; if (uctx->shpg) free_page((unsigned long)uctx->shpg); @@ -4024,8 +4108,7 @@ void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) /* Free DPI only if this is the first PD allocated by the * application and mark the context dpi as NULL */ - bnxt_qplib_dealloc_dpi(&rdev->qplib_res, - &rdev->qplib_res.dpi_tbl, &uctx->dpi); + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &uctx->dpi); uctx->dpi.dbr = NULL; } } @@ -4036,27 +4119,177 @@ 
int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) struct bnxt_re_ucontext *uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); - struct bnxt_re_dev *rdev = uctx->rdev; + struct bnxt_re_user_mmap_entry *bnxt_entry; + struct rdma_user_mmap_entry *rdma_entry; + int ret = 0; u64 pfn; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) + rdma_entry = rdma_user_mmap_entry_get(&uctx->ib_uctx, vma); + if (!rdma_entry) return -EINVAL; - if (vma->vm_pgoff) { - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - PAGE_SIZE, vma->vm_page_prot)) { - ibdev_err(&rdev->ibdev, "Failed to map DPI"); - return -EAGAIN; - } - } else { - pfn = virt_to_phys(uctx->shpg) >> PAGE_SHIFT; - if (remap_pfn_range(vma, vma->vm_start, - pfn, PAGE_SIZE, vma->vm_page_prot)) { - ibdev_err(&rdev->ibdev, "Failed to map shared page"); - return -EAGAIN; + bnxt_entry = container_of(rdma_entry, struct bnxt_re_user_mmap_entry, + rdma_entry); + + switch (bnxt_entry->mmap_flag) { + case BNXT_RE_MMAP_WC_DB: + pfn = bnxt_entry->mem_offset >> PAGE_SHIFT; + ret = rdma_user_mmap_io(ib_uctx, vma, pfn, PAGE_SIZE, + pgprot_writecombine(vma->vm_page_prot), + rdma_entry); + break; + case BNXT_RE_MMAP_UC_DB: + pfn = bnxt_entry->mem_offset >> PAGE_SHIFT; + ret = rdma_user_mmap_io(ib_uctx, vma, pfn, PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot), + rdma_entry); + break; + case BNXT_RE_MMAP_SH_PAGE: + ret = vm_insert_page(vma, vma->vm_start, virt_to_page(uctx->shpg)); + break; + default: + ret = -EINVAL; + break; + } + + rdma_user_mmap_entry_put(rdma_entry); + return ret; +} + +void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct bnxt_re_user_mmap_entry *bnxt_entry; + + bnxt_entry = container_of(rdma_entry, struct bnxt_re_user_mmap_entry, + rdma_entry); + + kfree(bnxt_entry); +} + +static int UVERBS_HANDLER(BNXT_RE_METHOD_ALLOC_PAGE)(struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, BNXT_RE_ALLOC_PAGE_HANDLE); + enum bnxt_re_alloc_page_type alloc_type; + struct bnxt_re_user_mmap_entry *entry; + enum bnxt_re_mmap_flag mmap_flag; + struct bnxt_qplib_chip_ctx *cctx; + struct bnxt_re_ucontext *uctx; + struct bnxt_re_dev *rdev; + u64 mmap_offset; + u32 length; + u32 dpi; + u64 dbr; + int err; + + uctx = container_of(ib_uverbs_get_ucontext(attrs), struct bnxt_re_ucontext, ib_uctx); + if (IS_ERR(uctx)) + return PTR_ERR(uctx); + + err = uverbs_get_const(&alloc_type, attrs, BNXT_RE_ALLOC_PAGE_TYPE); + if (err) + return err; + + rdev = uctx->rdev; + cctx = rdev->chip_ctx; + + switch (alloc_type) { + case BNXT_RE_ALLOC_WC_PAGE: + if (cctx->modes.db_push) { + if (bnxt_qplib_alloc_dpi(&rdev->qplib_res, &uctx->wcdpi, + uctx, BNXT_QPLIB_DPI_TYPE_WC)) + return -ENOMEM; + length = PAGE_SIZE; + dpi = uctx->wcdpi.dpi; + dbr = (u64)uctx->wcdpi.umdbr; + mmap_flag = BNXT_RE_MMAP_WC_DB; + } else { + return -EINVAL; } + + break; + + default: + return -EOPNOTSUPP; } + entry = bnxt_re_mmap_entry_insert(uctx, dbr, mmap_flag, &mmap_offset); + if (!entry) + return -ENOMEM; + + uobj->object = entry; + uverbs_finalize_uobj_create(attrs, BNXT_RE_ALLOC_PAGE_HANDLE); + err = uverbs_copy_to(attrs, BNXT_RE_ALLOC_PAGE_MMAP_OFFSET, + &mmap_offset, sizeof(mmap_offset)); + if (err) + return err; + + err = uverbs_copy_to(attrs, BNXT_RE_ALLOC_PAGE_MMAP_LENGTH, + &length, sizeof(length)); + if (err) + return err; + + err = uverbs_copy_to(attrs, BNXT_RE_ALLOC_PAGE_DPI, + &dpi, sizeof(length)); + if (err) + return err; + + 
return 0; +} + +static int alloc_page_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct bnxt_re_user_mmap_entry *entry = uobject->object; + struct bnxt_re_ucontext *uctx = entry->uctx; + + switch (entry->mmap_flag) { + case BNXT_RE_MMAP_WC_DB: + if (uctx && uctx->wcdpi.dbr) { + struct bnxt_re_dev *rdev = uctx->rdev; + + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &uctx->wcdpi); + uctx->wcdpi.dbr = NULL; + } + break; + default: + goto exit; + } + rdma_user_mmap_entry_remove(&entry->rdma_entry); +exit: return 0; } + +DECLARE_UVERBS_NAMED_METHOD(BNXT_RE_METHOD_ALLOC_PAGE, + UVERBS_ATTR_IDR(BNXT_RE_ALLOC_PAGE_HANDLE, + BNXT_RE_OBJECT_ALLOC_PAGE, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(BNXT_RE_ALLOC_PAGE_TYPE, + enum bnxt_re_alloc_page_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(BNXT_RE_ALLOC_PAGE_MMAP_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(BNXT_RE_ALLOC_PAGE_MMAP_LENGTH, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(BNXT_RE_ALLOC_PAGE_DPI, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY(BNXT_RE_METHOD_DESTROY_PAGE, + UVERBS_ATTR_IDR(BNXT_RE_DESTROY_PAGE_HANDLE, + BNXT_RE_OBJECT_ALLOC_PAGE, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(BNXT_RE_OBJECT_ALLOC_PAGE, + UVERBS_TYPE_ALLOC_IDR(alloc_page_obj_cleanup), + &UVERBS_METHOD(BNXT_RE_METHOD_ALLOC_PAGE), + &UVERBS_METHOD(BNXT_RE_METHOD_DESTROY_PAGE)); + +const struct uapi_definition bnxt_re_uapi_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(BNXT_RE_OBJECT_ALLOC_PAGE), + {} +}; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 31f7e34040f7..32d9e9d09791 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -60,6 +60,8 @@ struct bnxt_re_pd { struct bnxt_re_dev *rdev; struct bnxt_qplib_pd qplib_pd; struct bnxt_re_fence_data fence; + struct rdma_user_mmap_entry *pd_db_mmap; + struct rdma_user_mmap_entry *pd_wcdb_mmap; }; struct bnxt_re_ah { @@ -134,8 +136,23 @@ struct bnxt_re_ucontext { struct ib_ucontext ib_uctx; struct bnxt_re_dev *rdev; struct bnxt_qplib_dpi dpi; + struct bnxt_qplib_dpi wcdpi; void *shpg; spinlock_t sh_lock; /* protect shpg */ + struct rdma_user_mmap_entry *shpage_mmap; +}; + +enum bnxt_re_mmap_flag { + BNXT_RE_MMAP_SH_PAGE, + BNXT_RE_MMAP_UC_DB, + BNXT_RE_MMAP_WC_DB, +}; + +struct bnxt_re_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; + struct bnxt_re_ucontext *uctx; + u64 mem_offset; + u8 mmap_flag; }; static inline u16 bnxt_re_get_swqe_size(int nsge) @@ -213,6 +230,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); +void bnxt_re_mmap_free(struct rdma_user_mmap_entry *rdma_entry); + unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp, unsigned long flags); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b9e2f89337e8..b42166fe7454 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -83,6 +83,45 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, unsigned long event, void *ptr); static struct bnxt_re_dev *bnxt_re_from_netdev(struct 
net_device *netdev); static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev); +static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); + +static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, + u32 *offset); +static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) +{ + struct bnxt_qplib_chip_ctx *cctx; + struct bnxt_en_dev *en_dev; + struct bnxt_qplib_res *res; + u32 l2db_len = 0; + u32 offset = 0; + u32 barlen; + int rc; + + res = &rdev->qplib_res; + en_dev = rdev->en_dev; + cctx = rdev->chip_ctx; + + /* Issue qcfg */ + rc = bnxt_re_hwrm_qcfg(rdev, &l2db_len, &offset); + if (rc) + dev_info(rdev_to_dev(rdev), + "Couldn't get DB bar size, Low latency framework is disabled\n"); + /* set register offsets for both UC and WC */ + res->dpi_tbl.ucreg.offset = res->is_vf ? BNXT_QPLIB_DBR_VF_DB_OFFSET : + BNXT_QPLIB_DBR_PF_DB_OFFSET; + res->dpi_tbl.wcreg.offset = res->dpi_tbl.ucreg.offset; + + /* If WC mapping is disabled by L2 driver then en_dev->l2_db_size + * is equal to the DB-Bar actual size. This indicates that L2 + * is mapping entire bar as UC-. RoCE driver can't enable WC mapping + * in such cases and DB-push will be disabled. + */ + barlen = pci_resource_len(res->pdev, RCFW_DBR_PCI_BAR_REGION); + if (cctx->modes.db_push && l2db_len && en_dev->l2_db_size != barlen) { + res->dpi_tbl.wcreg.offset = en_dev->l2_db_size; + dev_info(rdev_to_dev(rdev), "Low latency framework is enabled\n"); + } +} static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) { @@ -91,6 +130,9 @@ static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) cctx = rdev->chip_ctx; cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ? mode : BNXT_QPLIB_WQE_MODE_STATIC; + if (bnxt_re_hwrm_qcaps(rdev)) + dev_err(rdev_to_dev(rdev), + "Failed to query hwrm qcaps\n"); } static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) @@ -112,6 +154,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) { struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; + int rc; en_dev = rdev->en_dev; @@ -130,6 +173,12 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); bnxt_re_set_drv_mode(rdev, wqe_mode); + + bnxt_re_set_db_offset(rdev); + rc = bnxt_qplib_map_db_bar(&rdev->qplib_res); + if (rc) + return rc; + if (bnxt_qplib_determine_atomics(en_dev->pdev)) ibdev_info(&rdev->ibdev, "platform doesn't support global atomics."); @@ -283,15 +332,21 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) for (indx = 0; indx < rdev->num_msix; indx++) rdev->en_dev->msix_entries[indx].vector = ent[indx].vector; - bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, - false); + rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_ent[BNXT_RE_AEQ_IDX].vector, + false); + if (rc) { + ibdev_warn(&rdev->ibdev, "Failed to reinit CREQ\n"); + return; + } for (indx = BNXT_RE_NQ_IDX ; indx < rdev->num_msix; indx++) { nq = &rdev->nq[indx - 1]; rc = bnxt_qplib_nq_start_irq(nq, indx - 1, msix_ent[indx].vector, false); - if (rc) + if (rc) { ibdev_warn(&rdev->ibdev, "Failed to reinit NQ index %d\n", indx - 1); + return; + } } } @@ -315,12 +370,11 @@ static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) return rc; } -static void bnxt_re_init_hwrm_hdr(struct bnxt_re_dev *rdev, struct input *hdr, - u16 opcd, u16 crid, u16 trid) +static void bnxt_re_init_hwrm_hdr(struct input *hdr, u16 opcd) { hdr->req_type = cpu_to_le16(opcd); - hdr->cmpl_ring = cpu_to_le16(crid); - hdr->target_id = 
cpu_to_le16(trid); + hdr->cmpl_ring = cpu_to_le16(-1); + hdr->target_id = cpu_to_le16(-1); } static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, @@ -334,13 +388,60 @@ static void bnxt_re_fill_fw_msg(struct bnxt_fw_msg *fw_msg, void *msg, fw_msg->timeout = timeout; } +/* Query device config using common hwrm */ +static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, + u32 *offset) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_func_qcfg_output resp = {0}; + struct hwrm_func_qcfg_input req = {0}; + struct bnxt_fw_msg fw_msg; + int rc; + + memset(&fw_msg, 0, sizeof(fw_msg)); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCFG); + req.fid = cpu_to_le16(0xffff); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + rc = bnxt_send_msg(en_dev, &fw_msg); + if (!rc) { + *db_len = PAGE_ALIGN(le16_to_cpu(resp.l2_doorbell_bar_size_kb) * 1024); + *offset = PAGE_ALIGN(le16_to_cpu(resp.legacy_l2_db_size_kb) * 1024); + } + return rc; +} + +/* Query function capabilities using common hwrm */ +int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev) +{ + struct bnxt_en_dev *en_dev = rdev->en_dev; + struct hwrm_func_qcaps_output resp = {}; + struct hwrm_func_qcaps_input req = {}; + struct bnxt_qplib_chip_ctx *cctx; + struct bnxt_fw_msg fw_msg = {}; + int rc; + + cctx = rdev->chip_ctx; + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_FUNC_QCAPS); + req.fid = cpu_to_le16(0xffff); + bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, + sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); + + rc = bnxt_send_msg(en_dev, &fw_msg); + if (rc) + return rc; + cctx->modes.db_push = le32_to_cpu(resp.flags) & FUNC_QCAPS_RESP_FLAGS_WCB_PUSH_MODE; + + return 0; +} + static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, u16 fw_ring_id, int type) { struct bnxt_en_dev *en_dev; - struct hwrm_ring_free_input req = {0}; + struct hwrm_ring_free_input req = {}; struct hwrm_ring_free_output resp; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; if (!rdev) @@ -354,9 +455,7 @@ static int bnxt_re_net_ring_free(struct bnxt_re_dev *rdev, if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return 0; - memset(&fw_msg, 0, sizeof(fw_msg)); - - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_FREE, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_RING_FREE); req.ring_type = type; req.ring_id = cpu_to_le16(fw_ring_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, @@ -373,16 +472,15 @@ static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, u16 *fw_ring_id) { struct bnxt_en_dev *en_dev = rdev->en_dev; - struct hwrm_ring_alloc_input req = {0}; + struct hwrm_ring_alloc_input req = {}; struct hwrm_ring_alloc_output resp; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; if (!en_dev) return rc; - memset(&fw_msg, 0, sizeof(fw_msg)); - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_RING_ALLOC); req.enables = 0; req.page_tbl_addr = cpu_to_le64(ring_attr->dma_arr[0]); if (ring_attr->pages > 1) { @@ -411,7 +509,7 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, struct bnxt_en_dev *en_dev = rdev->en_dev; struct hwrm_stat_ctx_free_input req = {}; struct hwrm_stat_ctx_free_output resp = {}; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; if (!en_dev) @@ -420,9 +518,7 @@ static int bnxt_re_net_stats_ctx_free(struct bnxt_re_dev *rdev, 
if (test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags)) return 0; - memset(&fw_msg, 0, sizeof(fw_msg)); - - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_FREE, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_FREE); req.stat_ctx_id = cpu_to_le32(fw_stats_ctx_id); bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp, sizeof(resp), DFLT_HWRM_CMD_TIMEOUT); @@ -439,10 +535,10 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, u32 *fw_stats_ctx_id) { struct bnxt_qplib_chip_ctx *chip_ctx = rdev->chip_ctx; - struct hwrm_stat_ctx_alloc_output resp = {0}; - struct hwrm_stat_ctx_alloc_input req = {0}; + struct hwrm_stat_ctx_alloc_output resp = {}; + struct hwrm_stat_ctx_alloc_input req = {}; struct bnxt_en_dev *en_dev = rdev->en_dev; - struct bnxt_fw_msg fw_msg; + struct bnxt_fw_msg fw_msg = {}; int rc = -EINVAL; *fw_stats_ctx_id = INVALID_STATS_CTX_ID; @@ -450,9 +546,7 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, if (!en_dev) return rc; - memset(&fw_msg, 0, sizeof(fw_msg)); - - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_ALLOC, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_STAT_CTX_ALLOC); req.update_period_ms = cpu_to_le32(1000); req.stats_dma_addr = cpu_to_le64(dma_map); req.stats_dma_length = cpu_to_le16(chip_ctx->hw_stats_size); @@ -466,6 +560,10 @@ static int bnxt_re_net_stats_ctx_alloc(struct bnxt_re_dev *rdev, return rc; } +static void bnxt_re_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} + /* Device */ static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) @@ -532,6 +630,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .destroy_qp = bnxt_re_destroy_qp, .destroy_srq = bnxt_re_destroy_srq, .device_group = &bnxt_re_dev_attr_group, + .disassociate_ucontext = bnxt_re_disassociate_ucontext, .get_dev_fw_str = bnxt_re_query_fw_str, .get_dma_mr = bnxt_re_get_dma_mr, .get_hw_stats = bnxt_re_ib_get_hw_stats, @@ -539,6 +638,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .get_port_immutable = bnxt_re_get_port_immutable, .map_mr_sg = bnxt_re_map_mr_sg, .mmap = bnxt_re_mmap, + .mmap_free = bnxt_re_mmap_free, .modify_qp = bnxt_re_modify_qp, .modify_srq = bnxt_re_modify_srq, .poll_cq = bnxt_re_poll_cq, @@ -579,6 +679,9 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->dev.parent = &rdev->en_dev->pdev->dev; ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY; + if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) + ibdev->driver_def = bnxt_re_uapi_defs; + ib_set_device_ops(ibdev, &bnxt_re_dev_ops); ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1); if (ret) @@ -822,7 +925,6 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev) if (rdev->qplib_res.dpi_tbl.max) { bnxt_qplib_dealloc_dpi(&rdev->qplib_res, - &rdev->qplib_res.dpi_tbl, &rdev->dpi_privileged); } if (rdev->qplib_res.rcfw) { @@ -850,9 +952,9 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) if (rc) goto fail; - rc = bnxt_qplib_alloc_dpi(&rdev->qplib_res.dpi_tbl, + rc = bnxt_qplib_alloc_dpi(&rdev->qplib_res, &rdev->dpi_privileged, - rdev); + rdev, BNXT_QPLIB_DPI_TYPE_KERNEL); if (rc) goto dealloc_res; @@ -892,7 +994,6 @@ free_nq: bnxt_qplib_free_nq(&rdev->nq[i]); } bnxt_qplib_dealloc_dpi(&rdev->qplib_res, - &rdev->qplib_res.dpi_tbl, &rdev->dpi_privileged); dealloc_res: bnxt_qplib_free_res(&rdev->qplib_res); @@ -963,12 +1064,6 @@ static int bnxt_re_update_gid(struct bnxt_re_dev *rdev) if (!ib_device_try_get(&rdev->ibdev)) return 0; - if (!sgid_tbl) { - ibdev_err(&rdev->ibdev, 
"QPLIB: SGID table not allocated"); - rc = -EINVAL; - goto out; - } - for (index = 0; index < sgid_tbl->active; index++) { gid_idx = sgid_tbl->hw_id[index]; @@ -986,7 +1081,7 @@ static int bnxt_re_update_gid(struct bnxt_re_dev *rdev) rc = bnxt_qplib_update_sgid(sgid_tbl, &gid, gid_idx, rdev->qplib_res.netdev->dev_addr); } -out: + ib_device_put(&rdev->ibdev); return rc; } @@ -1039,14 +1134,13 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev = rdev->en_dev; - struct hwrm_ver_get_output resp = {0}; - struct hwrm_ver_get_input req = {0}; - struct bnxt_fw_msg fw_msg; + struct hwrm_ver_get_output resp = {}; + struct hwrm_ver_get_input req = {}; + struct bnxt_qplib_chip_ctx *cctx; + struct bnxt_fw_msg fw_msg = {}; int rc = 0; - memset(&fw_msg, 0, sizeof(fw_msg)); - bnxt_re_init_hwrm_hdr(rdev, (void *)&req, - HWRM_VER_GET, -1, -1); + bnxt_re_init_hwrm_hdr((void *)&req, HWRM_VER_GET); req.hwrm_intf_maj = HWRM_VERSION_MAJOR; req.hwrm_intf_min = HWRM_VERSION_MINOR; req.hwrm_intf_upd = HWRM_VERSION_UPDATE; @@ -1058,11 +1152,18 @@ static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) rc); return; } - rdev->qplib_ctx.hwrm_intf_ver = + + cctx = rdev->chip_ctx; + cctx->hwrm_intf_ver = (u64)le16_to_cpu(resp.hwrm_intf_major) << 48 | (u64)le16_to_cpu(resp.hwrm_intf_minor) << 32 | (u64)le16_to_cpu(resp.hwrm_intf_build) << 16 | le16_to_cpu(resp.hwrm_intf_patch); + + cctx->hwrm_cmd_max_timeout = le16_to_cpu(resp.max_req_timeout); + + if (!cctx->hwrm_cmd_max_timeout) + cctx->hwrm_cmd_max_timeout = RCFW_FW_STALL_MAX_TIMEOUT; } static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) @@ -1077,8 +1178,6 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) return rc; } dev_info(rdev_to_dev(rdev), "Device registered with IB successfully"); - ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, - &rdev->active_width); set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); event = netif_running(rdev->netdev) && netif_carrier_ok(rdev->netdev) ? 
@@ -1202,7 +1301,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX); vid = rdev->en_dev->msix_entries[BNXT_RE_AEQ_IDX].vector; rc = bnxt_qplib_enable_rcfw_channel(&rdev->rcfw, - vid, db_offt, rdev->is_virtfn, + vid, db_offt, &bnxt_re_aeq_handler); if (rc) { ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n", @@ -1336,6 +1435,10 @@ static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) { struct bnxt_qplib_cc_param cc_param = {}; + /* Do not enable congestion control on VFs */ + if (rdev->is_virtfn) + return; + /* Currently enabling only for GenP5 adapters */ if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) return; @@ -1495,6 +1598,7 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) */ set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + wake_up_all(&rdev->rcfw.cmdq.waitq); mutex_unlock(&bnxt_re_mutex); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index f139d4cd1712..91aed77ce40d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -399,6 +399,9 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance) void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill) { + if (!nq->requested) + return; + tasklet_disable(&nq->nq_tasklet); /* Mask h/w interrupt */ bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, false); @@ -406,11 +409,12 @@ void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill) synchronize_irq(nq->msix_vec); if (kill) tasklet_kill(&nq->nq_tasklet); - if (nq->requested) { - irq_set_affinity_hint(nq->msix_vec, NULL); - free_irq(nq->msix_vec, nq); - nq->requested = false; - } + + irq_set_affinity_hint(nq->msix_vec, NULL); + free_irq(nq->msix_vec, nq); + kfree(nq->name); + nq->name = NULL; + nq->requested = false; } void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) @@ -436,6 +440,7 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, int msix_vector, bool need_init) { + struct bnxt_qplib_res *res = nq->res; int rc; if (nq->requested) @@ -447,10 +452,17 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, else tasklet_enable(&nq->nq_tasklet); - snprintf(nq->name, sizeof(nq->name), "bnxt_qplib_nq-%d", nq_indx); + nq->name = kasprintf(GFP_KERNEL, "bnxt_re-nq-%d@pci:%s", + nq_indx, pci_name(res->pdev)); + if (!nq->name) + return -ENOMEM; rc = request_irq(nq->msix_vec, bnxt_qplib_nq_irq, 0, nq->name, nq); - if (rc) + if (rc) { + kfree(nq->name); + nq->name = NULL; + tasklet_disable(&nq->nq_tasklet); return rc; + } cpumask_clear(&nq->mask); cpumask_set_cpu(nq_indx, &nq->mask); @@ -461,7 +473,7 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, nq->msix_vec, nq_indx); } nq->requested = true; - bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, true); + bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, res->cctx, true); return rc; } @@ -471,7 +483,6 @@ static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq, u32 reg_offt) resource_size_t reg_base; struct bnxt_qplib_nq_db *nq_db; struct pci_dev *pdev; - int rc = 0; pdev = nq->pdev; nq_db = &nq->nq_db; @@ -481,8 +492,7 @@ static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq, u32 reg_offt) if (!nq_db->reg.bar_base) { dev_err(&pdev->dev, "QPLIB: NQ BAR region %d resc start is 0!", nq_db->reg.bar_id); - rc = 
-ENOMEM; - goto fail; + return -ENOMEM; } reg_base = nq_db->reg.bar_base + reg_offt; @@ -492,15 +502,14 @@ static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq, u32 reg_offt) if (!nq_db->reg.bar_reg) { dev_err(&pdev->dev, "QPLIB: NQ BAR region %d mapping failed", nq_db->reg.bar_id); - rc = -ENOMEM; - goto fail; + return -ENOMEM; } nq_db->dbinfo.db = nq_db->reg.bar_reg; nq_db->dbinfo.hwq = &nq->hwq; nq_db->dbinfo.xid = nq->ring_id; -fail: - return rc; + + return 0; } int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, @@ -614,7 +623,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr); if (rc) - goto exit; + return rc; srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), GFP_KERNEL); @@ -659,7 +668,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, srq->dbinfo.xid = srq->id; srq->dbinfo.db = srq->dpi->dbr; srq->dbinfo.max_slot = 1; - srq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem; + srq->dbinfo.priv_db = res->dpi_tbl.priv_db; if (srq->threshold) bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA); srq->arm_req = false; @@ -668,7 +677,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, fail: bnxt_qplib_free_hwq(res, &srq->hwq); kfree(srq->swq); -exit: + return rc; } @@ -732,15 +741,14 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, struct rq_wqe *srqe; struct sq_sge *hw_sge; u32 sw_prod, sw_cons, count = 0; - int i, rc = 0, next; + int i, next; spin_lock(&srq_hwq->lock); if (srq->start_idx == srq->last_idx) { dev_err(&srq_hwq->pdev->dev, "FP: SRQ (0x%x) is full!\n", srq->id); - rc = -EINVAL; spin_unlock(&srq_hwq->lock); - goto done; + return -EINVAL; } next = srq->start_idx; srq->start_idx = srq->swq[next].next_idx; @@ -781,22 +789,19 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, srq->arm_req = false; bnxt_qplib_srq_arm_db(&srq->dbinfo, srq->threshold); } -done: - return rc; + + return 0; } /* QP */ static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que) { - int rc = 0; int indx; que->swq = kcalloc(que->max_wqe, sizeof(*que->swq), GFP_KERNEL); - if (!que->swq) { - rc = -ENOMEM; - goto out; - } + if (!que->swq) + return -ENOMEM; que->swq_start = 0; que->swq_last = que->max_wqe - 1; @@ -804,8 +809,8 @@ static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que) que->swq[indx].next_idx = indx + 1; que->swq[que->swq_last].next_idx = 0; /* Make it circular */ que->swq_last = 0; -out: - return rc; + + return 0; } int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) @@ -839,7 +844,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) - goto exit; + return rc; rc = bnxt_qplib_alloc_init_swq(sq); if (rc) @@ -927,7 +932,6 @@ sq_swq: kfree(sq->swq); fail_sq: bnxt_qplib_free_hwq(res, &sq->hwq); -exit: return rc; } @@ -992,7 +996,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) - goto exit; + return rc; rc = bnxt_qplib_alloc_init_swq(sq); if (rc) @@ -1140,7 +1144,6 @@ sq_swq: kfree(sq->swq); fail_sq: bnxt_qplib_free_hwq(res, &sq->hwq); -exit: return rc; } @@ -1614,7 +1617,7 @@ static int bnxt_qplib_put_inline(struct bnxt_qplib_qp *qp, il_src = (void *)wqe->sg_list[indx].addr; t_len += len; if (t_len > qp->max_inline_data) - goto bad; + return 
-ENOMEM; while (len) { if (pull_dst) { pull_dst = false; @@ -1638,8 +1641,6 @@ static int bnxt_qplib_put_inline(struct bnxt_qplib_qp *qp, } return t_len; -bad: - return -ENOMEM; } static u32 bnxt_qplib_put_sges(struct bnxt_qplib_hwq *hwq, @@ -2056,6 +2057,12 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) u32 pg_sz_lvl; int rc; + if (!cq->dpi) { + dev_err(&rcfw->pdev->dev, + "FP: CREATE_CQ failed due to NULL DPI\n"); + return -EINVAL; + } + hwq_attr.res = res; hwq_attr.depth = cq->max_wqe; hwq_attr.stride = sizeof(struct cq_base); @@ -2063,17 +2070,12 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) hwq_attr.sginfo = &cq->sg_info; rc = bnxt_qplib_alloc_init_hwq(&cq->hwq, &hwq_attr); if (rc) - goto exit; + return rc; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_CREATE_CQ, sizeof(req)); - if (!cq->dpi) { - dev_err(&rcfw->pdev->dev, - "FP: CREATE_CQ failed due to NULL DPI\n"); - return -EINVAL; - } req.dpi = cpu_to_le32(cq->dpi->dpi); req.cq_handle = cpu_to_le64(cq->cq_handle); req.cq_size = cpu_to_le32(cq->hwq.max_elements); @@ -2103,7 +2105,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) cq->dbinfo.hwq = &cq->hwq; cq->dbinfo.xid = cq->id; cq->dbinfo.db = cq->dpi->dbr; - cq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem; + cq->dbinfo.priv_db = res->dpi_tbl.priv_db; bnxt_qplib_armen_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMENA); @@ -2111,7 +2113,6 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) fail: bnxt_qplib_free_hwq(res, &cq->hwq); -exit: return rc; } @@ -2504,7 +2505,6 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; u32 wr_id_idx; - int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); @@ -2515,7 +2515,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); - goto done; + return 0; } cqe = *pcqe; @@ -2571,8 +2571,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, } } -done: - return rc; + return 0; } static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, @@ -2585,7 +2584,6 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; u32 wr_id_idx; - int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); @@ -2596,7 +2594,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); - goto done; + return 0; } cqe = *pcqe; cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; @@ -2658,8 +2656,8 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, bnxt_qplib_add_flush_qp(qp); } } -done: - return rc; + + return 0; } bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq) @@ -2686,7 +2684,6 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; u32 wr_id_idx; - int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); @@ -2697,7 +2694,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); - goto done; + return 0; } cqe = *pcqe; cqe->opcode = 
hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK; @@ -2766,8 +2763,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, } } -done: - return rc; + return 0; } static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, @@ -2789,11 +2785,8 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); - if (!qp) { - dev_err(&cq->hwq.pdev->dev, - "FP: CQ Process terminal qp is NULL\n"); + if (!qp) return -EINVAL; - } /* Must block new posting of SQ and RQ */ qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index d74d5ead2e32..a42820821c47 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -472,7 +472,7 @@ typedef int (*srqn_handler_t)(struct bnxt_qplib_nq *nq, struct bnxt_qplib_nq { struct pci_dev *pdev; struct bnxt_qplib_res *res; - char name[32]; + char *name; struct bnxt_qplib_hwq hwq; struct bnxt_qplib_nq_db nq_db; u16 ring_id; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index de9069103177..b30e66b64827 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -53,112 +53,289 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t); -/* Hardware communication channel */ +/** + * bnxt_qplib_map_rc - map return type based on opcode + * @opcode - roce slow path opcode + * + * case #1 + * Firmware initiated error recovery is a safe state machine and + * driver can consider all the underlying rdma resources are free. + * In this state, it is safe to return success for opcodes related to + * destroying rdma resources (like destroy qp, destroy cq etc.). + * + * case #2 + * If driver detect potential firmware stall, it is not safe state machine + * and the driver can not consider all the underlying rdma resources are + * freed. + * In this state, it is not safe to return success for opcodes related to + * destroying rdma resources (like destroy qp, destroy cq etc.). + * + * Scope of this helper function is only for case #1. + * + * Returns: + * 0 to communicate success to caller. + * Non zero error code to communicate failure to caller. + */ +static int bnxt_qplib_map_rc(u8 opcode) +{ + switch (opcode) { + case CMDQ_BASE_OPCODE_DESTROY_QP: + case CMDQ_BASE_OPCODE_DESTROY_SRQ: + case CMDQ_BASE_OPCODE_DESTROY_CQ: + case CMDQ_BASE_OPCODE_DEALLOCATE_KEY: + case CMDQ_BASE_OPCODE_DEREGISTER_MR: + case CMDQ_BASE_OPCODE_DELETE_GID: + case CMDQ_BASE_OPCODE_DESTROY_QP1: + case CMDQ_BASE_OPCODE_DESTROY_AH: + case CMDQ_BASE_OPCODE_DEINITIALIZE_FW: + case CMDQ_BASE_OPCODE_MODIFY_ROCE_CC: + case CMDQ_BASE_OPCODE_SET_LINK_AGGR_MODE: + return 0; + default: + return -ETIMEDOUT; + } +} + +/** + * bnxt_re_is_fw_stalled - Check firmware health + * @rcfw - rcfw channel instance of rdev + * @cookie - cookie to track the command + * + * If firmware has not responded any rcfw command within + * rcfw->max_timeout, consider firmware as stalled. 
+ * + * Returns: + * 0 if firmware is responding + * -ENODEV if firmware is not responding + */ +static int bnxt_re_is_fw_stalled(struct bnxt_qplib_rcfw *rcfw, + u16 cookie) +{ + struct bnxt_qplib_cmdq_ctx *cmdq; + struct bnxt_qplib_crsqe *crsqe; + + crsqe = &rcfw->crsqe_tbl[cookie]; + cmdq = &rcfw->cmdq; + + if (time_after(jiffies, cmdq->last_seen + + (rcfw->max_timeout * HZ))) { + dev_warn_ratelimited(&rcfw->pdev->dev, + "%s: FW STALL Detected. cmdq[%#x]=%#x waited (%d > %d) msec active %d ", + __func__, cookie, crsqe->opcode, + jiffies_to_msecs(jiffies - cmdq->last_seen), + rcfw->max_timeout * 1000, + crsqe->is_in_used); + return -ENODEV; + } + + return 0; +} + +/** + * __wait_for_resp - Don't hold the cpu context and wait for response + * @rcfw - rcfw channel instance of rdev + * @cookie - cookie to track the command + * + * Wait for command completion in sleepable context. + * + * Returns: + * 0 if command is completed by firmware. + * Non zero error code for rest of the case. + */ static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) { struct bnxt_qplib_cmdq_ctx *cmdq; - u16 cbit; - int rc; + struct bnxt_qplib_crsqe *crsqe; + int ret; cmdq = &rcfw->cmdq; - cbit = cookie % rcfw->cmdq_depth; - rc = wait_event_timeout(cmdq->waitq, - !test_bit(cbit, cmdq->cmdq_bitmap), - msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS)); - return rc ? 0 : -ETIMEDOUT; + crsqe = &rcfw->crsqe_tbl[cookie]; + + do { + if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) + return bnxt_qplib_map_rc(crsqe->opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; + + wait_event_timeout(cmdq->waitq, + !crsqe->is_in_used || + test_bit(ERR_DEVICE_DETACHED, &cmdq->flags), + msecs_to_jiffies(rcfw->max_timeout * 1000)); + + if (!crsqe->is_in_used) + return 0; + + bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); + + if (!crsqe->is_in_used) + return 0; + + ret = bnxt_re_is_fw_stalled(rcfw, cookie); + if (ret) + return ret; + + } while (true); }; +/** + * __block_for_resp - hold the cpu context and wait for response + * @rcfw - rcfw channel instance of rdev + * @cookie - cookie to track the command + * + * This function will hold the cpu (non-sleepable context) and + * wait for command completion. Maximum holding interval is 8 second. + * + * Returns: + * -ETIMEOUT if command is not completed in specific time interval. + * 0 if command is completed by firmware. + */ static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) { - u32 count = RCFW_BLOCKED_CMD_WAIT_COUNT; - struct bnxt_qplib_cmdq_ctx *cmdq; - u16 cbit; + struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; + struct bnxt_qplib_crsqe *crsqe; + unsigned long issue_time = 0; + + issue_time = jiffies; + crsqe = &rcfw->crsqe_tbl[cookie]; - cmdq = &rcfw->cmdq; - cbit = cookie % rcfw->cmdq_depth; - if (!test_bit(cbit, cmdq->cmdq_bitmap)) - goto done; do { + if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) + return bnxt_qplib_map_rc(crsqe->opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; + udelay(1); + bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); - } while (test_bit(cbit, cmdq->cmdq_bitmap) && --count); -done: - return count ? 0 : -ETIMEDOUT; + if (!crsqe->is_in_used) + return 0; + + } while (time_before(jiffies, issue_time + (8 * HZ))); + + return -ETIMEDOUT; }; -static int __send_message(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_cmdqmsg *msg) +/* __send_message_no_waiter - get cookie and post the message. 
+ * @rcfw - rcfw channel instance of rdev + * @msg - qplib message internal + * + * This function will just post and don't bother about completion. + * Current design of this function is - + * user must hold the completion queue hwq->lock. + * user must have used existing completion and free the resources. + * this function will not check queue full condition. + * this function will explicitly set is_waiter_alive=false. + * current use case is - send destroy_ah if create_ah is return + * after waiter of create_ah is lost. It can be extended for other + * use case as well. + * + * Returns: Nothing + * + */ +static void __send_message_no_waiter(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg) { struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; struct bnxt_qplib_hwq *hwq = &cmdq->hwq; struct bnxt_qplib_crsqe *crsqe; struct bnxt_qplib_cmdqe *cmdqe; u32 sw_prod, cmdq_prod; - struct pci_dev *pdev; - unsigned long flags; - u32 bsize, opcode; - u16 cookie, cbit; + u16 cookie; + u32 bsize; u8 *preq; - pdev = rcfw->pdev; + cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; + __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie)); + crsqe = &rcfw->crsqe_tbl[cookie]; - opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); - if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && - (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && - opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && - opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { - dev_err(&pdev->dev, - "RCFW not initialized, reject opcode 0x%x\n", opcode); - return -EINVAL; - } + /* Set cmd_size in terms of 16B slots in req. */ + bsize = bnxt_qplib_set_cmd_slots(msg->req); + /* GET_CMD_SIZE would return number of slots in either case of tlv + * and non-tlv commands after call to bnxt_qplib_set_cmd_slots() + */ + crsqe->is_internal_cmd = true; + crsqe->is_waiter_alive = false; + crsqe->is_in_used = true; + crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); - if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && - opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { - dev_err(&pdev->dev, "RCFW already initialized!\n"); - return -EINVAL; - } + preq = (u8 *)msg->req; + do { + /* Locate the next cmdq slot */ + sw_prod = HWQ_CMP(hwq->prod, hwq); + cmdqe = bnxt_qplib_get_qe(hwq, sw_prod, NULL); + /* Copy a segment of the req cmd to the cmdq */ + memset(cmdqe, 0, sizeof(*cmdqe)); + memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe))); + preq += min_t(u32, bsize, sizeof(*cmdqe)); + bsize -= min_t(u32, bsize, sizeof(*cmdqe)); + hwq->prod++; + } while (bsize > 0); + cmdq->seq_num++; - if (test_bit(FIRMWARE_TIMED_OUT, &cmdq->flags)) - return -ETIMEDOUT; + cmdq_prod = hwq->prod; + atomic_inc(&rcfw->timeout_send); + /* ring CMDQ DB */ + wmb(); + writel(cmdq_prod, cmdq->cmdq_mbox.prod); + writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); +} + +static int __send_message(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg, u8 opcode) +{ + u32 bsize, free_slots, required_slots; + struct bnxt_qplib_cmdq_ctx *cmdq; + struct bnxt_qplib_crsqe *crsqe; + struct bnxt_qplib_cmdqe *cmdqe; + struct bnxt_qplib_hwq *hwq; + u32 sw_prod, cmdq_prod; + struct pci_dev *pdev; + unsigned long flags; + u16 cookie; + u8 *preq; + + cmdq = &rcfw->cmdq; + hwq = &cmdq->hwq; + pdev = rcfw->pdev; /* Cmdq are in 16-byte units, each request can consume 1 or more * cmdqe */ spin_lock_irqsave(&hwq->lock, flags); - if (msg->req->cmd_size >= HWQ_FREE_SLOTS(hwq)) { - dev_err(&pdev->dev, "RCFW: CMDQ is full!\n"); + required_slots = bnxt_qplib_get_cmd_slots(msg->req); + 
free_slots = HWQ_FREE_SLOTS(hwq); + cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; + crsqe = &rcfw->crsqe_tbl[cookie]; + + if (required_slots >= free_slots) { + dev_info_ratelimited(&pdev->dev, + "CMDQ is full req/free %d/%d!", + required_slots, free_slots); spin_unlock_irqrestore(&hwq->lock, flags); return -EAGAIN; } - - - cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE; - cbit = cookie % rcfw->cmdq_depth; if (msg->block) cookie |= RCFW_CMD_IS_BLOCKING; - - set_bit(cbit, cmdq->cmdq_bitmap); __set_cmdq_base_cookie(msg->req, msg->req_sz, cpu_to_le16(cookie)); - crsqe = &rcfw->crsqe_tbl[cbit]; - if (crsqe->resp) { - spin_unlock_irqrestore(&hwq->lock, flags); - return -EBUSY; - } - /* change the cmd_size to the number of 16byte cmdq unit. - * req->cmd_size is modified here - */ bsize = bnxt_qplib_set_cmd_slots(msg->req); - - memset(msg->resp, 0, sizeof(*msg->resp)); + crsqe->free_slots = free_slots; crsqe->resp = (struct creq_qp_event *)msg->resp; crsqe->resp->cookie = cpu_to_le16(cookie); + crsqe->is_internal_cmd = false; + crsqe->is_waiter_alive = true; + crsqe->is_in_used = true; + crsqe->opcode = opcode; + crsqe->req_size = __get_cmdq_base_cmd_size(msg->req, msg->req_sz); if (__get_cmdq_base_resp_size(msg->req, msg->req_sz) && msg->sb) { struct bnxt_qplib_rcfw_sbuf *sbuf = msg->sb; - __set_cmdq_base_resp_addr(msg->req, msg->req_sz, cpu_to_le64(sbuf->dma_addr)); + + __set_cmdq_base_resp_addr(msg->req, msg->req_sz, + cpu_to_le64(sbuf->dma_addr)); __set_cmdq_base_resp_size(msg->req, msg->req_sz, - ALIGN(sbuf->size, BNXT_QPLIB_CMDQE_UNITS)); + ALIGN(sbuf->size, + BNXT_QPLIB_CMDQE_UNITS)); } preq = (u8 *)msg->req; @@ -166,11 +343,6 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, /* Locate the next cmdq slot */ sw_prod = HWQ_CMP(hwq->prod, hwq); cmdqe = bnxt_qplib_get_qe(hwq, sw_prod, NULL); - if (!cmdqe) { - dev_err(&pdev->dev, - "RCFW request failed with no cmdqe!\n"); - goto done; - } /* Copy a segment of the req cmd to the cmdq */ memset(cmdqe, 0, sizeof(*cmdqe)); memcpy(cmdqe, preq, min_t(u32, bsize, sizeof(*cmdqe))); @@ -180,7 +352,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, } while (bsize > 0); cmdq->seq_num++; - cmdq_prod = hwq->prod; + cmdq_prod = hwq->prod & 0xFFFF; if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) { /* The very first doorbell write * is required to set this flag @@ -194,51 +366,158 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, wmb(); writel(cmdq_prod, cmdq->cmdq_mbox.prod); writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db); -done: spin_unlock_irqrestore(&hwq->lock, flags); /* Return the CREQ response pointer */ return 0; } -int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_cmdqmsg *msg) +/** + * __poll_for_resp - self poll completion for rcfw command + * @rcfw - rcfw channel instance of rdev + * @cookie - cookie to track the command + * + * It works same as __wait_for_resp except this function will + * do self polling in sort interval since interrupt is disabled. + * This function can not be called from non-sleepable context. + * + * Returns: + * -ETIMEOUT if command is not completed in specific time interval. + * 0 if command is completed by firmware. 
+ */ +static int __poll_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie) +{ + struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; + struct bnxt_qplib_crsqe *crsqe; + unsigned long issue_time; + int ret; + + issue_time = jiffies; + crsqe = &rcfw->crsqe_tbl[cookie]; + + do { + if (test_bit(ERR_DEVICE_DETACHED, &cmdq->flags)) + return bnxt_qplib_map_rc(crsqe->opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; + + usleep_range(1000, 1001); + + bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet); + if (!crsqe->is_in_used) + return 0; + if (jiffies_to_msecs(jiffies - issue_time) > + (rcfw->max_timeout * 1000)) { + ret = bnxt_re_is_fw_stalled(rcfw, cookie); + if (ret) + return ret; + } + } while (true); +}; + +static int __send_message_basic_sanity(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg, + u8 opcode) +{ + struct bnxt_qplib_cmdq_ctx *cmdq; + + cmdq = &rcfw->cmdq; + + /* Prevent posting if f/w is not in a state to process */ + if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) + return bnxt_qplib_map_rc(opcode); + if (test_bit(FIRMWARE_STALL_DETECTED, &cmdq->flags)) + return -ETIMEDOUT; + + if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && + opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { + dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!"); + return -EINVAL; + } + + if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) && + (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && + opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && + opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { + dev_err(&rcfw->pdev->dev, + "QPLIB: RCFW not initialized, reject opcode 0x%x", + opcode); + return -EOPNOTSUPP; + } + + return 0; +} + +/* This function will just post and do not bother about completion */ +static void __destroy_timedout_ah(struct bnxt_qplib_rcfw *rcfw, + struct creq_create_ah_resp *create_ah_resp) +{ + struct bnxt_qplib_cmdqmsg msg = {}; + struct cmdq_destroy_ah req = {}; + + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, + CMDQ_BASE_OPCODE_DESTROY_AH, + sizeof(req)); + req.ah_cid = create_ah_resp->xid; + msg.req = (struct cmdq_base *)&req; + msg.req_sz = sizeof(req); + __send_message_no_waiter(rcfw, &msg); + dev_info_ratelimited(&rcfw->pdev->dev, + "From %s: ah_cid = %d timeout_send %d\n", + __func__, req.ah_cid, + atomic_read(&rcfw->timeout_send)); +} + +/** + * __bnxt_qplib_rcfw_send_message - qplib interface to send + * and complete rcfw command. + * @rcfw - rcfw channel instance of rdev + * @msg - qplib message internal + * + * This function does not account shadow queue depth. It will send + * all the command unconditionally as long as send queue is not full. + * + * Returns: + * 0 if command completed by firmware. + * Non zero if the command is not completed by firmware. 
+ */ +static int __bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg) { struct creq_qp_event *evnt = (struct creq_qp_event *)msg->resp; + struct bnxt_qplib_crsqe *crsqe; + unsigned long flags; u16 cookie; - u8 opcode, retry_cnt = 0xFF; int rc = 0; + u8 opcode; - /* Prevent posting if f/w is not in a state to process */ - if (test_bit(ERR_DEVICE_DETACHED, &rcfw->cmdq.flags)) - return 0; + opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); - do { - opcode = __get_cmdq_base_opcode(msg->req, msg->req_sz); - rc = __send_message(rcfw, msg); - cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) & - RCFW_MAX_COOKIE_VALUE; - if (!rc) - break; - if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY)) { - /* send failed */ - dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x send failed\n", - cookie, opcode); - return rc; - } - msg->block ? mdelay(1) : usleep_range(500, 1000); + rc = __send_message_basic_sanity(rcfw, msg, opcode); + if (rc) + return rc; - } while (retry_cnt--); + rc = __send_message(rcfw, msg, opcode); + if (rc) + return rc; + + cookie = le16_to_cpu(__get_cmdq_base_cookie(msg->req, msg->req_sz)) + & RCFW_MAX_COOKIE_VALUE; if (msg->block) rc = __block_for_resp(rcfw, cookie); - else + else if (atomic_read(&rcfw->rcfw_intr_enabled)) rc = __wait_for_resp(rcfw, cookie); + else + rc = __poll_for_resp(rcfw, cookie); + if (rc) { - /* timed out */ - dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", - cookie, opcode, RCFW_CMD_WAIT_TIME_MS); - set_bit(FIRMWARE_TIMED_OUT, &rcfw->cmdq.flags); - return rc; + spin_lock_irqsave(&rcfw->cmdq.hwq.lock, flags); + crsqe = &rcfw->crsqe_tbl[cookie]; + crsqe->is_waiter_alive = false; + if (rc == -ENODEV) + set_bit(FIRMWARE_STALL_DETECTED, &rcfw->cmdq.flags); + spin_unlock_irqrestore(&rcfw->cmdq.hwq.lock, flags); + return -ETIMEDOUT; } if (evnt->status) { @@ -250,6 +529,48 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, return rc; } + +/** + * bnxt_qplib_rcfw_send_message - qplib interface to send + * and complete rcfw command. + * @rcfw - rcfw channel instance of rdev + * @msg - qplib message internal + * + * Driver interact with Firmware through rcfw channel/slow path in two ways. + * a. Blocking rcfw command send. In this path, driver cannot hold + * the context for longer period since it is holding cpu until + * command is not completed. + * b. Non-blocking rcfw command send. In this path, driver can hold the + * context for longer period. There may be many pending command waiting + * for completion because of non-blocking nature. + * + * Driver will use shadow queue depth. Current queue depth of 8K + * (due to size of rcfw message there can be actual ~4K rcfw outstanding) + * is not optimal for rcfw command processing in firmware. + * + * Restrict at max #RCFW_CMD_NON_BLOCKING_SHADOW_QD Non-Blocking rcfw commands. + * Allow all blocking commands until there is no queue full. + * + * Returns: + * 0 if command completed by firmware. + * Non zero if the command is not completed by firmware. 
+ */ +int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_cmdqmsg *msg) +{ + int ret; + + if (!msg->block) { + down(&rcfw->rcfw_inflight); + ret = __bnxt_qplib_rcfw_send_message(rcfw, msg); + up(&rcfw->rcfw_inflight); + } else { + ret = __bnxt_qplib_rcfw_send_message(rcfw, msg); + } + + return ret; +} + /* Completions */ static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw, struct creq_func_event *func_event) @@ -295,19 +616,20 @@ static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw, } static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, - struct creq_qp_event *qp_event) + struct creq_qp_event *qp_event, + u32 *num_wait) { struct creq_qp_error_notification *err_event; struct bnxt_qplib_hwq *hwq = &rcfw->cmdq.hwq; struct bnxt_qplib_crsqe *crsqe; + u32 qp_id, tbl_indx, req_size; struct bnxt_qplib_qp *qp; - u16 cbit, blocked = 0; + u16 cookie, blocked = 0; + bool is_waiter_alive; struct pci_dev *pdev; unsigned long flags; - __le16 mcookie; - u16 cookie; + u32 wait_cmds = 0; int rc = 0; - u32 qp_id, tbl_indx; pdev = rcfw->pdev; switch (qp_event->event) { @@ -339,33 +661,60 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, spin_lock_irqsave_nested(&hwq->lock, flags, SINGLE_DEPTH_NESTING); cookie = le16_to_cpu(qp_event->cookie); - mcookie = qp_event->cookie; blocked = cookie & RCFW_CMD_IS_BLOCKING; cookie &= RCFW_MAX_COOKIE_VALUE; - cbit = cookie % rcfw->cmdq_depth; - crsqe = &rcfw->crsqe_tbl[cbit]; - if (crsqe->resp && - crsqe->resp->cookie == mcookie) { - memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); - crsqe->resp = NULL; - } else { - if (crsqe->resp && crsqe->resp->cookie) - dev_err(&pdev->dev, - "CMD %s cookie sent=%#x, recd=%#x\n", - crsqe->resp ? "mismatch" : "collision", - crsqe->resp ? crsqe->resp->cookie : 0, - mcookie); + crsqe = &rcfw->crsqe_tbl[cookie]; + crsqe->is_in_used = false; + + if (WARN_ONCE(test_bit(FIRMWARE_STALL_DETECTED, + &rcfw->cmdq.flags), + "QPLIB: Unreponsive rcfw channel detected.!!")) { + dev_info(&pdev->dev, + "rcfw timedout: cookie = %#x, free_slots = %d", + cookie, crsqe->free_slots); + spin_unlock_irqrestore(&hwq->lock, flags); + return rc; } - if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap)) - dev_warn(&pdev->dev, - "CMD bit %d was not requested\n", cbit); - hwq->cons += crsqe->req_size; + + if (crsqe->is_internal_cmd && !qp_event->status) + atomic_dec(&rcfw->timeout_send); + + if (crsqe->is_waiter_alive) { + if (crsqe->resp) + memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); + if (!blocked) + wait_cmds++; + } + + req_size = crsqe->req_size; + is_waiter_alive = crsqe->is_waiter_alive; + crsqe->req_size = 0; + if (!is_waiter_alive) + crsqe->resp = NULL; - if (!blocked) - wake_up(&rcfw->cmdq.waitq); + hwq->cons += req_size; + + /* This is a case to handle below scenario - + * Create AH is completed successfully by firmware, + * but completion took more time and driver already lost + * the context of create_ah from caller. + * We have already return failure for create_ah verbs, + * so let's destroy the same address vector since it is + * no more used in stack. We don't care about completion + * in __send_message_no_waiter. + * If destroy_ah is failued by firmware, there will be AH + * resource leak and relatively not critical + unlikely + * scenario. Current design is not to handle such case. 
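To make the shadow-queue throttling described above concrete, here is a minimal user-space analogue using a POSIX counting semaphore; SHADOW_QUEUE_DEPTH and submit_and_wait are hypothetical stand-ins for rcfw_inflight and the real send path:

#include <semaphore.h>

#define SHADOW_QUEUE_DEPTH 64	/* same value as RCFW_CMD_NON_BLOCKING_SHADOW_QD */

static sem_t inflight;

void inflight_init(void)
{
	sem_init(&inflight, 0, SHADOW_QUEUE_DEPTH);
}

int send_command(int (*submit_and_wait)(void *msg), void *msg, int blocking)
{
	int ret;

	if (!blocking) {
		/* Block here once SHADOW_QUEUE_DEPTH commands are pending. */
		sem_wait(&inflight);
		ret = submit_and_wait(msg);
		sem_post(&inflight);
	} else {
		/* Blocking commands are not throttled by the shadow depth. */
		ret = submit_and_wait(msg);
	}
	return ret;
}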
+ */ + if (!is_waiter_alive && !qp_event->status && + qp_event->event == CREQ_QP_EVENT_EVENT_CREATE_AH) + __destroy_timedout_ah(rcfw, + (struct creq_create_ah_resp *) + qp_event); spin_unlock_irqrestore(&hwq->lock, flags); } + *num_wait += wait_cmds; return rc; } @@ -379,6 +728,7 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) struct creq_base *creqe; u32 sw_cons, raw_cons; unsigned long flags; + u32 num_wakeup = 0; /* Service the CREQ until budget is over */ spin_lock_irqsave(&hwq->lock, flags); @@ -392,12 +742,14 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) * reading any further. */ dma_rmb(); + rcfw->cmdq.last_seen = jiffies; type = creqe->type & CREQ_BASE_TYPE_MASK; switch (type) { case CREQ_BASE_TYPE_QP_EVENT: bnxt_qplib_process_qp_event - (rcfw, (struct creq_qp_event *)creqe); + (rcfw, (struct creq_qp_event *)creqe, + &num_wakeup); creq->stats.creq_qp_event_processed++; break; case CREQ_BASE_TYPE_FUNC_EVENT: @@ -425,6 +777,8 @@ static void bnxt_qplib_service_creq(struct tasklet_struct *t) rcfw->res->cctx, true); } spin_unlock_irqrestore(&hwq->lock, flags); + if (num_wakeup) + wake_up_nr(&rcfw->cmdq.waitq, num_wakeup); } static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance) @@ -556,7 +910,6 @@ skip_ctx_setup: void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { - bitmap_free(rcfw->cmdq.cmdq_bitmap); kfree(rcfw->qp_tbl); kfree(rcfw->crsqe_tbl); bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq.hwq); @@ -593,13 +946,11 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, "HW channel CREQ allocation failed\n"); goto fail; } - if (ctx->hwrm_intf_ver < HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK) - rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_256; - else - rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_8192; + + rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT; sginfo.pgsize = bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth); - hwq_attr.depth = rcfw->cmdq_depth; + hwq_attr.depth = rcfw->cmdq_depth & 0x7FFFFFFF; hwq_attr.stride = BNXT_QPLIB_CMDQE_UNITS; hwq_attr.type = HWQ_TYPE_CTX; if (bnxt_qplib_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) { @@ -613,10 +964,6 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, if (!rcfw->crsqe_tbl) goto fail; - cmdq->cmdq_bitmap = bitmap_zalloc(rcfw->cmdq_depth, GFP_KERNEL); - if (!cmdq->cmdq_bitmap) - goto fail; - /* Allocate one extra to hold the QP1 entries */ rcfw->qp_tbl_size = qp_tbl_sz + 1; rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node), @@ -624,6 +971,8 @@ int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res, if (!rcfw->qp_tbl) goto fail; + rcfw->max_timeout = res->cctx->hwrm_cmd_max_timeout; + return 0; fail: @@ -636,6 +985,10 @@ void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill) struct bnxt_qplib_creq_ctx *creq; creq = &rcfw->creq; + + if (!creq->requested) + return; + tasklet_disable(&creq->creq_tasklet); /* Mask h/w interrupts */ bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false); @@ -644,17 +997,17 @@ void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill) if (kill) tasklet_kill(&creq->creq_tasklet); - if (creq->requested) { - free_irq(creq->msix_vec, rcfw); - creq->requested = false; - } + free_irq(creq->msix_vec, rcfw); + kfree(creq->irq_name); + creq->irq_name = NULL; + creq->requested = false; + atomic_set(&rcfw->rcfw_intr_enabled, 0); } void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) { struct bnxt_qplib_creq_ctx *creq; struct bnxt_qplib_cmdq_ctx *cmdq; - unsigned long indx; 
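The CREQ service loop above now counts how many waiters completed during one pass and wakes exactly that many via wake_up_nr(); a rough pthread sketch of the same idea follows (process_one and the wait-queue names are invented for illustration):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;

/* Drain up to 'budget' completion records under the lock, then signal
 * one waiter per completed, non-blocking command after dropping it.
 * Waiters (not shown) block in pthread_cond_wait() and recheck their
 * own completion flag when woken.
 */
void service_completion_queue(int (*process_one)(void), int budget)
{
	int num_wakeup = 0;

	pthread_mutex_lock(&lock);
	while (budget-- > 0)
		num_wakeup += process_one();	/* returns 1 if a waiter completed */
	pthread_mutex_unlock(&lock);

	while (num_wakeup-- > 0)
		pthread_cond_signal(&waitq);
}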
creq = &rcfw->creq; cmdq = &rcfw->cmdq; @@ -664,11 +1017,6 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) iounmap(cmdq->cmdq_mbox.reg.bar_reg); iounmap(creq->creq_db.reg.bar_reg); - indx = find_first_bit(cmdq->cmdq_bitmap, rcfw->cmdq_depth); - if (indx != rcfw->cmdq_depth) - dev_err(&rcfw->pdev->dev, - "disabling RCFW with pending cmd-bit %lx\n", indx); - cmdq->cmdq_mbox.reg.bar_reg = NULL; creq->creq_db.reg.bar_reg = NULL; creq->aeq_handler = NULL; @@ -679,9 +1027,11 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, bool need_init) { struct bnxt_qplib_creq_ctx *creq; + struct bnxt_qplib_res *res; int rc; creq = &rcfw->creq; + res = rcfw->res; if (creq->requested) return -EFAULT; @@ -691,24 +1041,32 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, tasklet_setup(&creq->creq_tasklet, bnxt_qplib_service_creq); else tasklet_enable(&creq->creq_tasklet); + + creq->irq_name = kasprintf(GFP_KERNEL, "bnxt_re-creq@pci:%s", + pci_name(res->pdev)); + if (!creq->irq_name) + return -ENOMEM; rc = request_irq(creq->msix_vec, bnxt_qplib_creq_irq, 0, - "bnxt_qplib_creq", rcfw); - if (rc) + creq->irq_name, rcfw); + if (rc) { + kfree(creq->irq_name); + creq->irq_name = NULL; + tasklet_disable(&creq->creq_tasklet); return rc; + } creq->requested = true; - bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, true); + bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, res->cctx, true); + atomic_inc(&rcfw->rcfw_intr_enabled); return 0; } -static int bnxt_qplib_map_cmdq_mbox(struct bnxt_qplib_rcfw *rcfw, bool is_vf) +static int bnxt_qplib_map_cmdq_mbox(struct bnxt_qplib_rcfw *rcfw) { struct bnxt_qplib_cmdq_mbox *mbox; resource_size_t bar_reg; struct pci_dev *pdev; - u16 prod_offt; - int rc = 0; pdev = rcfw->pdev; mbox = &rcfw->cmdq.cmdq_mbox; @@ -733,11 +1091,10 @@ static int bnxt_qplib_map_cmdq_mbox(struct bnxt_qplib_rcfw *rcfw, bool is_vf) return -ENOMEM; } - prod_offt = is_vf ? 
RCFW_VF_COMM_PROD_OFFSET : - RCFW_PF_COMM_PROD_OFFSET; - mbox->prod = (void __iomem *)(mbox->reg.bar_reg + prod_offt); + mbox->prod = (void __iomem *)(mbox->reg.bar_reg + + RCFW_PF_VF_COMM_PROD_OFFSET); mbox->db = (void __iomem *)(mbox->reg.bar_reg + RCFW_COMM_TRIG_OFFSET); - return rc; + return 0; } static int bnxt_qplib_map_creq_db(struct bnxt_qplib_rcfw *rcfw, u32 reg_offt) @@ -798,7 +1155,7 @@ static void bnxt_qplib_start_rcfw(struct bnxt_qplib_rcfw *rcfw) int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw, int msix_vector, - int cp_bar_reg_off, int virt_fn, + int cp_bar_reg_off, aeq_handler_t aeq_handler) { struct bnxt_qplib_cmdq_ctx *cmdq; @@ -818,7 +1175,7 @@ int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw, creq->stats.creq_func_event_processed = 0; creq->aeq_handler = aeq_handler; - rc = bnxt_qplib_map_cmdq_mbox(rcfw, virt_fn); + rc = bnxt_qplib_map_cmdq_mbox(rcfw); if (rc) return rc; @@ -834,6 +1191,7 @@ int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw, return rc; } + sema_init(&rcfw->rcfw_inflight, RCFW_CMD_NON_BLOCKING_SHADOW_QD); bnxt_qplib_start_rcfw(rcfw); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index dd5651478bbb..7b31bee3e000 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -45,13 +45,13 @@ #define RCFW_COMM_PCI_BAR_REGION 0 #define RCFW_COMM_CONS_PCI_BAR_REGION 2 #define RCFW_COMM_BASE_OFFSET 0x600 -#define RCFW_PF_COMM_PROD_OFFSET 0xc -#define RCFW_VF_COMM_PROD_OFFSET 0xc +#define RCFW_PF_VF_COMM_PROD_OFFSET 0xc #define RCFW_COMM_TRIG_OFFSET 0x100 #define RCFW_COMM_SIZE 0x104 #define RCFW_DBR_PCI_BAR_REGION 2 #define RCFW_DBR_BASE_PAGE_SHIFT 12 +#define RCFW_FW_STALL_MAX_TIMEOUT 40 /* Cmdq contains a fix number of a 16-Byte slots */ struct bnxt_qplib_cmdqe { @@ -67,11 +67,12 @@ static inline void bnxt_qplib_rcfw_cmd_prep(struct cmdq_base *req, req->cmd_size = cmd_size; } +/* Shadow queue depth for non blocking command */ +#define RCFW_CMD_NON_BLOCKING_SHADOW_QD 64 #define RCFW_CMD_WAIT_TIME_MS 20000 /* 20 Seconds timeout */ /* CMDQ elements */ -#define BNXT_QPLIB_CMDQE_MAX_CNT_256 256 -#define BNXT_QPLIB_CMDQE_MAX_CNT_8192 8192 +#define BNXT_QPLIB_CMDQE_MAX_CNT 8192 #define BNXT_QPLIB_CMDQE_BYTES(depth) ((depth) * BNXT_QPLIB_CMDQE_UNITS) static inline u32 bnxt_qplib_cmdqe_npages(u32 depth) @@ -89,6 +90,26 @@ static inline u32 bnxt_qplib_cmdqe_page_size(u32 depth) return (bnxt_qplib_cmdqe_npages(depth) * PAGE_SIZE); } +/* Get the number of command units required for the req. 
The + * function returns correct value only if called before + * setting using bnxt_qplib_set_cmd_slots + */ +static inline u32 bnxt_qplib_get_cmd_slots(struct cmdq_base *req) +{ + u32 cmd_units = 0; + + if (HAS_TLV_HEADER(req)) { + struct roce_tlv *tlv_req = (struct roce_tlv *)req; + + cmd_units = tlv_req->total_size; + } else { + cmd_units = (req->cmd_size + BNXT_QPLIB_CMDQE_UNITS - 1) / + BNXT_QPLIB_CMDQE_UNITS; + } + + return cmd_units; +} + static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) { u32 cmd_byte = 0; @@ -106,11 +127,10 @@ static inline u32 bnxt_qplib_set_cmd_slots(struct cmdq_base *req) return cmd_byte; } -#define RCFW_MAX_COOKIE_VALUE 0x7FFF +#define RCFW_MAX_COOKIE_VALUE (BNXT_QPLIB_CMDQE_MAX_CNT - 1) #define RCFW_CMD_IS_BLOCKING 0x8000 -#define RCFW_BLOCKED_CMD_WAIT_COUNT 20000000UL /* 20 sec */ -#define HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK 0x1000900020011ULL +#define HWRM_VERSION_DEV_ATTR_MAX_DPI 0x1000A0000000DULL /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { @@ -132,6 +152,12 @@ typedef int (*aeq_handler_t)(struct bnxt_qplib_rcfw *, void *, void *); struct bnxt_qplib_crsqe { struct creq_qp_event *resp; u32 req_size; + /* Free slots at the time of submission */ + u32 free_slots; + u8 opcode; + bool is_waiter_alive; + bool is_internal_cmd; + bool is_in_used; }; struct bnxt_qplib_rcfw_sbuf { @@ -149,7 +175,7 @@ struct bnxt_qplib_qp_node { #define FIRMWARE_INITIALIZED_FLAG (0) #define FIRMWARE_FIRST_FLAG (31) -#define FIRMWARE_TIMED_OUT (3) +#define FIRMWARE_STALL_DETECTED (3) #define ERR_DEVICE_DETACHED (4) struct bnxt_qplib_cmdq_mbox { @@ -163,7 +189,7 @@ struct bnxt_qplib_cmdq_ctx { struct bnxt_qplib_cmdq_mbox cmdq_mbox; wait_queue_head_t waitq; unsigned long flags; - unsigned long *cmdq_bitmap; + unsigned long last_seen; u32 seq_num; }; @@ -186,6 +212,7 @@ struct bnxt_qplib_creq_ctx { u16 ring_id; int msix_vec; bool requested; /*irq handler installed */ + char *irq_name; }; /* RCFW Communication Channels */ @@ -200,6 +227,11 @@ struct bnxt_qplib_rcfw { u64 oos_prev; u32 init_oos_stats; u32 cmdq_depth; + atomic_t rcfw_intr_enabled; + struct semaphore rcfw_inflight; + atomic_t timeout_send; + /* cached from chip cctx for quick reference in slow path */ + u16 max_timeout; }; struct bnxt_qplib_cmdqmsg { @@ -234,7 +266,7 @@ int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector, bool need_init); int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw, int msix_vector, - int cp_bar_reg_off, int virt_fn, + int cp_bar_reg_off, aeq_handler_t aeq_handler); struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf( diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 126d4f26f75a..5fd8f7c90bb0 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -215,17 +215,9 @@ int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq, return -EINVAL; hwq_attr->sginfo->npages = npages; } else { - unsigned long sginfo_num_pages = ib_umem_num_dma_blocks( - hwq_attr->sginfo->umem, hwq_attr->sginfo->pgsize); - + npages = ib_umem_num_dma_blocks(hwq_attr->sginfo->umem, + hwq_attr->sginfo->pgsize); hwq->is_user = true; - npages = sginfo_num_pages; - npages = (npages * PAGE_SIZE) / - BIT_ULL(hwq_attr->sginfo->pgshft); - if ((sginfo_num_pages * PAGE_SIZE) % - BIT_ULL(hwq_attr->sginfo->pgshft)) - if (!npages) - npages++; } if (npages == MAX_PBL_LVL_0_PGS && !hwq_attr->sginfo->nopte) { @@ -704,44 +696,76 @@ static int bnxt_qplib_alloc_pd_tbl(struct 
bnxt_qplib_res *res, } /* DPIs */ -int bnxt_qplib_alloc_dpi(struct bnxt_qplib_dpi_tbl *dpit, - struct bnxt_qplib_dpi *dpi, - void *app) +int bnxt_qplib_alloc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi *dpi, + void *app, u8 type) { + struct bnxt_qplib_dpi_tbl *dpit = &res->dpi_tbl; + struct bnxt_qplib_reg_desc *reg; u32 bit_num; + u64 umaddr; + + reg = &dpit->wcreg; + mutex_lock(&res->dpi_tbl_lock); bit_num = find_first_bit(dpit->tbl, dpit->max); - if (bit_num == dpit->max) + if (bit_num == dpit->max) { + mutex_unlock(&res->dpi_tbl_lock); return -ENOMEM; + } /* Found unused DPI */ clear_bit(bit_num, dpit->tbl); dpit->app_tbl[bit_num] = app; - dpi->dpi = bit_num; - dpi->dbr = dpit->dbr_bar_reg_iomem + (bit_num * PAGE_SIZE); - dpi->umdbr = dpit->unmapped_dbr + (bit_num * PAGE_SIZE); + dpi->bit = bit_num; + dpi->dpi = bit_num + (reg->offset - dpit->ucreg.offset) / PAGE_SIZE; + + umaddr = reg->bar_base + reg->offset + bit_num * PAGE_SIZE; + dpi->umdbr = umaddr; + + switch (type) { + case BNXT_QPLIB_DPI_TYPE_KERNEL: + /* privileged dbr was already mapped just initialize it. */ + dpi->umdbr = dpit->ucreg.bar_base + + dpit->ucreg.offset + bit_num * PAGE_SIZE; + dpi->dbr = dpit->priv_db; + dpi->dpi = dpi->bit; + break; + case BNXT_QPLIB_DPI_TYPE_WC: + dpi->dbr = ioremap_wc(umaddr, PAGE_SIZE); + break; + default: + dpi->dbr = ioremap(umaddr, PAGE_SIZE); + break; + } + dpi->type = type; + mutex_unlock(&res->dpi_tbl_lock); return 0; + } int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, - struct bnxt_qplib_dpi_tbl *dpit, - struct bnxt_qplib_dpi *dpi) + struct bnxt_qplib_dpi *dpi) { - if (dpi->dpi >= dpit->max) { - dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d\n", dpi->dpi); - return -EINVAL; - } - if (test_and_set_bit(dpi->dpi, dpit->tbl)) { - dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d\n", - dpi->dpi); + struct bnxt_qplib_dpi_tbl *dpit = &res->dpi_tbl; + + mutex_lock(&res->dpi_tbl_lock); + if (dpi->dpi && dpi->type != BNXT_QPLIB_DPI_TYPE_KERNEL) + pci_iounmap(res->pdev, dpi->dbr); + + if (test_and_set_bit(dpi->bit, dpit->tbl)) { + dev_warn(&res->pdev->dev, + "Freeing an unused DPI? 
dpi = %d, bit = %d\n", + dpi->dpi, dpi->bit); + mutex_unlock(&res->dpi_tbl_lock); return -EINVAL; } if (dpit->app_tbl) - dpit->app_tbl[dpi->dpi] = NULL; + dpit->app_tbl[dpi->bit] = NULL; memset(dpi, 0, sizeof(*dpi)); - + mutex_unlock(&res->dpi_tbl_lock); return 0; } @@ -750,52 +774,38 @@ static void bnxt_qplib_free_dpi_tbl(struct bnxt_qplib_res *res, { kfree(dpit->tbl); kfree(dpit->app_tbl); - if (dpit->dbr_bar_reg_iomem) - pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); - memset(dpit, 0, sizeof(*dpit)); + dpit->tbl = NULL; + dpit->app_tbl = NULL; + dpit->max = 0; } -static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, - struct bnxt_qplib_dpi_tbl *dpit, - u32 dbr_offset) +static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, + struct bnxt_qplib_dev_attr *dev_attr) { - u32 dbr_bar_reg = RCFW_DBR_PCI_BAR_REGION; - resource_size_t bar_reg_base; - u32 dbr_len, bytes; - - if (dpit->dbr_bar_reg_iomem) { - dev_err(&res->pdev->dev, "DBR BAR region %d already mapped\n", - dbr_bar_reg); - return -EALREADY; - } + struct bnxt_qplib_dpi_tbl *dpit; + struct bnxt_qplib_reg_desc *reg; + unsigned long bar_len; + u32 dbr_offset; + u32 bytes; - bar_reg_base = pci_resource_start(res->pdev, dbr_bar_reg); - if (!bar_reg_base) { - dev_err(&res->pdev->dev, "BAR region %d resc start failed\n", - dbr_bar_reg); - return -ENOMEM; - } + dpit = &res->dpi_tbl; + reg = &dpit->wcreg; - dbr_len = pci_resource_len(res->pdev, dbr_bar_reg) - dbr_offset; - if (!dbr_len || ((dbr_len & (PAGE_SIZE - 1)) != 0)) { - dev_err(&res->pdev->dev, "Invalid DBR length %d\n", dbr_len); - return -ENOMEM; + if (!bnxt_qplib_is_chip_gen_p5(res->cctx)) { + /* Offest should come from L2 driver */ + dbr_offset = dev_attr->l2_db_size; + dpit->ucreg.offset = dbr_offset; + dpit->wcreg.offset = dbr_offset; } - dpit->dbr_bar_reg_iomem = ioremap(bar_reg_base + dbr_offset, - dbr_len); - if (!dpit->dbr_bar_reg_iomem) { - dev_err(&res->pdev->dev, - "FP: DBR BAR region %d mapping failed\n", dbr_bar_reg); - return -ENOMEM; - } + bar_len = pci_resource_len(res->pdev, reg->bar_id); + dpit->max = (bar_len - reg->offset) / PAGE_SIZE; + if (dev_attr->max_dpi) + dpit->max = min_t(u32, dpit->max, dev_attr->max_dpi); - dpit->unmapped_dbr = bar_reg_base + dbr_offset; - dpit->max = dbr_len / PAGE_SIZE; - - dpit->app_tbl = kcalloc(dpit->max, sizeof(void *), GFP_KERNEL); + dpit->app_tbl = kcalloc(dpit->max, sizeof(void *), GFP_KERNEL); if (!dpit->app_tbl) - goto unmap_io; + return -ENOMEM; bytes = dpit->max >> 3; if (!bytes) @@ -805,17 +815,14 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, if (!dpit->tbl) { kfree(dpit->app_tbl); dpit->app_tbl = NULL; - goto unmap_io; + return -ENOMEM; } memset((u8 *)dpit->tbl, 0xFF, bytes); + dpit->priv_db = dpit->ucreg.bar_reg + dpit->ucreg.offset; return 0; -unmap_io: - pci_iounmap(res->pdev, dpit->dbr_bar_reg_iomem); - dpit->dbr_bar_reg_iomem = NULL; - return -ENOMEM; } /* Stats */ @@ -882,7 +889,7 @@ int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, if (rc) goto fail; - rc = bnxt_qplib_alloc_dpi_tbl(res, &res->dpi_tbl, dev_attr->l2_db_size); + rc = bnxt_qplib_alloc_dpi_tbl(res, dev_attr); if (rc) goto fail; @@ -892,6 +899,46 @@ fail: return rc; } +void bnxt_qplib_unmap_db_bar(struct bnxt_qplib_res *res) +{ + struct bnxt_qplib_reg_desc *reg; + + reg = &res->dpi_tbl.ucreg; + if (reg->bar_reg) + pci_iounmap(res->pdev, reg->bar_reg); + reg->bar_reg = NULL; + reg->bar_base = 0; + reg->len = 0; + reg->bar_id = 0; +} + +int bnxt_qplib_map_db_bar(struct bnxt_qplib_res *res) 
+{ + struct bnxt_qplib_reg_desc *ucreg; + struct bnxt_qplib_reg_desc *wcreg; + + wcreg = &res->dpi_tbl.wcreg; + wcreg->bar_id = RCFW_DBR_PCI_BAR_REGION; + wcreg->bar_base = pci_resource_start(res->pdev, wcreg->bar_id); + + ucreg = &res->dpi_tbl.ucreg; + ucreg->bar_id = RCFW_DBR_PCI_BAR_REGION; + ucreg->bar_base = pci_resource_start(res->pdev, ucreg->bar_id); + ucreg->len = ucreg->offset + PAGE_SIZE; + if (!ucreg->len || ((ucreg->len & (PAGE_SIZE - 1)) != 0)) { + dev_err(&res->pdev->dev, "QPLIB: invalid dbr length %d", + (int)ucreg->len); + return -EINVAL; + } + ucreg->bar_reg = ioremap(ucreg->bar_base, ucreg->len); + if (!ucreg->bar_reg) { + dev_err(&res->pdev->dev, "privileged dpi map failed!"); + return -ENOMEM; + } + + return 0; +} + int bnxt_qplib_determine_atomics(struct pci_dev *dev) { int comp; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 982e2c96dac2..d850a553821e 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -47,7 +47,7 @@ extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero; struct bnxt_qplib_drv_modes { u8 wqe_mode; - /* Other modes to follow here */ + bool db_push; }; struct bnxt_qplib_chip_ctx { @@ -55,9 +55,14 @@ struct bnxt_qplib_chip_ctx { u8 chip_rev; u8 chip_metal; u16 hw_stats_size; + u16 hwrm_cmd_max_timeout; struct bnxt_qplib_drv_modes modes; + u64 hwrm_intf_ver; }; +#define BNXT_QPLIB_DBR_PF_DB_OFFSET 0x10000 +#define BNXT_QPLIB_DBR_VF_DB_OFFSET 0x4000 + #define PTR_CNT_PER_PG (PAGE_SIZE / sizeof(void *)) #define PTR_MAX_IDX_PER_PG (PTR_CNT_PER_PG - 1) #define PTR_PG(x) (((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG) @@ -109,6 +114,7 @@ enum bnxt_qplib_hwrm_pg_size { struct bnxt_qplib_reg_desc { u8 bar_id; resource_size_t bar_base; + unsigned long offset; void __iomem *bar_reg; size_t len; }; @@ -185,18 +191,27 @@ struct bnxt_qplib_sgid_tbl { u8 *vlan; }; +enum { + BNXT_QPLIB_DPI_TYPE_KERNEL = 0, + BNXT_QPLIB_DPI_TYPE_UC = 1, + BNXT_QPLIB_DPI_TYPE_WC = 2 +}; + struct bnxt_qplib_dpi { u32 dpi; + u32 bit; void __iomem *dbr; u64 umdbr; + u8 type; }; struct bnxt_qplib_dpi_tbl { void **app_tbl; unsigned long *tbl; u16 max; - void __iomem *dbr_bar_reg_iomem; - u64 unmapped_dbr; + struct bnxt_qplib_reg_desc ucreg; /* Hold entire DB bar. 
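The reworked DPI allocation above hands out doorbell pages from a bitmap under a mutex (a set bit marks a free slot, as with the 0xFF-initialised table). A loose standalone sketch of that pattern, 64 slots wide, using GCC builtins and invented names:

#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t tbl_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t free_mask = ~0ULL;	/* bit set => slot is free */

/* Returns a free slot index, or -1 when the table is exhausted. */
int alloc_slot(void)
{
	int bit = -1;

	pthread_mutex_lock(&tbl_lock);
	if (free_mask) {
		bit = __builtin_ctzll(free_mask);	/* first free bit */
		free_mask &= ~(1ULL << bit);		/* mark it in use */
	}
	pthread_mutex_unlock(&tbl_lock);
	return bit;
}

/* Returns -1 if the slot was already free (double free), 0 otherwise.
 * The caller is expected to pass an index previously returned above.
 */
int free_slot(int bit)
{
	int ret = 0;

	pthread_mutex_lock(&tbl_lock);
	if (free_mask & (1ULL << bit))
		ret = -1;
	else
		free_mask |= 1ULL << bit;
	pthread_mutex_unlock(&tbl_lock);
	return ret;
}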
*/ + struct bnxt_qplib_reg_desc wcreg; + void __iomem *priv_db; }; struct bnxt_qplib_stats { @@ -241,7 +256,6 @@ struct bnxt_qplib_ctx { struct bnxt_qplib_tqm_ctx tqm_ctx; struct bnxt_qplib_stats stats; struct bnxt_qplib_vf_res vf_res; - u64 hwrm_intf_ver; }; struct bnxt_qplib_res { @@ -253,6 +267,8 @@ struct bnxt_qplib_res { struct bnxt_qplib_pd_tbl pd_tbl; struct bnxt_qplib_sgid_tbl sgid_tbl; struct bnxt_qplib_dpi_tbl dpi_tbl; + /* To protect the dpi table bit map */ + struct mutex dpi_tbl_lock; bool prio; bool is_vf; }; @@ -344,11 +360,10 @@ int bnxt_qplib_alloc_pd(struct bnxt_qplib_pd_tbl *pd_tbl, int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res, struct bnxt_qplib_pd_tbl *pd_tbl, struct bnxt_qplib_pd *pd); -int bnxt_qplib_alloc_dpi(struct bnxt_qplib_dpi_tbl *dpit, - struct bnxt_qplib_dpi *dpi, - void *app); +int bnxt_qplib_alloc_dpi(struct bnxt_qplib_res *res, + struct bnxt_qplib_dpi *dpi, + void *app, u8 type); int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, - struct bnxt_qplib_dpi_tbl *dpi_tbl, struct bnxt_qplib_dpi *dpi); void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); int bnxt_qplib_init_res(struct bnxt_qplib_res *res); @@ -361,6 +376,9 @@ void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, struct bnxt_qplib_ctx *ctx, bool virt_fn, bool is_p5); +int bnxt_qplib_map_db_bar(struct bnxt_qplib_res *res); +void bnxt_qplib_unmap_db_bar(struct bnxt_qplib_res *res); + int bnxt_qplib_determine_atomics(struct pci_dev *dev); static inline void bnxt_qplib_hwq_incr_prod(struct bnxt_qplib_hwq *hwq, u32 cnt) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 1714a1e23113..ab45f9d4bb02 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -170,6 +170,9 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->tqm_alloc_reqs[i * 4 + 3] = *(++tqm_alloc); } + if (rcfw->res->cctx->hwrm_intf_ver >= HWRM_VERSION_DEV_ATTR_MAX_DPI) + attr->max_dpi = le32_to_cpu(sb->max_dpi); + attr->is_atomic = bnxt_qplib_is_atomic_cap(rcfw); bail: bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); @@ -233,10 +236,6 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_rcfw *rcfw = res->rcfw; int index; - if (!sgid_tbl) { - dev_err(&res->pdev->dev, "SGID table not allocated\n"); - return -EINVAL; - } /* Do we need a sgid_lock here? */ if (!sgid_tbl->active) { dev_err(&res->pdev->dev, "SGID table has no active entries\n"); @@ -297,10 +296,6 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_rcfw *rcfw = res->rcfw; int i, free_idx; - if (!sgid_tbl) { - dev_err(&res->pdev->dev, "SGID table not allocated\n"); - return -EINVAL; - } /* Do we need a sgid_lock here? 
*/ if (sgid_tbl->active == sgid_tbl->max) { dev_err(&res->pdev->dev, "SGID table is full\n"); @@ -468,13 +463,14 @@ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, return 0; } -void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, - bool block) +int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, + bool block) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct creq_destroy_ah_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct cmdq_destroy_ah req = {}; + int rc; /* Clean up the AH table in the device */ bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, @@ -485,7 +481,8 @@ void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), sizeof(resp), block); - bnxt_qplib_rcfw_send_message(rcfw, &msg); + rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); + return rc; } /* MRW */ @@ -617,16 +614,15 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, /* Free the hwq if it already exist, must be a rereg */ if (mr->hwq.max_elements) bnxt_qplib_free_hwq(res, &mr->hwq); - /* Use system PAGE_SIZE */ hwq_attr.res = res; hwq_attr.depth = pages; - hwq_attr.stride = buf_pg_size; + hwq_attr.stride = sizeof(dma_addr_t); hwq_attr.type = HWQ_TYPE_MR; hwq_attr.sginfo = &sginfo; hwq_attr.sginfo->umem = umem; hwq_attr.sginfo->npages = pages; - hwq_attr.sginfo->pgsize = PAGE_SIZE; - hwq_attr.sginfo->pgshft = PAGE_SHIFT; + hwq_attr.sginfo->pgsize = buf_pg_size; + hwq_attr.sginfo->pgshft = ilog2(buf_pg_size); rc = bnxt_qplib_alloc_init_hwq(&mr->hwq, &hwq_attr); if (rc) { dev_err(&res->pdev->dev, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 5de874659cdf..264ef3cedc45 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -72,6 +72,7 @@ struct bnxt_qplib_dev_attr { u8 tqm_alloc_reqs[MAX_TQM_ALLOC_REQ]; bool is_atomic; u16 dev_cap_flags; + u32 max_dpi; }; struct bnxt_qplib_pd { @@ -327,8 +328,8 @@ int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, struct bnxt_qplib_ctx *ctx); int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, bool block); -void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, - bool block); +int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah, + bool block); int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw); int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 8eca6c14d0cf..2a195c4b0f17 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1403,7 +1403,7 @@ static int pbl_continuous_initialize(struct efa_dev *dev, */ static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl) { - u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE); + u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, EFA_CHUNK_PAYLOAD_SIZE); struct scatterlist *sgl; int sg_dma_cnt, err; diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index e819e4032490..f190111840e9 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -128,13 +128,8 @@ struct erdma_devattr { int numa_node; enum erdma_cc_alg cc; - u32 grp_num; u32 
irq_num; - bool disable_dwqe; - u16 dwqe_pages; - u16 dwqe_entries; - u32 max_qp; u32 max_send_wr; u32 max_recv_wr; @@ -215,15 +210,6 @@ struct erdma_dev { u32 next_alloc_qpn; u32 next_alloc_cqn; - spinlock_t db_bitmap_lock; - /* We provide max 64 uContexts that each has one SQ doorbell Page. */ - DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT); - /* - * We provide max 496 uContexts that each has one SQ normal Db, - * and one directWQE db. - */ - DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT); - atomic_t num_ctx; struct list_head cep_list; }; @@ -268,6 +254,8 @@ static inline u32 erdma_reg_read32_filed(struct erdma_dev *dev, u32 reg, return FIELD_GET(filed_mask, val); } +#define ERDMA_GET(val, name) FIELD_GET(ERDMA_CMD_##name##_MASK, val) + int erdma_cmdq_init(struct erdma_dev *dev); void erdma_finish_cmdq_init(struct erdma_dev *dev); void erdma_cmdq_destroy(struct erdma_dev *dev); diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 76ce2856be28..a882b57aa118 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -82,19 +82,6 @@ #define ERDMA_BAR_CQDB_SPACE_OFFSET \ (ERDMA_BAR_RQDB_SPACE_OFFSET + ERDMA_BAR_RQDB_SPACE_SIZE) -/* Doorbell page resources related. */ -/* - * Max # of parallelly issued directSQE is 3072 per device, - * hardware organizes this into 24 group, per group has 128 credits. - */ -#define ERDMA_DWQE_MAX_GRP_CNT 24 -#define ERDMA_DWQE_NUM_PER_GRP 128 - -#define ERDMA_DWQE_TYPE0_CNT 64 -#define ERDMA_DWQE_TYPE1_CNT 496 -/* type1 DB contains 2 DBs, takes 256Byte. */ -#define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16 - #define ERDMA_SDB_SHARED_PAGE_INDEX 95 /* Doorbell related. */ @@ -134,7 +121,7 @@ /* CMDQ related. */ #define ERDMA_CMDQ_MAX_OUTSTANDING 128 -#define ERDMA_CMDQ_SQE_SIZE 64 +#define ERDMA_CMDQ_SQE_SIZE 128 /* cmdq sub module definition. 
*/ enum CMDQ_WQE_SUB_MOD { @@ -159,6 +146,9 @@ enum CMDQ_COMMON_OPCODE { CMDQ_OPCODE_DESTROY_EQ = 1, CMDQ_OPCODE_QUERY_FW_INFO = 2, CMDQ_OPCODE_CONF_MTU = 3, + CMDQ_OPCODE_CONF_DEVICE = 5, + CMDQ_OPCODE_ALLOC_DB = 8, + CMDQ_OPCODE_FREE_DB = 9, }; /* cmdq-SQE HDR */ @@ -196,11 +186,41 @@ struct erdma_cmdq_destroy_eq_req { u8 qtype; }; +/* config device cfg */ +#define ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK BIT(31) +#define ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK GENMASK(4, 0) + +struct erdma_cmdq_config_device_req { + u64 hdr; + u32 cfg; + u32 rsvd[5]; +}; + struct erdma_cmdq_config_mtu_req { u64 hdr; u32 mtu; }; +/* ext db requests(alloc and free) cfg */ +#define ERDMA_CMD_EXT_DB_CQ_EN_MASK BIT(2) +#define ERDMA_CMD_EXT_DB_RQ_EN_MASK BIT(1) +#define ERDMA_CMD_EXT_DB_SQ_EN_MASK BIT(0) + +struct erdma_cmdq_ext_db_req { + u64 hdr; + u32 cfg; + u16 rdb_off; + u16 sdb_off; + u16 rsvd0; + u16 cdb_off; + u32 rsvd1[3]; +}; + +/* alloc db response qword 0 definition */ +#define ERDMA_CMD_ALLOC_DB_RESP_RDB_MASK GENMASK_ULL(63, 48) +#define ERDMA_CMD_ALLOC_DB_RESP_CDB_MASK GENMASK_ULL(47, 32) +#define ERDMA_CMD_ALLOC_DB_RESP_SDB_MASK GENMASK_ULL(15, 0) + /* create_cq cfg0 */ #define ERDMA_CMD_CREATE_CQ_DEPTH_MASK GENMASK(31, 24) #define ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK GENMASK(23, 20) @@ -209,8 +229,12 @@ struct erdma_cmdq_config_mtu_req { /* create_cq cfg1 */ #define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16) #define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15) +#define ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK BIT(11) #define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0) +/* create_cq cfg2 */ +#define ERDMA_CMD_CREATE_CQ_DB_CFG_MASK GENMASK(15, 0) + struct erdma_cmdq_create_cq_req { u64 hdr; u32 cfg0; @@ -219,6 +243,7 @@ struct erdma_cmdq_create_cq_req { u32 cfg1; u64 cq_db_info_addr; u32 first_page_offset; + u32 cfg2; }; /* regmr/deregmr cfg0 */ @@ -278,6 +303,7 @@ struct erdma_cmdq_modify_qp_req { /* create qp cqn_mtt_cfg */ #define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) +#define ERDMA_CMD_CREATE_QP_DB_CFG_MASK BIT(25) #define ERDMA_CMD_CREATE_QP_CQN_MASK GENMASK(23, 0) /* create qp mtt_cfg */ @@ -285,6 +311,10 @@ struct erdma_cmdq_modify_qp_req { #define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1) #define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0) +/* create qp db cfg */ +#define ERDMA_CMD_CREATE_QP_SQDB_CFG_MASK GENMASK(31, 16) +#define ERDMA_CMD_CREATE_QP_RQDB_CFG_MASK GENMASK(15, 0) + #define ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK GENMASK_ULL(31, 0) struct erdma_cmdq_create_qp_req { @@ -299,6 +329,11 @@ struct erdma_cmdq_create_qp_req { u32 rq_mtt_cfg; u64 sq_db_info_dma_addr; u64 rq_db_info_dma_addr; + + u64 sq_mtt_entry[3]; + u64 rq_mtt_entry[3]; + + u32 db_cfg; }; struct erdma_cmdq_destroy_qp_req { @@ -329,6 +364,7 @@ struct erdma_cmdq_reflush_req { enum { ERDMA_DEV_CAP_FLAGS_ATOMIC = 1 << 7, + ERDMA_DEV_CAP_FLAGS_EXTEND_DB = 1 << 3, }; #define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 7c74abeee864..0880c79a978c 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -130,33 +130,6 @@ static irqreturn_t erdma_comm_irq_handler(int irq, void *data) return IRQ_HANDLED; } -static void erdma_dwqe_resource_init(struct erdma_dev *dev) -{ - int total_pages, type0, type1; - - dev->attrs.grp_num = erdma_reg_read32(dev, ERDMA_REGS_GRP_NUM_REG); - - if (dev->attrs.grp_num < 4) - dev->attrs.disable_dwqe = true; - else - dev->attrs.disable_dwqe = 
false; - - /* One page contains 4 goups. */ - total_pages = dev->attrs.grp_num * 4; - - if (dev->attrs.grp_num >= ERDMA_DWQE_MAX_GRP_CNT) { - dev->attrs.grp_num = ERDMA_DWQE_MAX_GRP_CNT; - type0 = ERDMA_DWQE_TYPE0_CNT; - type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; - } else { - type1 = total_pages / 3; - type0 = total_pages - type1 - 1; - } - - dev->attrs.dwqe_pages = type0; - dev->attrs.dwqe_entries = type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE; -} - static int erdma_request_vectors(struct erdma_dev *dev) { int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC); @@ -199,8 +172,6 @@ static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) { int ret; - erdma_dwqe_resource_init(dev); - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(ERDMA_PCI_WIDTH)); if (ret) @@ -426,6 +397,22 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) return err; } +static int erdma_device_config(struct erdma_dev *dev) +{ + struct erdma_cmdq_config_device_req req = {}; + + if (!(dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_EXTEND_DB)) + return 0; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_CONF_DEVICE); + + req.cfg = FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK, PAGE_SHIFT) | + FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK, 1); + + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); +} + static int erdma_res_cb_init(struct erdma_dev *dev) { int i, j; @@ -512,6 +499,10 @@ static int erdma_ib_device_add(struct pci_dev *pdev) if (ret) return ret; + ret = erdma_device_config(dev); + if (ret) + return ret; + ibdev->node_type = RDMA_NODE_RNIC; memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); @@ -537,10 +528,6 @@ static int erdma_ib_device_add(struct pci_dev *pdev) if (ret) return ret; - spin_lock_init(&dev->db_bitmap_lock); - bitmap_zero(dev->sdb_page, ERDMA_DWQE_TYPE0_CNT); - bitmap_zero(dev->sdb_entry, ERDMA_DWQE_TYPE1_CNT); - atomic_set(&dev->num_ctx, 0); mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 83e1b0d55977..517676fbb8b1 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -19,10 +19,11 @@ #include "erdma_cm.h" #include "erdma_verbs.h" -static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp) +static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) { - struct erdma_cmdq_create_qp_req req; + struct erdma_dev *dev = to_edev(qp->ibqp.device); struct erdma_pd *pd = to_epd(qp->ibqp.pd); + struct erdma_cmdq_create_qp_req req; struct erdma_uqp *user_qp; u64 resp0, resp1; int err; @@ -93,6 +94,16 @@ static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp) req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr; req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; + + if (uctx->ext_db.enable) { + req.sq_cqn_mtt_cfg |= + FIELD_PREP(ERDMA_CMD_CREATE_QP_DB_CFG_MASK, 1); + req.db_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_SQDB_CFG_MASK, + uctx->ext_db.sdb_off) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_RQDB_CFG_MASK, + uctx->ext_db.rdb_off); + } } err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, @@ -146,11 +157,12 @@ post_cmd: return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); } -static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq) +static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) { + struct erdma_dev *dev = 
to_edev(cq->ibcq.device); struct erdma_cmdq_create_cq_req req; - u32 page_size; struct erdma_mem *mtt; + u32 page_size; erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_CREATE_CQ); @@ -192,6 +204,13 @@ static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq) req.first_page_offset = mtt->page_offset; req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; + + if (uctx->ext_db.enable) { + req.cfg1 |= FIELD_PREP( + ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK, 1); + req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_DB_CFG_MASK, + uctx->ext_db.cdb_off); + } } return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); @@ -753,7 +772,7 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, qp->attrs.state = ERDMA_QP_STATE_IDLE; INIT_DELAYED_WORK(&qp->reflush_dwork, erdma_flush_worker); - ret = create_qp_cmd(dev, qp); + ret = create_qp_cmd(uctx, qp); if (ret) goto err_out_cmd; @@ -1130,62 +1149,73 @@ void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry) kfree(entry); } -#define ERDMA_SDB_PAGE 0 -#define ERDMA_SDB_ENTRY 1 -#define ERDMA_SDB_SHARED 2 - -static void alloc_db_resources(struct erdma_dev *dev, - struct erdma_ucontext *ctx) +static int alloc_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx, + bool ext_db_en) { - u32 bitmap_idx; - struct erdma_devattr *attrs = &dev->attrs; - - if (attrs->disable_dwqe) - goto alloc_normal_db; - - /* Try to alloc independent SDB page. */ - spin_lock(&dev->db_bitmap_lock); - bitmap_idx = find_first_zero_bit(dev->sdb_page, attrs->dwqe_pages); - if (bitmap_idx != attrs->dwqe_pages) { - set_bit(bitmap_idx, dev->sdb_page); - spin_unlock(&dev->db_bitmap_lock); - - ctx->sdb_type = ERDMA_SDB_PAGE; - ctx->sdb_idx = bitmap_idx; - ctx->sdb_page_idx = bitmap_idx; - ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + - (bitmap_idx << PAGE_SHIFT); - ctx->sdb_page_off = 0; + struct erdma_cmdq_ext_db_req req = {}; + u64 val0, val1; + int ret; - return; + /* + * CAP_SYS_RAWIO is required if hardware does not support extend + * doorbell mechanism. 
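The extended-doorbell flow above packs three 16-bit doorbell offsets into one 64-bit response word and extracts them with FIELD_GET-style masks (RDB in bits 63..48, CDB in 47..32, SDB in 15..0). The small demo below uses hand-rolled GET_FIELD/PUT_FIELD macros as local stand-ins for the kernel helpers, with the same bit layout:

#include <stdint.h>
#include <stdio.h>

#define GET_FIELD(val, shift, width) \
	(((val) >> (shift)) & ((1ULL << (width)) - 1))
#define PUT_FIELD(val, shift, width) \
	(((uint64_t)(val) & ((1ULL << (width)) - 1)) << (shift))

int main(void)
{
	/* Compose a fake response word, then pull the offsets back out. */
	uint64_t resp = PUT_FIELD(0x1234, 48, 16) |	/* rdb_off */
			PUT_FIELD(0x0456, 32, 16) |	/* cdb_off */
			PUT_FIELD(0x0789, 0, 16);	/* sdb_off */

	printf("rdb 0x%llx cdb 0x%llx sdb 0x%llx\n",
	       (unsigned long long)GET_FIELD(resp, 48, 16),
	       (unsigned long long)GET_FIELD(resp, 32, 16),
	       (unsigned long long)GET_FIELD(resp, 0, 16));
	return 0;
}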
+ */ + if (!ext_db_en && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (!ext_db_en) { + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET; + ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; + ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; + return 0; } - bitmap_idx = find_first_zero_bit(dev->sdb_entry, attrs->dwqe_entries); - if (bitmap_idx != attrs->dwqe_entries) { - set_bit(bitmap_idx, dev->sdb_entry); - spin_unlock(&dev->db_bitmap_lock); + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_ALLOC_DB); + + req.cfg = FIELD_PREP(ERDMA_CMD_EXT_DB_CQ_EN_MASK, 1) | + FIELD_PREP(ERDMA_CMD_EXT_DB_RQ_EN_MASK, 1) | + FIELD_PREP(ERDMA_CMD_EXT_DB_SQ_EN_MASK, 1); + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &val0, &val1); + if (ret) + return ret; - ctx->sdb_type = ERDMA_SDB_ENTRY; - ctx->sdb_idx = bitmap_idx; - ctx->sdb_page_idx = attrs->dwqe_pages + - bitmap_idx / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; - ctx->sdb_page_off = bitmap_idx % ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + ctx->ext_db.enable = true; + ctx->ext_db.sdb_off = ERDMA_GET(val0, ALLOC_DB_RESP_SDB); + ctx->ext_db.rdb_off = ERDMA_GET(val0, ALLOC_DB_RESP_RDB); + ctx->ext_db.cdb_off = ERDMA_GET(val0, ALLOC_DB_RESP_CDB); - ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + - (ctx->sdb_page_idx << PAGE_SHIFT); + ctx->sdb = dev->func_bar_addr + (ctx->ext_db.sdb_off << PAGE_SHIFT); + ctx->cdb = dev->func_bar_addr + (ctx->ext_db.rdb_off << PAGE_SHIFT); + ctx->rdb = dev->func_bar_addr + (ctx->ext_db.cdb_off << PAGE_SHIFT); + + return 0; +} +static void free_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx) +{ + struct erdma_cmdq_ext_db_req req = {}; + int ret; + + if (!ctx->ext_db.enable) return; - } - spin_unlock(&dev->db_bitmap_lock); + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_FREE_DB); + + req.cfg = FIELD_PREP(ERDMA_CMD_EXT_DB_CQ_EN_MASK, 1) | + FIELD_PREP(ERDMA_CMD_EXT_DB_RQ_EN_MASK, 1) | + FIELD_PREP(ERDMA_CMD_EXT_DB_SQ_EN_MASK, 1); -alloc_normal_db: - ctx->sdb_type = ERDMA_SDB_SHARED; - ctx->sdb_idx = 0; - ctx->sdb_page_idx = ERDMA_SDB_SHARED_PAGE_INDEX; - ctx->sdb_page_off = 0; + req.sdb_off = ctx->ext_db.sdb_off; + req.rdb_off = ctx->ext_db.rdb_off; + req.cdb_off = ctx->ext_db.cdb_off; - ctx->sdb = dev->func_bar_addr + (ctx->sdb_page_idx << PAGE_SHIFT); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + if (ret) + ibdev_err_ratelimited(&dev->ibdev, + "free db resources failed %d", ret); } static void erdma_uctx_user_mmap_entries_remove(struct erdma_ucontext *uctx) @@ -1207,71 +1237,67 @@ int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) goto err_out; } - INIT_LIST_HEAD(&ctx->dbrecords_page_list); - mutex_init(&ctx->dbrecords_page_mutex); - - alloc_db_resources(dev, ctx); - - ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; - ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; - if (udata->outlen < sizeof(uresp)) { ret = -EINVAL; goto err_out; } + INIT_LIST_HEAD(&ctx->dbrecords_page_list); + mutex_init(&ctx->dbrecords_page_mutex); + + ret = alloc_db_resources(dev, ctx, + !!(dev->attrs.cap_flags & + ERDMA_DEV_CAP_FLAGS_EXTEND_DB)); + if (ret) + goto err_out; + ctx->sq_db_mmap_entry = erdma_user_mmap_entry_insert( ctx, (void *)ctx->sdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.sdb); if (!ctx->sq_db_mmap_entry) { ret = -ENOMEM; - goto err_out; + goto err_free_ext_db; } ctx->rq_db_mmap_entry = erdma_user_mmap_entry_insert( ctx, (void *)ctx->rdb, PAGE_SIZE, 
ERDMA_MMAP_IO_NC, &uresp.rdb); if (!ctx->rq_db_mmap_entry) { ret = -EINVAL; - goto err_out; + goto err_put_mmap_entries; } ctx->cq_db_mmap_entry = erdma_user_mmap_entry_insert( ctx, (void *)ctx->cdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.cdb); if (!ctx->cq_db_mmap_entry) { ret = -EINVAL; - goto err_out; + goto err_put_mmap_entries; } uresp.dev_id = dev->pdev->device; - uresp.sdb_type = ctx->sdb_type; - uresp.sdb_offset = ctx->sdb_page_off; ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (ret) - goto err_out; + goto err_put_mmap_entries; return 0; -err_out: +err_put_mmap_entries: erdma_uctx_user_mmap_entries_remove(ctx); + +err_free_ext_db: + free_db_resources(dev, ctx); + +err_out: atomic_dec(&dev->num_ctx); return ret; } void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) { - struct erdma_ucontext *ctx = to_ectx(ibctx); struct erdma_dev *dev = to_edev(ibctx->device); - - spin_lock(&dev->db_bitmap_lock); - if (ctx->sdb_type == ERDMA_SDB_PAGE) - clear_bit(ctx->sdb_idx, dev->sdb_page); - else if (ctx->sdb_type == ERDMA_SDB_ENTRY) - clear_bit(ctx->sdb_idx, dev->sdb_entry); + struct erdma_ucontext *ctx = to_ectx(ibctx); erdma_uctx_user_mmap_entries_remove(ctx); - - spin_unlock(&dev->db_bitmap_lock); - + free_db_resources(dev, ctx); atomic_dec(&dev->num_ctx); } @@ -1438,7 +1464,7 @@ int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_out_xa; } - ret = create_cq_cmd(dev, cq); + ret = create_cq_cmd(ctx, cq); if (ret) goto err_free_res; diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 131cf5f40982..429fc3063f98 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -31,13 +31,18 @@ struct erdma_user_mmap_entry { u8 mmap_flag; }; +struct erdma_ext_db_info { + bool enable; + u16 sdb_off; + u16 rdb_off; + u16 cdb_off; +}; + struct erdma_ucontext { struct ib_ucontext ibucontext; - u32 sdb_type; - u32 sdb_idx; - u32 sdb_page_idx; - u32 sdb_page_off; + struct erdma_ext_db_info ext_db; + u64 sdb; u64 rdb; u64 cdb; diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index 8973a081d641..e7d831330278 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -215,11 +215,11 @@ static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx, const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; ret = sdma_txadd_page(dd, - NULL, txreq, skb_frag_page(frag), frag->bv_offset, - skb_frag_size(frag)); + skb_frag_size(frag), + NULL, NULL, NULL); if (unlikely(ret)) break; } diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 1cea8b0c78e0..7a51f7d73b61 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -19,8 +19,7 @@ static int mmu_notifier_range_start(struct mmu_notifier *, const struct mmu_notifier_range *); static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, unsigned long, unsigned long); -static void do_remove(struct mmu_rb_handler *handler, - struct list_head *del_list); +static void release_immediate(struct kref *refcount); static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { @@ -106,7 +105,11 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) } spin_unlock_irqrestore(&handler->lock, flags); - do_remove(handler, &del_list); + while (!list_empty(&del_list)) { + rbnode = list_first_entry(&del_list, struct mmu_rb_node, list); + 
list_del(&rbnode->list); + kref_put(&rbnode->refcount, release_immediate); + } /* Now the mm may be freed. */ mmdrop(handler->mn.mm); @@ -121,7 +124,7 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, unsigned long flags; int ret = 0; - trace_hfi1_mmu_rb_insert(mnode->addr, mnode->len); + trace_hfi1_mmu_rb_insert(mnode); if (current->mm != handler->mn.mm) return -EPERM; @@ -134,12 +137,6 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, } __mmu_int_rb_insert(mnode, &handler->root); list_add_tail(&mnode->list, &handler->lru_list); - - ret = handler->ops->insert(handler->ops_arg, mnode); - if (ret) { - __mmu_int_rb_remove(mnode, &handler->root); - list_del(&mnode->list); /* remove from LRU list */ - } mnode->handler = handler; unlock: spin_unlock_irqrestore(&handler->lock, flags); @@ -183,6 +180,49 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, return node; } +/* + * Must NOT call while holding mnode->handler->lock. + * mnode->handler->ops->remove() may sleep and mnode->handler->lock is a + * spinlock. + */ +static void release_immediate(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + trace_hfi1_mmu_release_node(mnode); + mnode->handler->ops->remove(mnode->handler->ops_arg, mnode); +} + +/* Caller must hold mnode->handler->lock */ +static void release_nolock(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + list_move(&mnode->list, &mnode->handler->del_list); + queue_work(mnode->handler->wq, &mnode->handler->del_work); +} + +/* + * struct mmu_rb_node->refcount kref_put() callback. + * Adds mmu_rb_node to mmu_rb_node->handler->del_list and queues + * handler->del_work on handler->wq. + * Does not remove mmu_rb_node from handler->lru_list or handler->rb_root. + * Acquires mmu_rb_node->handler->lock; do not call while already holding + * handler->lock. 
+ */ +void hfi1_mmu_rb_release(struct kref *refcount) +{ + struct mmu_rb_node *mnode = + container_of(refcount, struct mmu_rb_node, refcount); + struct mmu_rb_handler *handler = mnode->handler; + unsigned long flags; + + spin_lock_irqsave(&handler->lock, flags); + list_move(&mnode->list, &mnode->handler->del_list); + spin_unlock_irqrestore(&handler->lock, flags); + queue_work(handler->wq, &handler->del_work); +} + void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) { struct mmu_rb_node *rbnode, *ptr; @@ -197,6 +237,10 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) spin_lock_irqsave(&handler->lock, flags); list_for_each_entry_safe(rbnode, ptr, &handler->lru_list, list) { + /* refcount == 1 implies mmu_rb_handler has only rbnode ref */ + if (kref_read(&rbnode->refcount) > 1) + continue; + if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg, &stop)) { __mmu_int_rb_remove(rbnode, &handler->root); @@ -209,7 +253,8 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) spin_unlock_irqrestore(&handler->lock, flags); list_for_each_entry_safe(rbnode, ptr, &del_list, list) { - handler->ops->remove(handler->ops_arg, rbnode); + trace_hfi1_mmu_rb_evict(rbnode); + kref_put(&rbnode->refcount, release_immediate); } } @@ -221,7 +266,6 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn, struct rb_root_cached *root = &handler->root; struct mmu_rb_node *node, *ptr = NULL; unsigned long flags; - bool added = false; spin_lock_irqsave(&handler->lock, flags); for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1); @@ -229,40 +273,18 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn, /* Guard against node removal. */ ptr = __mmu_int_rb_iter_next(node, range->start, range->end - 1); - trace_hfi1_mmu_mem_invalidate(node->addr, node->len); - if (handler->ops->invalidate(handler->ops_arg, node)) { - __mmu_int_rb_remove(node, root); - /* move from LRU list to delete list */ - list_move(&node->list, &handler->del_list); - added = true; - } + trace_hfi1_mmu_mem_invalidate(node); + /* Remove from rb tree and lru_list. */ + __mmu_int_rb_remove(node, root); + list_del_init(&node->list); + kref_put(&node->refcount, release_nolock); } spin_unlock_irqrestore(&handler->lock, flags); - if (added) - queue_work(handler->wq, &handler->del_work); - return 0; } /* - * Call the remove function for the given handler and the list. This - * is expected to be called with a delete list extracted from handler. - * The caller should not be holding the handler lock. - */ -static void do_remove(struct mmu_rb_handler *handler, - struct list_head *del_list) -{ - struct mmu_rb_node *node; - - while (!list_empty(del_list)) { - node = list_first_entry(del_list, struct mmu_rb_node, list); - list_del(&node->list); - handler->ops->remove(handler->ops_arg, node); - } -} - -/* * Work queue function to remove all nodes that have been queued up to * be removed. The key feature is that mm->mmap_lock is not being held * and the remove callback can sleep while taking it, if needed. 
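The mmu_rb rework above moves node teardown behind a kref: the last reference drop invokes a release callback, and callers holding the spinlock pass a callback that defers the sleepable remove() to a workqueue, while sleepable callers release immediately. A bare-bones C11 analogue of that put-with-release idiom, with invented names rather than the hfi1 symbols:

#include <stdatomic.h>

struct node {
	atomic_int refcount;	/* starts at 1 for the tree's own reference */
};

/* Take an additional reference, e.g. while a descriptor points at the node. */
static inline void node_get(struct node *n)
{
	atomic_fetch_add(&n->refcount, 1);
}

/* Drop a reference; only the final put runs the release callback.
 * Atomic-context callers pass a callback that defers the heavy cleanup
 * (for example by queueing it to a worker); sleepable callers can pass
 * one that cleans up on the spot.
 */
static inline void node_put(struct node *n, void (*release)(struct node *))
{
	if (atomic_fetch_sub(&n->refcount, 1) == 1)
		release(n);
}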
@@ -274,11 +296,17 @@ static void handle_remove(struct work_struct *work) del_work); struct list_head del_list; unsigned long flags; + struct mmu_rb_node *node; /* remove anything that is queued to get removed */ spin_lock_irqsave(&handler->lock, flags); list_replace_init(&handler->del_list, &del_list); spin_unlock_irqrestore(&handler->lock, flags); - do_remove(handler, &del_list); + while (!list_empty(&del_list)) { + node = list_first_entry(&del_list, struct mmu_rb_node, list); + list_del(&node->list); + trace_hfi1_mmu_release_node(node); + handler->ops->remove(handler->ops_arg, node); + } } diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h index c4da064188c9..751dc3fe1e02 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.h +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -16,18 +16,14 @@ struct mmu_rb_node { struct rb_node node; struct mmu_rb_handler *handler; struct list_head list; + struct kref refcount; }; -/* - * NOTE: filter, insert, invalidate, and evict must not sleep. Only remove is - * allowed to sleep. - */ +/* filter and evict must not sleep. Only remove is allowed to sleep. */ struct mmu_rb_ops { bool (*filter)(struct mmu_rb_node *node, unsigned long addr, unsigned long len); - int (*insert)(void *ops_arg, struct mmu_rb_node *mnode); void (*remove)(void *ops_arg, struct mmu_rb_node *mnode); - int (*invalidate)(void *ops_arg, struct mmu_rb_node *node); int (*evict)(void *ops_arg, struct mmu_rb_node *mnode, void *evict_arg, bool *stop); }; @@ -61,6 +57,8 @@ int hfi1_mmu_rb_register(void *ops_arg, void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, struct mmu_rb_node *mnode); +void hfi1_mmu_rb_release(struct kref *refcount); + void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler, unsigned long addr, diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index bb2552dd29c1..26c62162759b 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1593,7 +1593,20 @@ static inline void sdma_unmap_desc( struct hfi1_devdata *dd, struct sdma_desc *descp) { - system_descriptor_complete(dd, descp); + switch (sdma_mapping_type(descp)) { + case SDMA_MAP_SINGLE: + dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp), + sdma_mapping_len(descp), DMA_TO_DEVICE); + break; + case SDMA_MAP_PAGE: + dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp), + sdma_mapping_len(descp), DMA_TO_DEVICE); + break; + } + + if (descp->pinning_ctx && descp->ctx_put) + descp->ctx_put(descp->pinning_ctx); + descp->pinning_ctx = NULL; } /* @@ -3113,8 +3126,8 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, /* Add descriptor for coalesce buffer */ tx->desc_limit = MAX_DESC; - return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, NULL, tx, - addr, tx->tlen); + return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, + addr, tx->tlen, NULL, NULL, NULL); } return 1; @@ -3157,9 +3170,9 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) make_tx_sdma_desc( tx, SDMA_MAP_NONE, - NULL, dd->sdma_pad_phys, - sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); + sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)), + NULL, NULL, NULL); tx->num_desc++; _sdma_close_tx(dd, tx); return rval; diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 95aaec14c6c2..7fdebab202c4 100644 --- 
a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -594,9 +594,11 @@ static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d) static inline void make_tx_sdma_desc( struct sdma_txreq *tx, int type, - void *pinning_ctx, dma_addr_t addr, - size_t len) + size_t len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) { struct sdma_desc *desc = &tx->descp[tx->num_desc]; @@ -613,7 +615,11 @@ static inline void make_tx_sdma_desc( << SDMA_DESC0_PHY_ADDR_SHIFT) | (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK) << SDMA_DESC0_BYTE_COUNT_SHIFT); + desc->pinning_ctx = pinning_ctx; + desc->ctx_put = ctx_put; + if (pinning_ctx && ctx_get) + ctx_get(pinning_ctx); } /* helper to extend txreq */ @@ -645,18 +651,20 @@ static inline void _sdma_close_tx(struct hfi1_devdata *dd, static inline int _sdma_txadd_daddr( struct hfi1_devdata *dd, int type, - void *pinning_ctx, struct sdma_txreq *tx, dma_addr_t addr, - u16 len) + u16 len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) { int rval = 0; make_tx_sdma_desc( tx, type, - pinning_ctx, - addr, len); + addr, len, + pinning_ctx, ctx_get, ctx_put); WARN_ON(len > tx->tlen); tx->num_desc++; tx->tlen -= len; @@ -676,11 +684,18 @@ static inline int _sdma_txadd_daddr( /** * sdma_txadd_page() - add a page to the sdma_txreq * @dd: the device to use for mapping - * @pinning_ctx: context to be released at descriptor retirement * @tx: tx request to which the page is added * @page: page to map * @offset: offset within the page * @len: length in bytes + * @pinning_ctx: context to be stored on struct sdma_desc .pinning_ctx. Not + * added if coalesce buffer is used. E.g. pointer to pinned-page + * cache entry for the sdma_desc. + * @ctx_get: optional function to take reference to @pinning_ctx. Not called if + * @pinning_ctx is NULL. + * @ctx_put: optional function to release reference to @pinning_ctx after + * sdma_desc completes. May be called in interrupt context so must + * not sleep. Not called if @pinning_ctx is NULL. * * This is used to add a page/offset/length descriptor. 
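[Editor's note] The kernel-doc above introduces the optional @pinning_ctx/@ctx_get/@ctx_put triplet on sdma_txadd_page(). A sketch of how a caller might supply them, assuming a kref-counted cache entry; my_cache_entry and my_entry_* are illustrative names, not the driver's:

#include <linux/kref.h>

struct my_cache_entry {
	struct kref refcount;
	/* pinned pages, DMA state, ... */
};

static void my_entry_release(struct kref *kref)
{
	/* Must not sleep; real code would defer teardown to a workqueue. */
}

/* @ctx_get: called while the descriptor is built. */
static void my_entry_get(void *ctx)
{
	struct my_cache_entry *entry = ctx;

	kref_get(&entry->refcount);
}

/* @ctx_put: called at descriptor completion, possibly in IRQ context. */
static void my_entry_put(void *ctx)
{
	struct my_cache_entry *entry = ctx;

	kref_put(&entry->refcount, my_entry_release);
}

A caller already holding a reference on the entry would then pass entry, my_entry_get, my_entry_put as the trailing arguments to sdma_txadd_page(); because @ctx_put can run from interrupt context, the release path must not sleep.
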
* @@ -692,11 +707,13 @@ static inline int _sdma_txadd_daddr( */ static inline int sdma_txadd_page( struct hfi1_devdata *dd, - void *pinning_ctx, struct sdma_txreq *tx, struct page *page, unsigned long offset, - u16 len) + u16 len, + void *pinning_ctx, + void (*ctx_get)(void *), + void (*ctx_put)(void *)) { dma_addr_t addr; int rval; @@ -720,7 +737,8 @@ static inline int sdma_txadd_page( return -ENOSPC; } - return _sdma_txadd_daddr(dd, SDMA_MAP_PAGE, pinning_ctx, tx, addr, len); + return _sdma_txadd_daddr(dd, SDMA_MAP_PAGE, tx, addr, len, + pinning_ctx, ctx_get, ctx_put); } /** @@ -754,8 +772,8 @@ static inline int sdma_txadd_daddr( return rval; } - return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, NULL, tx, - addr, len); + return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len, + NULL, NULL, NULL); } /** @@ -801,7 +819,8 @@ static inline int sdma_txadd_kvaddr( return -ENOSPC; } - return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, NULL, tx, addr, len); + return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, addr, len, + NULL, NULL, NULL); } struct iowait_work; @@ -1034,6 +1053,4 @@ u16 sdma_get_descq_cnt(void); extern uint mod_num_sdma; void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); - -void system_descriptor_complete(struct hfi1_devdata *dd, struct sdma_desc *descp); #endif diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h index fad946cb5e0d..85ae7293c274 100644 --- a/drivers/infiniband/hw/hfi1/sdma_txreq.h +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -20,6 +20,8 @@ struct sdma_desc { /* private: don't use directly */ u64 qw[2]; void *pinning_ctx; + /* Release reference to @pinning_ctx. May be called in interrupt context. Must not sleep. */ + void (*ctx_put)(void *ctx); }; /** diff --git a/drivers/infiniband/hw/hfi1/trace_mmu.h b/drivers/infiniband/hw/hfi1/trace_mmu.h index 57900ebb7702..82cc12aa3fb8 100644 --- a/drivers/infiniband/hw/hfi1/trace_mmu.h +++ b/drivers/infiniband/hw/hfi1/trace_mmu.h @@ -15,31 +15,53 @@ #define TRACE_SYSTEM hfi1_mmu DECLARE_EVENT_CLASS(hfi1_mmu_rb_template, - TP_PROTO(unsigned long addr, unsigned long len), - TP_ARGS(addr, len), + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node), TP_STRUCT__entry(__field(unsigned long, addr) __field(unsigned long, len) + __field(unsigned int, refcount) ), - TP_fast_assign(__entry->addr = addr; - __entry->len = len; + TP_fast_assign(__entry->addr = node->addr; + __entry->len = node->len; + __entry->refcount = kref_read(&node->refcount); ), - TP_printk("MMU node addr 0x%lx, len %lu", + TP_printk("MMU node addr 0x%lx, len %lu, refcount %u", __entry->addr, - __entry->len + __entry->len, + __entry->refcount ) ); DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_insert, - TP_PROTO(unsigned long addr, unsigned long len), - TP_ARGS(addr, len)); + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); -DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_search, - TP_PROTO(unsigned long addr, unsigned long len), - TP_ARGS(addr, len)); +TRACE_EVENT(hfi1_mmu_rb_search, + TP_PROTO(unsigned long addr, unsigned long len), + TP_ARGS(addr, len), + TP_STRUCT__entry(__field(unsigned long, addr) + __field(unsigned long, len) + ), + TP_fast_assign(__entry->addr = addr; + __entry->len = len; + ), + TP_printk("MMU node addr 0x%lx, len %lu", + __entry->addr, + __entry->len + ) +); DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_mem_invalidate, - TP_PROTO(unsigned long addr, unsigned long len), - TP_ARGS(addr, len)); + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); + 
+DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_rb_evict, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); + +DEFINE_EVENT(hfi1_mmu_rb_template, hfi1_mmu_release_node, + TP_PROTO(struct mmu_rb_node *node), + TP_ARGS(node)); #endif /* __HFI1_TRACE_RC_H */ diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index ae58b48afe07..02bd62b857b7 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -62,18 +62,14 @@ static int defer_packet_queue( static void activate_packet_queue(struct iowait *wait, int reason); static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, unsigned long len); -static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode); static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2, bool *stop); static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); -static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode); static struct mmu_rb_ops sdma_rb_ops = { .filter = sdma_rb_filter, - .insert = sdma_rb_insert, .evict = sdma_rb_evict, .remove = sdma_rb_remove, - .invalidate = sdma_rb_invalidate }; static int add_system_pages_to_sdma_packet(struct user_sdma_request *req, @@ -247,14 +243,14 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, spin_unlock(&fd->pq_rcu_lock); synchronize_srcu(&fd->pq_srcu); /* at this point there can be no more new requests */ - if (pq->handler) - hfi1_mmu_rb_unregister(pq->handler); iowait_sdma_drain(&pq->busy); /* Wait until all requests have been freed. */ wait_event_interruptible( pq->wait, !atomic_read(&pq->n_reqs)); kfree(pq->reqs); + if (pq->handler) + hfi1_mmu_rb_unregister(pq->handler); bitmap_free(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); flush_pq_iowait(pq); @@ -1275,25 +1271,17 @@ static void free_system_node(struct sdma_mmu_node *node) kfree(node); } -static inline void acquire_node(struct sdma_mmu_node *node) -{ - atomic_inc(&node->refcount); - WARN_ON(atomic_read(&node->refcount) < 0); -} - -static inline void release_node(struct mmu_rb_handler *handler, - struct sdma_mmu_node *node) -{ - atomic_dec(&node->refcount); - WARN_ON(atomic_read(&node->refcount) < 0); -} - +/* + * kref_get()'s an additional kref on the returned rb_node to prevent rb_node + * from being released until after rb_node is assigned to an SDMA descriptor + * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the + * virtual address range for rb_node is invalidated between now and then. 
+ */ static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler, unsigned long start, unsigned long end) { struct mmu_rb_node *rb_node; - struct sdma_mmu_node *node; unsigned long flags; spin_lock_irqsave(&handler->lock, flags); @@ -1302,11 +1290,12 @@ static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler, spin_unlock_irqrestore(&handler->lock, flags); return NULL; } - node = container_of(rb_node, struct sdma_mmu_node, rb); - acquire_node(node); + + /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */ + kref_get(&rb_node->refcount); spin_unlock_irqrestore(&handler->lock, flags); - return node; + return container_of(rb_node, struct sdma_mmu_node, rb); } static int pin_system_pages(struct user_sdma_request *req, @@ -1355,6 +1344,13 @@ retry: return 0; } +/* + * kref refcount on *node_p will be 2 on successful addition: one kref from + * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being + * released until after *node_p is assigned to an SDMA descriptor (struct + * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual + * address range for *node_p is invalidated between now and then. + */ static int add_system_pinning(struct user_sdma_request *req, struct sdma_mmu_node **node_p, unsigned long start, unsigned long len) @@ -1368,6 +1364,12 @@ static int add_system_pinning(struct user_sdma_request *req, if (!node) return -ENOMEM; + /* First kref "moves" to mmu_rb_handler */ + kref_init(&node->rb.refcount); + + /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */ + kref_get(&node->rb.refcount); + node->pq = pq; ret = pin_system_pages(req, start, len, node, PFN_DOWN(len)); if (ret == 0) { @@ -1431,15 +1433,15 @@ static int get_system_cache_entry(struct user_sdma_request *req, return 0; } - SDMA_DBG(req, "prepend: node->rb.addr %lx, node->refcount %d", - node->rb.addr, atomic_read(&node->refcount)); + SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d", + node->rb.addr, kref_read(&node->rb.refcount)); prepend_len = node->rb.addr - start; /* * This node will not be returned, instead a new node * will be. So release the reference. 
*/ - release_node(handler, node); + kref_put(&node->rb.refcount, hfi1_mmu_rb_release); /* Prepend a node to cover the beginning of the allocation */ ret = add_system_pinning(req, node_p, start, prepend_len); @@ -1451,6 +1453,20 @@ static int get_system_cache_entry(struct user_sdma_request *req, } } +static void sdma_mmu_rb_node_get(void *ctx) +{ + struct mmu_rb_node *node = ctx; + + kref_get(&node->refcount); +} + +static void sdma_mmu_rb_node_put(void *ctx) +{ + struct sdma_mmu_node *node = ctx; + + kref_put(&node->rb.refcount, hfi1_mmu_rb_release); +} + static int add_mapping_to_sdma_packet(struct user_sdma_request *req, struct user_sdma_txreq *tx, struct sdma_mmu_node *cache_entry, @@ -1494,9 +1510,12 @@ static int add_mapping_to_sdma_packet(struct user_sdma_request *req, ctx = cache_entry; } - ret = sdma_txadd_page(pq->dd, ctx, &tx->txreq, + ret = sdma_txadd_page(pq->dd, &tx->txreq, cache_entry->pages[page_index], - page_offset, from_this_page); + page_offset, from_this_page, + ctx, + sdma_mmu_rb_node_get, + sdma_mmu_rb_node_put); if (ret) { /* * When there's a failure, the entire request is freed by @@ -1518,8 +1537,6 @@ static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req, struct user_sdma_iovec *iovec, size_t from_this_iovec) { - struct mmu_rb_handler *handler = req->pq->handler; - while (from_this_iovec > 0) { struct sdma_mmu_node *cache_entry; size_t from_this_cache_entry; @@ -1540,15 +1557,15 @@ static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req, ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start, from_this_cache_entry); + + /* + * Done adding cache_entry to zero or more sdma_desc. Can + * kref_put() the "safety" kref taken under + * get_system_cache_entry(). + */ + kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release); + if (ret) { - /* - * We're guaranteed that there will be no descriptor - * completion callback that releases this node - * because only the last descriptor referencing it - * has a context attached, and a failure means the - * last descriptor was never added. - */ - release_node(handler, cache_entry); SDMA_DBG(req, "add system segment failed %d", ret); return ret; } @@ -1599,42 +1616,12 @@ static int add_system_pages_to_sdma_packet(struct user_sdma_request *req, return 0; } -void system_descriptor_complete(struct hfi1_devdata *dd, - struct sdma_desc *descp) -{ - switch (sdma_mapping_type(descp)) { - case SDMA_MAP_SINGLE: - dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp), - sdma_mapping_len(descp), DMA_TO_DEVICE); - break; - case SDMA_MAP_PAGE: - dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp), - sdma_mapping_len(descp), DMA_TO_DEVICE); - break; - } - - if (descp->pinning_ctx) { - struct sdma_mmu_node *node = descp->pinning_ctx; - - release_node(node->rb.handler, node); - } -} - static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, unsigned long len) { return (bool)(node->addr == addr); } -static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) -{ - struct sdma_mmu_node *node = - container_of(mnode, struct sdma_mmu_node, rb); - - atomic_inc(&node->refcount); - return 0; -} - /* * Return 1 to remove the node from the rb tree and call the remove op. * @@ -1647,10 +1634,6 @@ static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, container_of(mnode, struct sdma_mmu_node, rb); struct evict_data *evict_data = evict_arg; - /* is this node still being used? 
*/ - if (atomic_read(&node->refcount)) - return 0; /* keep this node */ - /* this node will be evicted, add its pages to our count */ evict_data->cleared += node->npages; @@ -1668,13 +1651,3 @@ static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) free_system_node(node); } - -static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) -{ - struct sdma_mmu_node *node = - container_of(mnode, struct sdma_mmu_node, rb); - - if (!atomic_read(&node->refcount)) - return 1; - return 0; -} diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index a241836371dc..548347d4c5bc 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -104,7 +104,6 @@ struct hfi1_user_sdma_comp_q { struct sdma_mmu_node { struct mmu_rb_node rb; struct hfi1_user_sdma_pkt_q *pq; - atomic_t refcount; struct page **pages; unsigned int npages; }; diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index 727eedfba332..cc6324d2d1dd 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -64,11 +64,11 @@ static noinline int build_vnic_ulp_payload(struct sdma_engine *sde, /* combine physically continuous fragments later? */ ret = sdma_txadd_page(sde->dd, - NULL, &tx->txreq, skb_frag_page(frag), skb_frag_off(frag), - skb_frag_size(frag)); + skb_frag_size(frag), + NULL, NULL, NULL); if (unlikely(ret)) goto bail_txadd; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index aa8a08d1c014..47c0efed1821 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -595,11 +595,12 @@ int hns_roce_table_get(struct hns_roce_dev *hr_dev, } /* Set HEM base address(128K/page, pa) to Hardware */ - if (hr_dev->hw->set_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT)) { + ret = hr_dev->hw->set_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); + if (ret) { hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; - ret = -ENODEV; - dev_err(dev, "set HEM base address to HW failed.\n"); + dev_err(dev, "set HEM base address to HW failed, ret = %d.\n", + ret); goto out; } @@ -618,6 +619,7 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, u32 hop_num = mhop->hop_num; u32 chunk_ba_num; u32 step_idx; + int ret; index->inited = HEM_INDEX_BUF; chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN; @@ -641,16 +643,24 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, else step_idx = hop_num; - if (hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx)) - ibdev_warn(ibdev, "failed to clear hop%u HEM.\n", hop_num); - - if (index->inited & HEM_INDEX_L1) - if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1)) - ibdev_warn(ibdev, "failed to clear HEM step 1.\n"); + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx); + if (ret) + ibdev_warn(ibdev, "failed to clear hop%u HEM, ret = %d.\n", + hop_num, ret); + + if (index->inited & HEM_INDEX_L1) { + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 1); + if (ret) + ibdev_warn(ibdev, "failed to clear HEM step 1, ret = %d.\n", + ret); + } - if (index->inited & HEM_INDEX_L0) - if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0)) - ibdev_warn(ibdev, "failed to clear HEM step 0.\n"); + if (index->inited & HEM_INDEX_L0) { + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); + if (ret) + ibdev_warn(ibdev, "failed to clear HEM step 0, ret = %d.\n", + ret); + } } } @@ -687,6 +697,7 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, { 
struct device *dev = hr_dev->dev; unsigned long i; + int ret; if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_table_mhop_put(hr_dev, table, obj, 1); @@ -699,8 +710,10 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, &table->mutex)) return; - if (hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT)) - dev_warn(dev, "failed to clear HEM base address.\n"); + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT); + if (ret) + dev_warn(dev, "failed to clear HEM base address, ret = %d.\n", + ret); hns_roce_free_hem(hr_dev, table->hem[i]); table->hem[i] = NULL; @@ -916,6 +929,8 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, { struct device *dev = hr_dev->dev; unsigned long i; + int obj; + int ret; if (hns_roce_check_whether_mhop(hr_dev, table->type)) { hns_roce_cleanup_mhop_hem_table(hr_dev, table); @@ -924,9 +939,11 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, for (i = 0; i < table->num_hem; ++i) if (table->hem[i]) { - if (hr_dev->hw->clear_hem(hr_dev, table, - i * table->table_chunk_size / table->obj_size, 0)) - dev_err(dev, "clear HEM base address failed.\n"); + obj = i * table->table_chunk_size / table->obj_size; + ret = hr_dev->hw->clear_hem(hr_dev, table, obj, 0); + if (ret) + dev_err(dev, "clear HEM base address failed, ret = %d.\n", + ret); hns_roce_free_hem(hr_dev, table->hem[i]); } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 84f1167de1d9..8f7eb11066b4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -373,17 +373,10 @@ static int check_send_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { struct ib_device *ibdev = &hr_dev->ib_dev; - struct ib_qp *ibqp = &hr_qp->ibqp; - if (unlikely(ibqp->qp_type != IB_QPT_RC && - ibqp->qp_type != IB_QPT_GSI && - ibqp->qp_type != IB_QPT_UD)) { - ibdev_err(ibdev, "not supported QP(0x%x)type!\n", - ibqp->qp_type); - return -EOPNOTSUPP; - } else if (unlikely(hr_qp->state == IB_QPS_RESET || - hr_qp->state == IB_QPS_INIT || - hr_qp->state == IB_QPS_RTR)) { + if (unlikely(hr_qp->state == IB_QPS_RESET || + hr_qp->state == IB_QPS_INIT || + hr_qp->state == IB_QPS_RTR)) { ibdev_err(ibdev, "failed to post WQE, QP state %u!\n", hr_qp->state); return -EINVAL; @@ -771,17 +764,6 @@ out: static int check_recv_valid(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { - struct ib_device *ibdev = &hr_dev->ib_dev; - struct ib_qp *ibqp = &hr_qp->ibqp; - - if (unlikely(ibqp->qp_type != IB_QPT_RC && - ibqp->qp_type != IB_QPT_GSI && - ibqp->qp_type != IB_QPT_UD)) { - ibdev_err(ibdev, "unsupported qp type, qp_type = %d.\n", - ibqp->qp_type); - return -EOPNOTSUPP; - } - if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) return -EIO; @@ -4583,11 +4565,9 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, mtu = ib_mtu_enum_to_int(ib_mtu); if (WARN_ON(mtu <= 0)) return -EINVAL; -#define MAX_LP_MSG_LEN 16384 - /* MTU * (2 ^ LP_PKTN_INI) shouldn't be bigger than 16KB */ - lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / mtu); - if (WARN_ON(lp_pktn_ini >= 0xF)) - return -EINVAL; +#define MIN_LP_MSG_LEN 1024 + /* mtu * (2 ^ lp_pktn_ini) should be in the range of 1024 to mtu */ + lp_pktn_ini = ilog2(max(mtu, MIN_LP_MSG_LEN) / mtu); if (attr_mask & IB_QP_PATH_MTU) { hr_reg_write(context, QPC_MTU, ib_mtu); @@ -5012,7 +4992,6 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, static bool check_qp_timeout_cfg_range(struct hns_roce_dev *hr_dev, u8 
*timeout) { #define QP_ACK_TIMEOUT_MAX_HIP08 20 -#define QP_ACK_TIMEOUT_OFFSET 10 #define QP_ACK_TIMEOUT_MAX 31 if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { @@ -5021,7 +5000,7 @@ static bool check_qp_timeout_cfg_range(struct hns_roce_dev *hr_dev, u8 *timeout) "local ACK timeout shall be 0 to 20.\n"); return false; } - *timeout += QP_ACK_TIMEOUT_OFFSET; + *timeout += HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08; } else if (hr_dev->pci_dev->revision > PCI_REVISION_ID_HIP08) { if (*timeout > QP_ACK_TIMEOUT_MAX) { ibdev_warn(&hr_dev->ib_dev, @@ -5307,6 +5286,18 @@ out: return ret; } +static u8 get_qp_timeout_attr(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_qp_context *context) +{ + u8 timeout; + + timeout = (u8)hr_reg_read(context, QPC_AT); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + timeout -= HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08; + + return timeout; +} + static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) @@ -5384,7 +5375,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_attr->max_dest_rd_atomic = 1 << hr_reg_read(&context, QPC_RR_MAX); qp_attr->min_rnr_timer = (u8)hr_reg_read(&context, QPC_MIN_RNR_TIME); - qp_attr->timeout = (u8)hr_reg_read(&context, QPC_AT); + qp_attr->timeout = get_qp_timeout_attr(hr_dev, &context); qp_attr->retry_cnt = hr_reg_read(&context, QPC_RETRY_NUM_INIT); qp_attr->rnr_retry = hr_reg_read(&context, QPC_RNR_NUM_INIT); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 1b44d2434ab4..7033eae2407c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -44,6 +44,8 @@ #define HNS_ROCE_V2_MAX_XRCD_NUM 0x1000000 #define HNS_ROCE_V2_RSV_XRCD_NUM 0 +#define HNS_ROCE_V2_QP_ACK_TIMEOUT_OFS_HIP08 10 + #define HNS_ROCE_V3_SCCC_SZ 64 #define HNS_ROCE_V3_GMV_ENTRY_SZ 32 diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 37a5cf62f88b..14376490ac22 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -33,6 +33,7 @@ #include <linux/vmalloc.h> #include <rdma/ib_umem.h> +#include <linux/math.h> #include "hns_roce_device.h" #include "hns_roce_cmd.h" #include "hns_roce_hem.h" @@ -909,6 +910,44 @@ static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev, return page_cnt; } +static u64 cal_pages_per_l1ba(unsigned int ba_per_bt, unsigned int hopnum) +{ + return int_pow(ba_per_bt, hopnum - 1); +} + +static unsigned int cal_best_bt_pg_sz(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr, + unsigned int pg_shift) +{ + unsigned long cap = hr_dev->caps.page_size_cap; + struct hns_roce_buf_region *re; + unsigned int pgs_per_l1ba; + unsigned int ba_per_bt; + unsigned int ba_num; + int i; + + for_each_set_bit_from(pg_shift, &cap, sizeof(cap) * BITS_PER_BYTE) { + if (!(BIT(pg_shift) & cap)) + continue; + + ba_per_bt = BIT(pg_shift) / BA_BYTE_LEN; + ba_num = 0; + for (i = 0; i < mtr->hem_cfg.region_count; i++) { + re = &mtr->hem_cfg.region[i]; + if (re->hopnum == 0) + continue; + + pgs_per_l1ba = cal_pages_per_l1ba(ba_per_bt, re->hopnum); + ba_num += DIV_ROUND_UP(re->count, pgs_per_l1ba); + } + + if (ba_num <= ba_per_bt) + return pg_shift; + } + + return 0; +} + static int mtr_alloc_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, unsigned int ba_page_shift) { @@ -917,6 +956,10 @@ static int mtr_alloc_mtt(struct hns_roce_dev *hr_dev, 
struct hns_roce_mtr *mtr, hns_roce_hem_list_init(&mtr->hem_list); if (!cfg->is_direct) { + ba_page_shift = cal_best_bt_pg_sz(hr_dev, mtr, ba_page_shift); + if (!ba_page_shift) + return -ERANGE; + ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, cfg->region, cfg->region_count, ba_page_shift); diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 16183e894da7..dd428d915c17 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -93,16 +93,18 @@ static int irdma_nop_1(struct irdma_qp_uk *qp) */ void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx) { - __le64 *wqe; + struct irdma_qp_quanta *sq; u32 wqe_idx; if (!(qp_wqe_idx & 0x7F)) { wqe_idx = (qp_wqe_idx + 128) % qp->sq_ring.size; - wqe = qp->sq_base[wqe_idx].elem; + sq = qp->sq_base + wqe_idx; if (wqe_idx) - memset(wqe, qp->swqe_polarity ? 0 : 0xFF, 0x1000); + memset(sq, qp->swqe_polarity ? 0 : 0xFF, + 128 * sizeof(*sq)); else - memset(wqe, qp->swqe_polarity ? 0xFF : 0, 0x1000); + memset(sq, qp->swqe_polarity ? 0xFF : 0, + 128 * sizeof(*sq)); } } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index ab5cdf782785..9c4fe4fa9001 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -522,11 +522,6 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (!iwqp->user_mode) cancel_delayed_work_sync(&iwqp->dwork_flush); - irdma_qp_rem_ref(&iwqp->ibqp); - wait_for_completion(&iwqp->free_qp); - irdma_free_lsmm_rsrc(iwqp); - irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); - if (!iwqp->user_mode) { if (iwqp->iwscq) { irdma_clean_cqes(iwqp, iwqp->iwscq); @@ -534,6 +529,12 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) irdma_clean_cqes(iwqp, iwqp->iwrcq); } } + + irdma_qp_rem_ref(&iwqp->ibqp); + wait_for_completion(&iwqp->free_qp); + irdma_free_lsmm_rsrc(iwqp); + irdma_cqp_qp_destroy_cmd(&iwdev->rf->sc_dev, &iwqp->sc_qp); + irdma_remove_push_mmap_entries(iwqp); irdma_free_qp_rsrc(iwqp); @@ -3291,6 +3292,7 @@ static int irdma_post_send(struct ib_qp *ibqp, break; case IB_WR_LOCAL_INV: info.op_type = IRDMA_OP_TYPE_INV_STAG; + info.local_fence = info.read_fence; info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; err = irdma_uk_stag_local_invalidate(ukqp, &info, true); break; @@ -4450,8 +4452,16 @@ static const struct ib_device_ops irdma_roce_dev_ops = { }; static const struct ib_device_ops irdma_iw_dev_ops = { - .modify_qp = irdma_modify_qp, .get_port_immutable = irdma_iw_port_immutable, + .iw_accept = irdma_accept, + .iw_add_ref = irdma_qp_add_ref, + .iw_connect = irdma_connect, + .iw_create_listen = irdma_create_listen, + .iw_destroy_listen = irdma_destroy_listen, + .iw_get_qp = irdma_get_qp, + .iw_reject = irdma_reject, + .iw_rem_ref = irdma_qp_rem_ref, + .modify_qp = irdma_modify_qp, .query_gid = irdma_query_gid, }; @@ -4515,50 +4525,35 @@ static void irdma_init_roce_device(struct irdma_device *iwdev) * irdma_init_iw_device - initialization of iwarp rdma device * @iwdev: irdma device */ -static int irdma_init_iw_device(struct irdma_device *iwdev) +static void irdma_init_iw_device(struct irdma_device *iwdev) { struct net_device *netdev = iwdev->netdev; iwdev->ibdev.node_type = RDMA_NODE_RNIC; addrconf_addr_eui48((u8 *)&iwdev->ibdev.node_guid, netdev->dev_addr); - iwdev->ibdev.ops.iw_add_ref = irdma_qp_add_ref; - iwdev->ibdev.ops.iw_rem_ref = irdma_qp_rem_ref; - iwdev->ibdev.ops.iw_get_qp = irdma_get_qp; - 
iwdev->ibdev.ops.iw_connect = irdma_connect; - iwdev->ibdev.ops.iw_accept = irdma_accept; - iwdev->ibdev.ops.iw_reject = irdma_reject; - iwdev->ibdev.ops.iw_create_listen = irdma_create_listen; - iwdev->ibdev.ops.iw_destroy_listen = irdma_destroy_listen; memcpy(iwdev->ibdev.iw_ifname, netdev->name, sizeof(iwdev->ibdev.iw_ifname)); ib_set_device_ops(&iwdev->ibdev, &irdma_iw_dev_ops); - - return 0; } /** * irdma_init_rdma_device - initialization of rdma device * @iwdev: irdma device */ -static int irdma_init_rdma_device(struct irdma_device *iwdev) +static void irdma_init_rdma_device(struct irdma_device *iwdev) { struct pci_dev *pcidev = iwdev->rf->pcidev; - int ret; - if (iwdev->roce_mode) { + if (iwdev->roce_mode) irdma_init_roce_device(iwdev); - } else { - ret = irdma_init_iw_device(iwdev); - if (ret) - return ret; - } + else + irdma_init_iw_device(iwdev); + iwdev->ibdev.phys_port_cnt = 1; iwdev->ibdev.num_comp_vectors = iwdev->rf->ceqs_count; iwdev->ibdev.dev.parent = &pcidev->dev; ib_set_device_ops(&iwdev->ibdev, &irdma_dev_ops); - - return 0; } /** @@ -4596,9 +4591,7 @@ int irdma_ib_register_device(struct irdma_device *iwdev) { int ret; - ret = irdma_init_rdma_device(iwdev); - if (ret) - return ret; + irdma_init_rdma_device(iwdev); ret = ib_device_set_netdev(&iwdev->ibdev, iwdev->netdev, 1); if (ret) diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c index 54b61930a7fd..4b3b5b274e84 100644 --- a/drivers/infiniband/hw/mana/qp.c +++ b/drivers/infiniband/hw/mana/qp.c @@ -13,7 +13,7 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, u8 *rx_hash_key) { struct mana_port_context *mpc = netdev_priv(ndev); - struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_req_v2 *req; struct mana_cfg_rx_steer_resp resp = {}; mana_handle_t *req_indir_tab; struct gdma_context *gc; @@ -33,6 +33,8 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, sizeof(resp)); + req->hdr.req.msg_version = GDMA_MESSAGE_V2; + req->vport = mpc->port_handle; req->rx_enable = 1; req->update_default_rxobj = 1; @@ -46,6 +48,7 @@ static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE; req->indir_tab_offset = sizeof(*req); req->update_indir_tab = true; + req->cqe_coalescing_enable = 1; req_indir_tab = (mana_handle_t *)(req + 1); /* The ind table passed to the hardware must have diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 1c06920505d2..93257fa5aae8 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -209,7 +209,8 @@ static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev, !vport_qcounters_supported(dev)) || !port_num) return &dev->port[0].cnts; - return &dev->port[port_num - 1].cnts; + return is_mdev_switchdev_mode(dev->mdev) ? 
+ &dev->port[1].cnts : &dev->port[port_num - 1].cnts; } /** @@ -262,7 +263,7 @@ static struct rdma_hw_stats * mlx5_ib_alloc_hw_port_stats(struct ib_device *ibdev, u32 port_num) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - const struct mlx5_ib_counters *cnts = &dev->port[port_num - 1].cnts; + const struct mlx5_ib_counters *cnts = get_counters(dev, port_num); return do_alloc_stats(cnts); } @@ -329,6 +330,7 @@ static int mlx5_ib_query_q_counters_vport(struct mlx5_ib_dev *dev, { u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {}; + struct mlx5_core_dev *mdev; __be32 val; int ret, i; @@ -336,12 +338,16 @@ static int mlx5_ib_query_q_counters_vport(struct mlx5_ib_dev *dev, dev->port[port_num].rep->vport == MLX5_VPORT_UPLINK) return 0; + mdev = mlx5_eswitch_get_core_dev(dev->port[port_num].rep->esw); + if (!mdev) + return -EOPNOTSUPP; + MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER); MLX5_SET(query_q_counter_in, in, other_vport, 1); MLX5_SET(query_q_counter_in, in, vport_number, dev->port[port_num].rep->vport); MLX5_SET(query_q_counter_in, in, aggregate, 1); - ret = mlx5_cmd_exec_inout(dev->mdev, query_q_counter, in, out); + ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out); if (ret) return ret; @@ -575,43 +581,53 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, bool is_vport = is_mdev_switchdev_mode(dev->mdev) && port_num != MLX5_VPORT_PF; const struct mlx5_ib_counter *names; - int j = 0, i; + int j = 0, i, size; names = is_vport ? vport_basic_q_cnts : basic_q_cnts; - for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) { + size = is_vport ? ARRAY_SIZE(vport_basic_q_cnts) : + ARRAY_SIZE(basic_q_cnts); + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = basic_q_cnts[i].offset; + offsets[j] = names[i].offset; } names = is_vport ? vport_out_of_seq_q_cnts : out_of_seq_q_cnts; + size = is_vport ? ARRAY_SIZE(vport_out_of_seq_q_cnts) : + ARRAY_SIZE(out_of_seq_q_cnts); if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) { - for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = out_of_seq_q_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_retrans_q_cnts : retrans_q_cnts; + size = is_vport ? ARRAY_SIZE(vport_retrans_q_cnts) : + ARRAY_SIZE(retrans_q_cnts); if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { - for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = retrans_q_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_extended_err_cnts : extended_err_cnts; + size = is_vport ? ARRAY_SIZE(vport_extended_err_cnts) : + ARRAY_SIZE(extended_err_cnts); if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { - for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = extended_err_cnts[i].offset; + offsets[j] = names[i].offset; } } names = is_vport ? vport_roce_accl_cnts : roce_accl_cnts; + size = is_vport ? 
ARRAY_SIZE(vport_roce_accl_cnts) : + ARRAY_SIZE(roce_accl_cnts); if (MLX5_CAP_GEN(dev->mdev, roce_accl)) { - for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) { + for (i = 0; i < size; i++, j++) { descs[j].name = names[i].name; - offsets[j] = roce_accl_cnts[i].offset; + offsets[j] = names[i].offset; } } @@ -661,25 +677,37 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, struct mlx5_ib_counters *cnts, u32 port_num) { - u32 num_counters, num_op_counters = 0; + bool is_vport = is_mdev_switchdev_mode(dev->mdev) && + port_num != MLX5_VPORT_PF; + u32 num_counters, num_op_counters = 0, size; - num_counters = ARRAY_SIZE(basic_q_cnts); + size = is_vport ? ARRAY_SIZE(vport_basic_q_cnts) : + ARRAY_SIZE(basic_q_cnts); + num_counters = size; + size = is_vport ? ARRAY_SIZE(vport_out_of_seq_q_cnts) : + ARRAY_SIZE(out_of_seq_q_cnts); if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) - num_counters += ARRAY_SIZE(out_of_seq_q_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_retrans_q_cnts) : + ARRAY_SIZE(retrans_q_cnts); if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) - num_counters += ARRAY_SIZE(retrans_q_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_extended_err_cnts) : + ARRAY_SIZE(extended_err_cnts); if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) - num_counters += ARRAY_SIZE(extended_err_cnts); + num_counters += size; + size = is_vport ? ARRAY_SIZE(vport_roce_accl_cnts) : + ARRAY_SIZE(roce_accl_cnts); if (MLX5_CAP_GEN(dev->mdev, roce_accl)) - num_counters += ARRAY_SIZE(roce_accl_cnts); + num_counters += size; cnts->num_q_counters = num_counters; - if (is_mdev_switchdev_mode(dev->mdev) && port_num != MLX5_VPORT_PF) + if (is_vport) goto skip_non_qcounters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { @@ -725,11 +753,11 @@ err: static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {}; - int num_cnt_ports; + int num_cnt_ports = dev->num_ports; int i, j; - num_cnt_ports = (!is_mdev_switchdev_mode(dev->mdev) || - vport_qcounters_supported(dev)) ? dev->num_ports : 1; + if (is_mdev_switchdev_mode(dev->mdev)) + num_cnt_ports = min(2, num_cnt_ports); MLX5_SET(dealloc_q_counter_in, in, opcode, MLX5_CMD_OP_DEALLOC_Q_COUNTER); @@ -761,15 +789,22 @@ static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) { u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {}; u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {}; - int num_cnt_ports; + int num_cnt_ports = dev->num_ports; int err = 0; int i; bool is_shared; MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER); is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0; - num_cnt_ports = (!is_mdev_switchdev_mode(dev->mdev) || - vport_qcounters_supported(dev)) ? dev->num_ports : 1; + + /* + * In switchdev we need to allocate two ports, one that is used for + * the device Q_counters and it is essentially the real Q_counters of + * this device, while the other is used as a helper for PF to be able to + * query all other vports. 
+ */ + if (is_mdev_switchdev_mode(dev->mdev)) + num_cnt_ports = min(2, num_cnt_ports); for (i = 0; i < num_cnt_ports; i++) { err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts, i); diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 3008632a6c20..1e419e080b53 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -695,8 +695,6 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; - if (mlx5_ib_shared_ft_allowed(&dev->ib_dev)) - ft_attr.uid = MLX5_SHARED_RESOURCE_UID; ft_attr.prio = priority; ft_attr.max_fte = num_entries; ft_attr.flags = flags; @@ -2025,6 +2023,237 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject, return 0; } +static int steering_anchor_create_ft(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + enum mlx5_flow_namespace_type ns_type) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_flow_namespace *ns; + struct mlx5_flow_table *ft; + + if (ft_prio->anchor.ft) + return 0; + + ns = mlx5_get_flow_namespace(dev->mdev, ns_type); + if (!ns) + return -EOPNOTSUPP; + + ft_attr.flags = MLX5_FLOW_TABLE_UNMANAGED; + ft_attr.uid = MLX5_SHARED_RESOURCE_UID; + ft_attr.prio = 0; + ft_attr.max_fte = 2; + ft_attr.level = 1; + + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) + return PTR_ERR(ft); + + ft_prio->anchor.ft = ft; + + return 0; +} + +static void steering_anchor_destroy_ft(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.ft) { + mlx5_destroy_flow_table(ft_prio->anchor.ft); + ft_prio->anchor.ft = NULL; + } +} + +static int +steering_anchor_create_fg_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + void *flow_group_in; + int err = 0; + + if (ft_prio->anchor.fg_drop) + return 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, 1); + + fg = mlx5_create_flow_group(ft_prio->anchor.ft, flow_group_in); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + goto out; + } + + ft_prio->anchor.fg_drop = fg; + +out: + kvfree(flow_group_in); + + return err; +} + +static void +steering_anchor_destroy_fg_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.fg_drop) { + mlx5_destroy_flow_group(ft_prio->anchor.fg_drop); + ft_prio->anchor.fg_drop = NULL; + } +} + +static int +steering_anchor_create_fg_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + void *flow_group_in; + int err = 0; + + if (ft_prio->anchor.fg_goto_table) + return 0; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + fg = mlx5_create_flow_group(ft_prio->anchor.ft, flow_group_in); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + goto out; + } + ft_prio->anchor.fg_goto_table = fg; + +out: + kvfree(flow_group_in); + + return err; +} + +static void +steering_anchor_destroy_fg_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.fg_goto_table) { + mlx5_destroy_flow_group(ft_prio->anchor.fg_goto_table); + ft_prio->anchor.fg_goto_table = NULL; + } +} + +static int +steering_anchor_create_rule_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *handle; + + if 
(ft_prio->anchor.rule_drop) + return 0; + + flow_act.fg = ft_prio->anchor.fg_drop; + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; + + handle = mlx5_add_flow_rules(ft_prio->anchor.ft, NULL, &flow_act, + NULL, 0); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ft_prio->anchor.rule_drop = handle; + + return 0; +} + +static void steering_anchor_destroy_rule_drop(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.rule_drop) { + mlx5_del_flow_rules(ft_prio->anchor.rule_drop); + ft_prio->anchor.rule_drop = NULL; + } +} + +static int +steering_anchor_create_rule_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *handle; + + if (ft_prio->anchor.rule_goto_table) + return 0; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + flow_act.flags |= FLOW_ACT_IGNORE_FLOW_LEVEL; + flow_act.fg = ft_prio->anchor.fg_goto_table; + + dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest.ft = ft_prio->flow_table; + + handle = mlx5_add_flow_rules(ft_prio->anchor.ft, NULL, &flow_act, + &dest, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ft_prio->anchor.rule_goto_table = handle; + + return 0; +} + +static void +steering_anchor_destroy_rule_goto_table(struct mlx5_ib_flow_prio *ft_prio) +{ + if (ft_prio->anchor.rule_goto_table) { + mlx5_del_flow_rules(ft_prio->anchor.rule_goto_table); + ft_prio->anchor.rule_goto_table = NULL; + } +} + +static int steering_anchor_create_res(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + enum mlx5_flow_namespace_type ns_type) +{ + int err; + + err = steering_anchor_create_ft(dev, ft_prio, ns_type); + if (err) + return err; + + err = steering_anchor_create_fg_drop(ft_prio); + if (err) + goto destroy_ft; + + err = steering_anchor_create_fg_goto_table(ft_prio); + if (err) + goto destroy_fg_drop; + + err = steering_anchor_create_rule_drop(ft_prio); + if (err) + goto destroy_fg_goto_table; + + err = steering_anchor_create_rule_goto_table(ft_prio); + if (err) + goto destroy_rule_drop; + + return 0; + +destroy_rule_drop: + steering_anchor_destroy_rule_drop(ft_prio); +destroy_fg_goto_table: + steering_anchor_destroy_fg_goto_table(ft_prio); +destroy_fg_drop: + steering_anchor_destroy_fg_drop(ft_prio); +destroy_ft: + steering_anchor_destroy_ft(ft_prio); + + return err; +} + +static void mlx5_steering_anchor_destroy_res(struct mlx5_ib_flow_prio *ft_prio) +{ + steering_anchor_destroy_rule_goto_table(ft_prio); + steering_anchor_destroy_rule_drop(ft_prio); + steering_anchor_destroy_fg_goto_table(ft_prio); + steering_anchor_destroy_fg_drop(ft_prio); + steering_anchor_destroy_ft(ft_prio); +} + static int steering_anchor_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -2035,6 +2264,9 @@ static int steering_anchor_cleanup(struct ib_uobject *uobject, return -EBUSY; mutex_lock(&obj->dev->flow_db->lock); + if (!--obj->ft_prio->anchor.rule_goto_table_ref) + steering_anchor_destroy_rule_goto_table(obj->ft_prio); + put_flow_table(obj->dev, obj->ft_prio, true); mutex_unlock(&obj->dev->flow_db->lock); @@ -2042,6 +2274,24 @@ static int steering_anchor_cleanup(struct ib_uobject *uobject, return 0; } +static void fs_cleanup_anchor(struct mlx5_ib_flow_prio *prio, + int count) +{ + while (count--) + mlx5_steering_anchor_destroy_res(&prio[count]); +} + +void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) +{ + fs_cleanup_anchor(dev->flow_db->prios, MLX5_IB_NUM_FLOW_FT); + 
fs_cleanup_anchor(dev->flow_db->egress_prios, MLX5_IB_NUM_FLOW_FT); + fs_cleanup_anchor(dev->flow_db->sniffer, MLX5_IB_NUM_SNIFFER_FTS); + fs_cleanup_anchor(dev->flow_db->egress, MLX5_IB_NUM_EGRESS_FTS); + fs_cleanup_anchor(dev->flow_db->fdb, MLX5_IB_NUM_FDB_FTS); + fs_cleanup_anchor(dev->flow_db->rdma_rx, MLX5_IB_NUM_FLOW_FT); + fs_cleanup_anchor(dev->flow_db->rdma_tx, MLX5_IB_NUM_FLOW_FT); +} + static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *obj) { @@ -2182,21 +2432,31 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( return -ENOMEM; mutex_lock(&dev->flow_db->lock); + ft_prio = _get_flow_table(dev, priority, ns_type, 0); if (IS_ERR(ft_prio)) { - mutex_unlock(&dev->flow_db->lock); err = PTR_ERR(ft_prio); goto free_obj; } ft_prio->refcount++; - ft_id = mlx5_flow_table_id(ft_prio->flow_table); - mutex_unlock(&dev->flow_db->lock); + + if (!ft_prio->anchor.rule_goto_table_ref) { + err = steering_anchor_create_res(dev, ft_prio, ns_type); + if (err) + goto put_flow_table; + } + + ft_prio->anchor.rule_goto_table_ref++; + + ft_id = mlx5_flow_table_id(ft_prio->anchor.ft); err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, &ft_id, sizeof(ft_id)); if (err) - goto put_flow_table; + goto destroy_res; + + mutex_unlock(&dev->flow_db->lock); uobj->object = obj; obj->dev = dev; @@ -2205,8 +2465,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( return 0; +destroy_res: + --ft_prio->anchor.rule_goto_table_ref; + mlx5_steering_anchor_destroy_res(ft_prio); put_flow_table: - mutex_lock(&dev->flow_db->lock); put_flow_table(dev, ft_prio, true); mutex_unlock(&dev->flow_db->lock); free_obj: diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h index ad320adaf321..b9734904f5f0 100644 --- a/drivers/infiniband/hw/mlx5/fs.h +++ b/drivers/infiniband/hw/mlx5/fs.h @@ -10,6 +10,7 @@ #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_fs_init(struct mlx5_ib_dev *dev); +void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev); #else static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) { @@ -21,9 +22,24 @@ static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev) mutex_init(&dev->flow_db->lock); return 0; } + +inline void mlx5_ib_fs_cleanup_anchor(struct mlx5_ib_dev *dev) {} #endif + static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev) { + /* When a steering anchor is created, a special flow table is also + * created for the user to reference. Since the user can reference it, + * the kernel cannot trust that when the user destroys the steering + * anchor, they no longer reference the flow table. + * + * To address this issue, when a user destroys a steering anchor, only + * the flow steering rule in the table is destroyed, but the table + * itself is kept to deal with the above scenario. The remaining + * resources are only removed when the RDMA device is destroyed, which + * is a safe assumption that all references are gone. 
+ */ + mlx5_ib_fs_cleanup_anchor(dev); kfree(dev->flow_db); } #endif /* _MLX5_IB_FS_H */ diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index ddcfc116b19a..c7a4ee896121 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -30,45 +30,65 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); +static void mlx5_ib_num_ports_update(struct mlx5_core_dev *dev, u32 *num_ports) +{ + struct mlx5_core_dev *peer_dev; + int i; + + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + u32 peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev); + + if (mlx5_lag_is_mpesw(peer_dev)) + *num_ports += peer_num_ports; + else + /* Only 1 ib port is the representor for all uplinks */ + *num_ports += peer_num_ports - 1; + } +} + static int mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { u32 num_ports = mlx5_eswitch_get_total_vports(dev); + struct mlx5_core_dev *lag_master = dev; const struct mlx5_ib_profile *profile; struct mlx5_core_dev *peer_dev; struct mlx5_ib_dev *ibdev; - int second_uplink = false; - u32 peer_num_ports; + int new_uplink = false; int vport_index; int ret; + int i; vport_index = rep->vport_index; if (mlx5_lag_is_shared_fdb(dev)) { - peer_dev = mlx5_lag_get_peer_mdev(dev); - peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev); if (mlx5_lag_is_master(dev)) { - if (mlx5_lag_is_mpesw(dev)) - num_ports += peer_num_ports; - else - num_ports += peer_num_ports - 1; - + mlx5_ib_num_ports_update(dev, &num_ports); } else { if (rep->vport == MLX5_VPORT_UPLINK) { if (!mlx5_lag_is_mpesw(dev)) return 0; - second_uplink = true; + new_uplink = true; } + mlx5_lag_for_each_peer_mdev(dev, peer_dev, i) { + u32 peer_n_ports = mlx5_eswitch_get_total_vports(peer_dev); + + if (mlx5_lag_is_master(peer_dev)) + lag_master = peer_dev; + else if (!mlx5_lag_is_mpesw(dev)) + /* Only 1 ib port is the representor for all uplinks */ + peer_n_ports--; - vport_index += peer_num_ports; - dev = peer_dev; + if (mlx5_get_dev_index(peer_dev) < mlx5_get_dev_index(dev)) + vport_index += peer_n_ports; + } } } - if (rep->vport == MLX5_VPORT_UPLINK && !second_uplink) + if (rep->vport == MLX5_VPORT_UPLINK && !new_uplink) profile = &raw_eth_profile; else - return mlx5_ib_set_vport_rep(dev, rep, vport_index); + return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev); if (!ibdev) @@ -85,8 +105,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport); - ibdev->mdev = dev; + mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport); + ibdev->mdev = lag_master; ibdev->num_ports = num_ports; ret = __mlx5_ib_add(ibdev, profile); @@ -94,8 +114,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) goto fail_add; rep->rep_data[REP_IB].priv = ibdev; - if (mlx5_lag_is_shared_fdb(dev)) - mlx5_ib_register_peer_vport_reps(dev); + if (mlx5_lag_is_shared_fdb(lag_master)) + mlx5_ib_register_peer_vport_reps(lag_master); return 0; @@ -118,23 +138,27 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep); int vport_index = rep->vport_index; struct mlx5_ib_port *port; + int i; if (WARN_ON(!mdev)) return; + if (!dev) + return; + if 
(mlx5_lag_is_shared_fdb(mdev) && !mlx5_lag_is_master(mdev)) { - struct mlx5_core_dev *peer_mdev; - - if (rep->vport == MLX5_VPORT_UPLINK) + if (rep->vport == MLX5_VPORT_UPLINK && !mlx5_lag_is_mpesw(mdev)) return; - peer_mdev = mlx5_lag_get_peer_mdev(mdev); - vport_index += mlx5_eswitch_get_total_vports(peer_mdev); + for (i = 0; i < dev->num_ports; i++) { + if (dev->port[i].rep == rep) + break; + } + if (WARN_ON(i == dev->num_ports)) + return; + vport_index = i; } - if (!dev) - return; - port = &dev->port[vport_index]; write_lock(&port->roce.netdev_lock); port->roce.netdev = NULL; @@ -143,13 +167,18 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) port->rep = NULL; if (rep->vport == MLX5_VPORT_UPLINK) { - struct mlx5_core_dev *peer_mdev; - struct mlx5_eswitch *esw; + + if (mlx5_lag_is_shared_fdb(mdev) && !mlx5_lag_is_master(mdev)) + return; if (mlx5_lag_is_shared_fdb(mdev)) { - peer_mdev = mlx5_lag_get_peer_mdev(mdev); - esw = peer_mdev->priv.eswitch; - mlx5_eswitch_unregister_vport_reps(esw, REP_IB); + struct mlx5_core_dev *peer_mdev; + struct mlx5_eswitch *esw; + + mlx5_lag_for_each_peer_mdev(mdev, peer_mdev, i) { + esw = peer_mdev->priv.eswitch; + mlx5_eswitch_unregister_vport_reps(esw, REP_IB); + } } __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } @@ -163,14 +192,14 @@ static const struct mlx5_eswitch_rep_ops rep_ops = { static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev) { - struct mlx5_core_dev *peer_mdev = mlx5_lag_get_peer_mdev(mdev); + struct mlx5_core_dev *peer_mdev; struct mlx5_eswitch *esw; + int i; - if (!peer_mdev) - return; - - esw = peer_mdev->priv.eswitch; - mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB); + mlx5_lag_for_each_peer_mdev(mdev, peer_mdev, i) { + esw = peer_mdev->priv.eswitch; + mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB); + } } struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5d45de223c43..f0b394ed7452 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4275,6 +4275,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), STAGE_CREATE(MLX5_IB_STAGE_RESTRACK, mlx5_ib_restrack_init, NULL), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index efa4dc6e7dee..9c33d960af3c 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -25,6 +25,7 @@ #include <rdma/mlx5_user_ioctl_verbs.h> #include "srq.h" +#include "qp.h" #define mlx5_ib_dbg(_dev, format, arg...) 
\ dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ @@ -237,8 +238,19 @@ enum { #define MLX5_IB_NUM_SNIFFER_FTS 2 #define MLX5_IB_NUM_EGRESS_FTS 1 #define MLX5_IB_NUM_FDB_FTS MLX5_BY_PASS_NUM_REGULAR_PRIOS + +struct mlx5_ib_anchor { + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg_goto_table; + struct mlx5_flow_group *fg_drop; + struct mlx5_flow_handle *rule_goto_table; + struct mlx5_flow_handle *rule_drop; + unsigned int rule_goto_table_ref; +}; + struct mlx5_ib_flow_prio { struct mlx5_flow_table *flow_table; + struct mlx5_ib_anchor anchor; unsigned int refcount; }; @@ -1587,6 +1599,9 @@ static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) MLX5_CAP_PORT_SELECTION(dev->mdev, port_select_flow_table_bypass)) return 0; + if (mlx5_lag_is_lacp_owner(dev->mdev) && !dev->lag_active) + return 0; + return dev->lag_active || (MLX5_CAP_GEN(dev->mdev, num_lag_ports) > 1 && MLX5_CAP_GEN(dev->mdev, lag_tx_port_affinity)); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 70ca8ffa9256..78b96bfb4e6a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1237,6 +1237,9 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); MLX5_SET(tisc, tisc, transport_domain, tdn); + if (!mlx5_ib_lag_should_assign_affinity(dev) && + mlx5_lag_is_lacp_owner(dev->mdev)) + MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1); if (qp->flags & IB_QP_CREATE_SOURCE_QPN) MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index 77f9b4a54816..b6ee7c3ee1ca 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -6,7 +6,17 @@ #ifndef _MLX5_IB_QP_H #define _MLX5_IB_QP_H -#include "mlx5_ib.h" +struct mlx5_ib_dev; + +struct mlx5_qp_table { + struct notifier_block nb; + struct xarray dct_xa; + + /* protect radix tree + */ + spinlock_t lock; + struct radix_tree_root tree; +}; int mlx5_init_qp_table(struct mlx5_ib_dev *dev); void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev); diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index bae0334d6e7f..d9cf6982d645 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -88,23 +88,35 @@ static bool is_event_type_allowed(int rsc_type, int event_type) } } +static int dct_event_notifier(struct mlx5_ib_dev *dev, struct mlx5_eqe *eqe) +{ + struct mlx5_core_dct *dct; + unsigned long flags; + u32 qpn; + + qpn = be32_to_cpu(eqe->data.dct.dctn) & 0xFFFFFF; + xa_lock_irqsave(&dev->qp_table.dct_xa, flags); + dct = xa_load(&dev->qp_table.dct_xa, qpn); + if (dct) + complete(&dct->drained); + xa_unlock_irqrestore(&dev->qp_table.dct_xa, flags); + return NOTIFY_OK; +} + static int rsc_event_notifier(struct notifier_block *nb, unsigned long type, void *data) { + struct mlx5_ib_dev *dev = + container_of(nb, struct mlx5_ib_dev, qp_table.nb); struct mlx5_core_rsc_common *common; - struct mlx5_qp_table *table; - struct mlx5_core_dct *dct; + struct mlx5_eqe *eqe = data; u8 event_type = (u8)type; struct mlx5_core_qp *qp; - struct mlx5_eqe *eqe; u32 rsn; switch (event_type) { case MLX5_EVENT_TYPE_DCT_DRAINED: - eqe = data; - rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; - rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); - break; + return dct_event_notifier(dev, eqe); case MLX5_EVENT_TYPE_PATH_MIG: case MLX5_EVENT_TYPE_COMM_EST: case MLX5_EVENT_TYPE_SQ_DRAINED: @@ -113,7 
+125,6 @@ static int rsc_event_notifier(struct notifier_block *nb, case MLX5_EVENT_TYPE_PATH_MIG_FAILED: case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: - eqe = data; rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); break; @@ -121,8 +132,7 @@ static int rsc_event_notifier(struct notifier_block *nb, return NOTIFY_DONE; } - table = container_of(nb, struct mlx5_qp_table, nb); - common = mlx5_get_rsc(table, rsn); + common = mlx5_get_rsc(&dev->qp_table, rsn); if (!common) return NOTIFY_OK; @@ -137,11 +147,6 @@ static int rsc_event_notifier(struct notifier_block *nb, qp->event(qp, event_type); /* Need to put resource in event handler */ return NOTIFY_OK; - case MLX5_RES_DCT: - dct = (struct mlx5_core_dct *)common; - if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) - complete(&dct->drained); - break; default: break; } @@ -188,28 +193,15 @@ static void destroy_resource_common(struct mlx5_ib_dev *dev, } static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, - struct mlx5_core_dct *dct, bool need_cleanup) + struct mlx5_core_dct *dct) { u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {}; struct mlx5_core_qp *qp = &dct->mqp; - int err; - err = mlx5_core_drain_dct(dev, dct); - if (err) { - if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) - goto destroy; - - return err; - } - wait_for_completion(&dct->drained); -destroy: - if (need_cleanup) - destroy_resource_common(dev, &dct->mqp); MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); MLX5_SET(destroy_dct_in, in, uid, qp->uid); - err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in); - return err; + return mlx5_cmd_exec_in(dev->mdev, destroy_dct, in); } int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, @@ -227,13 +219,13 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, qp->qpn = MLX5_GET(create_dct_out, out, dctn); qp->uid = MLX5_GET(create_dct_in, in, uid); - err = create_resource_common(dev, qp, MLX5_RES_DCT); + err = xa_err(xa_store_irq(&dev->qp_table.dct_xa, qp->qpn, dct, GFP_KERNEL)); if (err) goto err_cmd; return 0; err_cmd: - _mlx5_core_destroy_dct(dev, dct, false); + _mlx5_core_destroy_dct(dev, dct); return err; } @@ -284,7 +276,31 @@ static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev, int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct) { - return _mlx5_core_destroy_dct(dev, dct, true); + struct mlx5_qp_table *table = &dev->qp_table; + struct mlx5_core_dct *tmp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto destroy; + + return err; + } + wait_for_completion(&dct->drained); + +destroy: + tmp = xa_cmpxchg_irq(&table->dct_xa, dct->mqp.qpn, dct, XA_ZERO_ENTRY, GFP_KERNEL); + if (WARN_ON(tmp != dct)) + return xa_err(tmp) ?: -EINVAL; + + err = _mlx5_core_destroy_dct(dev, dct); + if (err) { + xa_cmpxchg_irq(&table->dct_xa, dct->mqp.qpn, XA_ZERO_ENTRY, dct, 0); + return err; + } + xa_erase_irq(&table->dct_xa, dct->mqp.qpn); + return 0; } int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) @@ -298,8 +314,7 @@ int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp) MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); MLX5_SET(destroy_qp_in, in, uid, qp->uid); - mlx5_cmd_exec_in(dev->mdev, destroy_qp, in); - return 0; + return 
mlx5_cmd_exec_in(dev->mdev, destroy_qp, in); } int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, @@ -488,6 +503,7 @@ int mlx5_init_qp_table(struct mlx5_ib_dev *dev) spin_lock_init(&table->lock); INIT_RADIX_TREE(&table->tree, GFP_ATOMIC); + xa_init(&table->dct_xa); mlx5_qp_debugfs_init(dev->mdev); table->nb.notifier_call = rsc_event_notifier; @@ -551,14 +567,14 @@ int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn) return mlx5_cmd_exec_in(dev->mdev, dealloc_xrcd, in); } -static void destroy_rq_tracked(struct mlx5_ib_dev *dev, u32 rqn, u16 uid) +static int destroy_rq_tracked(struct mlx5_ib_dev *dev, u32 rqn, u16 uid) { u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {}; MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ); MLX5_SET(destroy_rq_in, in, rqn, rqn); MLX5_SET(destroy_rq_in, in, uid, uid); - mlx5_cmd_exec_in(dev->mdev, destroy_rq, in); + return mlx5_cmd_exec_in(dev->mdev, destroy_rq, in); } int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen, @@ -589,8 +605,7 @@ int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev, struct mlx5_core_qp *rq) { destroy_resource_common(dev, rq); - destroy_rq_tracked(dev, rq->qpn, rq->uid); - return 0; + return destroy_rq_tracked(dev, rq->qpn, rq->uid); } static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid) diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index f693bc753b6b..1bb7507325bc 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -111,7 +111,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, ret = pin_user_pages(start_page + got * PAGE_SIZE, num_pages - got, FOLL_LONGTERM | FOLL_WRITE, - p + got, NULL); + p + got); if (ret < 0) { mmap_read_unlock(current->mm); goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 2a5cac2658ec..84e0f41e7dfa 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -140,7 +140,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = pin_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + gup_flags, page_list); if (ret < 0) goto out; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index f83cd4a9d992..98b2a0090bf2 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -709,14 +709,6 @@ int pvrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, goto out; } - if (unlikely(wr->opcode < 0)) { - dev_warn_ratelimited(&dev->pdev->dev, - "invalid send opcode\n"); - *bad_wr = wr; - ret = -EINVAL; - goto out; - } - /* * Only support UD, RC. * Need to check opcode table for thorough checking. 
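The mlx5 qpc.c hunks above move DCT bookkeeping out of the shared resource radix tree and into a dedicated xarray, with teardown parking the entry as XA_ZERO_ENTRY while the DESTROY_DCT command is outstanding so the index stays reserved and the drain notifier can no longer observe a half-destroyed DCT. A minimal sketch of that reserve-then-erase pattern follows, assuming a hypothetical dct_destroy_cmd() callback and a stub struct dct in place of the firmware command and struct mlx5_core_dct; only the xarray calls mirror the patch.

/*
 * Sketch only: reserve-then-erase teardown with an xarray, mirroring the
 * dct_xa handling in mlx5_core_destroy_dct(). dct_destroy_cmd() stands in
 * for mlx5_cmd_exec_in(dev->mdev, destroy_dct, in).
 */
#include <linux/errno.h>
#include <linux/xarray.h>

struct dct {				/* stub for struct mlx5_core_dct */
	u32 qpn;
};

static int dct_teardown(struct xarray *dct_xa, struct dct *dct,
			int (*dct_destroy_cmd)(struct dct *dct))
{
	struct dct *tmp;
	int err;

	/*
	 * Swap the live entry for XA_ZERO_ENTRY: lookups now miss, but the
	 * index stays allocated in case the destroy command fails.
	 */
	tmp = xa_cmpxchg_irq(dct_xa, dct->qpn, dct, XA_ZERO_ENTRY, GFP_KERNEL);
	if (WARN_ON(tmp != dct))
		return xa_err(tmp) ?: -EINVAL;

	err = dct_destroy_cmd(dct);
	if (err) {
		/* Command failed: restore the entry so the DCT stays visible. */
		xa_cmpxchg_irq(dct_xa, dct->qpn, XA_ZERO_ENTRY, dct, 0);
		return err;
	}

	/* Success: release the index for good. */
	xa_erase_irq(dct_xa, dct->qpn);
	return 0;
}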
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 7a7e713de52d..54c723a6edda 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -212,10 +212,16 @@ static int __init rxe_module_init(void) { int err; - err = rxe_net_init(); + err = rxe_alloc_wq(); if (err) return err; + err = rxe_net_init(); + if (err) { + rxe_destroy_wq(); + return err; + } + rdma_link_register(&rxe_link_ops); pr_info("loaded\n"); return 0; @@ -226,6 +232,7 @@ static void __exit rxe_module_exit(void) rdma_link_unregister(&rxe_link_ops); ib_unregister_driver(RDMA_DRIVER_RXE); rxe_net_exit(); + rxe_destroy_wq(); pr_info("unloaded\n"); } diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index db18ace74d2b..5111735aafae 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -115,15 +115,16 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) void retransmit_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, retrans_timer); + unsigned long flags; rxe_dbg_qp(qp, "retransmit timer fired\n"); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { qp->comp.timeout = 1; rxe_sched_task(&qp->comp.task); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) @@ -481,11 +482,13 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) static void comp_check_sq_drain_done(struct rxe_qp *qp) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(qp_state(qp) == IB_QPS_SQD)) { if (qp->attr.sq_draining && qp->comp.psn == qp->req.psn) { qp->attr.sq_draining = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->ibqp.event_handler) { struct ib_event ev; @@ -499,7 +502,7 @@ static void comp_check_sq_drain_done(struct rxe_qp *qp) return; } } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static inline enum comp_state complete_ack(struct rxe_qp *qp, @@ -625,13 +628,15 @@ static void free_pkt(struct rxe_pkt_info *pkt) */ static void reset_retry_timer(struct rxe_qp *qp) { + unsigned long flags; + if (qp_type(qp) == IB_QPT_RC && qp->qp_timeout_jiffies) { - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) >= IB_QPS_RTS && psn_compare(qp->req.psn, qp->comp.psn) > 0) mod_timer(&qp->retrans_timer, jiffies + qp->qp_timeout_jiffies); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } } @@ -643,18 +648,19 @@ int rxe_completer(struct rxe_qp *qp) struct rxe_pkt_info *pkt = NULL; enum comp_state state; int ret; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_resp_pkts(qp); flush_send_queue(qp, notify); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->comp.timeout) { qp->comp.timeout_retry = 1; @@ -826,7 +832,7 @@ int rxe_completer(struct rxe_qp *qp) } /* A non-zero return value will cause rxe_do_task to - * exit its loop and end the tasklet. 
A zero return + * exit its loop and end the work item. A zero return * will continue looping and return to rxe_completer */ done: diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 20ff0c0c4605..d5486cbb3f10 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -113,15 +113,14 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT); - spin_unlock_irqrestore(&cq->cq_lock, flags); - - if ((cq->notify == IB_CQ_NEXT_COMP) || - (cq->notify == IB_CQ_SOLICITED && solicited)) { + if ((cq->notify & IB_CQ_NEXT_COMP) || + (cq->notify & IB_CQ_SOLICITED && solicited)) { cq->notify = 0; - cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } + spin_unlock_irqrestore(&cq->cq_lock, flags); + return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 804b15e929dd..666e06a82bc9 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -31,8 +31,6 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); -void rxe_cq_disable(struct rxe_cq *cq); - void rxe_cq_cleanup(struct rxe_pool_elem *elem); /* rxe_mcast.c */ diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 0e538fafcc20..f54042e9aeb2 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -45,22 +45,17 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) } } -#define IB_ACCESS_REMOTE (IB_ACCESS_REMOTE_READ \ - | IB_ACCESS_REMOTE_WRITE \ - | IB_ACCESS_REMOTE_ATOMIC) - static void rxe_mr_init(int access, struct rxe_mr *mr) { - u32 lkey = mr->elem.index << 8 | rxe_get_next_key(-1); - u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0; + u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); /* set ibmr->l/rkey and also copy into private l/rkey * for user MRs these will always be the same * for cases where caller 'owns' the key portion * they may be different until REG_MR WQE is executed. */ - mr->lkey = mr->ibmr.lkey = lkey; - mr->rkey = mr->ibmr.rkey = rkey; + mr->lkey = mr->ibmr.lkey = key; + mr->rkey = mr->ibmr.rkey = key; mr->access = access; mr->ibmr.page_size = PAGE_SIZE; @@ -195,7 +190,7 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) int err; /* always allow remote access for FMRs */ - rxe_mr_init(IB_ACCESS_REMOTE, mr); + rxe_mr_init(RXE_ACCESS_REMOTE, mr); err = rxe_mr_alloc(mr, max_pages); if (err) @@ -644,6 +639,7 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct rxe_mr *mr; + int remote; int ret; mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); @@ -653,9 +649,10 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) goto err; } - if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) { + remote = mr->access & RXE_ACCESS_REMOTE; + if (remote ? (key != mr->rkey) : (key != mr->lkey)) { rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n", - key, (mr->rkey ? mr->rkey : mr->lkey)); + key, (remote ? mr->rkey : mr->lkey)); ret = -EINVAL; goto err_drop_ref; } @@ -715,7 +712,7 @@ int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) mr->access = access; mr->lkey = key; - mr->rkey = (access & IB_ACCESS_REMOTE) ? 
key : 0; + mr->rkey = key; mr->ibmr.iova = wqe->wr.wr.reg.mr->iova; mr->state = RXE_MR_STATE_VALID; diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index afa5ce1a7116..d8a43d87de93 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -48,7 +48,7 @@ int rxe_dealloc_mw(struct ib_mw *ibmw) } static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - struct rxe_mw *mw, struct rxe_mr *mr) + struct rxe_mw *mw, struct rxe_mr *mr, int access) { if (mw->ibmw.type == IB_MW_TYPE_1) { if (unlikely(mw->state != RXE_MW_STATE_VALID)) { @@ -58,7 +58,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } /* o10-36.2.2 */ - if (unlikely((mw->access & IB_ZERO_BASED))) { + if (unlikely((access & IB_ZERO_BASED))) { rxe_dbg_mw(mw, "attempt to bind a zero based type 1 MW\n"); return -EINVAL; } @@ -104,7 +104,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } /* C10-74 */ - if (unlikely((mw->access & + if (unlikely((access & (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) && !(mr->access & IB_ACCESS_LOCAL_WRITE))) { rxe_dbg_mw(mw, @@ -113,7 +113,7 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } /* C10-75 */ - if (mw->access & IB_ZERO_BASED) { + if (access & IB_ZERO_BASED) { if (unlikely(wqe->wr.wr.mw.length > mr->ibmr.length)) { rxe_dbg_mw(mw, "attempt to bind a ZB MW outside of the MR\n"); @@ -133,12 +133,12 @@ static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, } static void rxe_do_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - struct rxe_mw *mw, struct rxe_mr *mr) + struct rxe_mw *mw, struct rxe_mr *mr, int access) { u32 key = wqe->wr.wr.mw.rkey & 0xff; mw->rkey = (mw->rkey & ~0xff) | key; - mw->access = wqe->wr.wr.mw.access; + mw->access = access; mw->state = RXE_MW_STATE_VALID; mw->addr = wqe->wr.wr.mw.addr; mw->length = wqe->wr.wr.mw.length; @@ -169,6 +169,7 @@ int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe) struct rxe_dev *rxe = to_rdev(qp->ibqp.device); u32 mw_rkey = wqe->wr.wr.mw.mw_rkey; u32 mr_lkey = wqe->wr.wr.mw.mr_lkey; + int access = wqe->wr.wr.mw.access; mw = rxe_pool_get_index(&rxe->mw_pool, mw_rkey >> 8); if (unlikely(!mw)) { @@ -196,13 +197,18 @@ int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe) mr = NULL; } + if (access & ~RXE_ACCESS_SUPPORTED_MW) { + rxe_err_mw(mw, "access %#x not supported", access); + return -EOPNOTSUPP; + } + spin_lock_bh(&mw->lock); - ret = rxe_check_bind_mw(qp, wqe, mw, mr); + ret = rxe_check_bind_mw(qp, wqe, mw, mr, access); if (ret) goto err_unlock; - rxe_do_bind_mw(qp, wqe, mw, mr); + rxe_do_bind_mw(qp, wqe, mw, mr, access); err_unlock: spin_unlock_bh(&mw->lock); err_drop_mr: diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 2bc7361152ea..cd59666158b1 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -159,6 +159,9 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) pkt->mask = RXE_GRH_MASK; pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); + /* remove udp header */ + skb_pull(skb, sizeof(struct udphdr)); + rxe_rcv(skb); return 0; @@ -401,6 +404,9 @@ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) return -EIO; } + /* remove udp header */ + skb_pull(skb, sizeof(struct udphdr)); + rxe_rcv(skb); return 0; @@ -412,15 +418,16 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, int err; int 
is_request = pkt->mask & RXE_REQ_MASK; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || (!is_request && (qp_state(qp) < IB_QPS_RTR))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_icrc_generate(skb, pkt); diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h index cea4e0a63919..5686b691d6b8 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.h +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -91,6 +91,9 @@ enum rxe_hdr_mask { RXE_READ_OR_ATOMIC_MASK = (RXE_READ_MASK | RXE_ATOMIC_MASK), RXE_WRITE_OR_SEND_MASK = (RXE_WRITE_MASK | RXE_SEND_MASK), RXE_READ_OR_WRITE_MASK = (RXE_READ_MASK | RXE_WRITE_MASK), + RXE_RDMA_OP_MASK = (RXE_READ_MASK | RXE_WRITE_MASK | + RXE_ATOMIC_WRITE_MASK | RXE_FLUSH_MASK | + RXE_ATOMIC_MASK), }; #define OPCODE_NONE (-1) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 7b41d79e72b2..d2f57ead78ad 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -112,7 +112,7 @@ enum rxe_device_param { RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, - /* Max number of interations of each tasklet + /* Max number of interations of each work item * before yielding the cpu to let other * work make progress */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c5451a4488ca..a569b111a9d2 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -176,6 +176,9 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, spin_lock_init(&qp->rq.producer_lock); spin_lock_init(&qp->rq.consumer_lock); + skb_queue_head_init(&qp->req_pkts); + skb_queue_head_init(&qp->resp_pkts); + atomic_set(&qp->ssn, 0); atomic_set(&qp->skb_out, 0); } @@ -234,8 +237,6 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, qp->req.opcode = -1; qp->comp.opcode = -1; - skb_queue_head_init(&qp->req_pkts); - rxe_init_task(&qp->req.task, qp, rxe_requester); rxe_init_task(&qp->comp.task, qp, rxe_completer); @@ -279,8 +280,6 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, } } - skb_queue_head_init(&qp->resp_pkts); - rxe_init_task(&qp->resp.task, qp, rxe_responder); qp->resp.opcode = OPCODE_NONE; @@ -300,6 +299,7 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, struct rxe_cq *rcq = to_rcq(init->recv_cq); struct rxe_cq *scq = to_rcq(init->send_cq); struct rxe_srq *srq = init->srq ? 
to_rsrq(init->srq) : NULL; + unsigned long flags; rxe_get(pd); rxe_get(rcq); @@ -325,10 +325,10 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, if (err) goto err2; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_RESET; qp->valid = 1; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return 0; @@ -392,6 +392,13 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) goto err1; + if (mask & IB_QP_ACCESS_FLAGS) { + if (!(qp_type(qp) == IB_QPT_RC || qp_type(qp) == IB_QPT_UC)) + goto err1; + if (attr->qp_access_flags & ~RXE_ACCESS_SUPPORTED_QP) + goto err1; + } + if (mask & IB_QP_AV && rxe_av_chk_attr(qp, &attr->ah_attr)) goto err1; @@ -492,24 +499,28 @@ static void rxe_qp_reset(struct rxe_qp *qp) /* move the qp to the error state */ void rxe_qp_error(struct rxe_qp *qp) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ rxe_sched_task(&qp->resp.task); rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); qp->attr.sq_draining = 1; rxe_sched_task(&qp->comp.task); rxe_sched_task(&qp->req.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } /* caller should hold qp->state_lock */ @@ -555,14 +566,16 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, qp->attr.cur_qp_state = attr->qp_state; if (mask & IB_QP_STATE) { - spin_lock_bh(&qp->state_lock); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); err = __qp_chk_state(qp, attr, mask); if (!err) { qp->attr.qp_state = attr->qp_state; rxe_dbg_qp(qp, "state -> %s\n", qps2str[attr->qp_state]); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (err) return err; @@ -688,6 +701,8 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, /* called by the query qp verb */ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) { + unsigned long flags; + *attr = qp->attr; attr->rq_psn = qp->resp.psn; @@ -708,12 +723,13 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) /* Applications that get this state typically spin on it. 
* Yield the processor */ - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->attr.sq_draining) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); cond_resched(); + } else { + spin_unlock_irqrestore(&qp->state_lock, flags); } - spin_unlock_bh(&qp->state_lock); return 0; } @@ -736,10 +752,11 @@ int rxe_qp_chk_destroy(struct rxe_qp *qp) static void rxe_qp_do_cleanup(struct work_struct *work) { struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); qp->valid = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; if (qp_type(qp) == IB_QPT_RC) { diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 2f953cc74256..5861e4244049 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -14,6 +14,7 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct rxe_qp *qp) { unsigned int pkt_type; + unsigned long flags; if (unlikely(!qp->valid)) return -EINVAL; @@ -38,19 +39,19 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, return -EINVAL; } - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (pkt->mask & RXE_REQ_MASK) { if (unlikely(qp_state(qp) < IB_QPS_RTR)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return -EINVAL; } } else { if (unlikely(qp_state(qp) < IB_QPS_RTS)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return -EINVAL; } } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 65134a9aefe7..2171f19494bc 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -99,17 +99,18 @@ static void req_retry(struct rxe_qp *qp) void rnr_nak_timer(struct timer_list *t) { struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); + unsigned long flags; rxe_dbg_qp(qp, "nak timer fired\n"); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { /* request a send queue retry */ qp->req.need_retry = 1; qp->req.wait_for_rnr_timer = 0; rxe_sched_task(&qp->req.task); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static void req_check_sq_drain_done(struct rxe_qp *qp) @@ -118,8 +119,9 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) unsigned int index; unsigned int cons; struct rxe_send_wqe *wqe; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_SQD) { q = qp->sq.queue; index = qp->req.wqe_index; @@ -140,7 +142,7 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) break; qp->attr.sq_draining = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->ibqp.event_handler) { struct ib_event ev; @@ -154,7 +156,7 @@ static void req_check_sq_drain_done(struct rxe_qp *qp) return; } while (0); } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); } static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp) @@ -173,6 +175,7 @@ static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp) static struct rxe_send_wqe 
*req_next_wqe(struct rxe_qp *qp) { struct rxe_send_wqe *wqe; + unsigned long flags; req_check_sq_drain_done(qp); @@ -180,13 +183,13 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) if (wqe == NULL) return NULL; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely((qp_state(qp) == IB_QPS_SQD) && (wqe->state != wqe_state_processing))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return NULL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); return wqe; @@ -676,16 +679,17 @@ int rxe_requester(struct rxe_qp *qp) struct rxe_queue *q = qp->sq.queue; struct rxe_ah *ah; struct rxe_av *av; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } if (unlikely(qp_state(qp) == IB_QPS_ERR)) { wqe = __req_next_wqe(qp); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (wqe) goto err; else @@ -700,10 +704,10 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wait_psn = 0; qp->req.need_retry = 0; qp->req.wait_for_rnr_timer = 0; - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); /* we come here if the retransmit timer has fired * or if the rnr timer has fired. If the retransmit @@ -853,7 +857,7 @@ int rxe_requester(struct rxe_qp *qp) update_state(qp, &pkt); /* A non-zero return value will cause rxe_do_task to - * exit its loop and end the tasklet. A zero return + * exit its loop and end the work item. A zero return * will continue looping and return to rxe_requester */ done: diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 68f6cd188d8e..64c64f5f36a8 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -387,7 +387,10 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, } } - return RESPST_CHK_RKEY; + if (pkt->mask & RXE_RDMA_OP_MASK) + return RESPST_CHK_RKEY; + else + return RESPST_EXECUTE; } /* if the reth length field is zero we can assume nothing @@ -434,6 +437,10 @@ static enum resp_states check_rkey(struct rxe_qp *qp, enum resp_states state; int access = 0; + /* parse RETH or ATMETH header for first/only packets + * for va, length, rkey, etc. or use current value for + * middle/last packets. 
+ */ if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (pkt->mask & RXE_RETH_MASK) qp_resp_from_reth(qp, pkt); @@ -454,7 +461,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, qp_resp_from_atmeth(qp, pkt); access = IB_ACCESS_REMOTE_ATOMIC; } else { - return RESPST_EXECUTE; + /* shouldn't happen */ + WARN_ON(1); } /* A zero-byte read or write op is not required to @@ -489,8 +497,9 @@ static enum resp_states check_rkey(struct rxe_qp *qp, if (mw->access & IB_ZERO_BASED) qp->resp.offset = mw->addr; - rxe_put(mw); rxe_get(mr); + rxe_put(mw); + mw = NULL; } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { @@ -1047,6 +1056,7 @@ static enum resp_states do_complete(struct rxe_qp *qp, struct ib_uverbs_wc *uwc = &cqe.uibwc; struct rxe_recv_wqe *wqe = qp->resp.wqe; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + unsigned long flags; if (!wqe) goto finish; @@ -1137,12 +1147,12 @@ static enum resp_states do_complete(struct rxe_qp *qp, return RESPST_ERR_CQ_OVERFLOW; finish: - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(qp_state(qp) == IB_QPS_ERR)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return RESPST_CHK_RESOURCE; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(!pkt)) return RESPST_DONE; @@ -1447,8 +1457,17 @@ static void flush_recv_queue(struct rxe_qp *qp, bool notify) struct rxe_recv_wqe *wqe; int err; - if (qp->srq) + if (qp->srq) { + if (notify && qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } return; + } while ((wqe = queue_head(q, q->type))) { if (notify) { @@ -1468,18 +1487,19 @@ int rxe_responder(struct rxe_qp *qp) enum resp_states state; struct rxe_pkt_info *pkt = NULL; int ret; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_req_pkts(qp); flush_recv_queue(qp, notify); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; @@ -1654,7 +1674,7 @@ int rxe_responder(struct rxe_qp *qp) } /* A non-zero return value will cause rxe_do_task to - * exit its loop and end the tasklet. A zero return + * exit its loop and end the work item. A zero return * will continue looping and return to rxe_responder */ done: diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index fb9a6bc8e620..1501120d4f52 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -6,8 +6,24 @@ #include "rxe.h" +static struct workqueue_struct *rxe_wq; + +int rxe_alloc_wq(void) +{ + rxe_wq = alloc_workqueue("rxe_wq", WQ_UNBOUND, WQ_MAX_ACTIVE); + if (!rxe_wq) + return -ENOMEM; + + return 0; +} + +void rxe_destroy_wq(void) +{ + destroy_workqueue(rxe_wq); +} + /* Check if task is idle i.e. not running, not scheduled in - * tasklet queue and not draining. If so move to busy to + * work queue and not draining. 
If so move to busy to * reserve a slot in do_task() by setting to busy and taking * a qp reference to cover the gap from now until the task finishes. * state will move out of busy if task returns a non zero value @@ -21,9 +37,6 @@ static bool __reserve_if_idle(struct rxe_task *task) { WARN_ON(rxe_read(task->qp) <= 0); - if (task->tasklet.state & BIT(TASKLET_STATE_SCHED)) - return false; - if (task->state == TASK_STATE_IDLE) { rxe_get(task->qp); task->state = TASK_STATE_BUSY; @@ -38,7 +51,7 @@ static bool __reserve_if_idle(struct rxe_task *task) } /* check if task is idle or drained and not currently - * scheduled in the tasklet queue. This routine is + * scheduled in the work queue. This routine is * called by rxe_cleanup_task or rxe_disable_task to * see if the queue is empty. * Context: caller should hold task->lock. @@ -46,7 +59,7 @@ static bool __reserve_if_idle(struct rxe_task *task) */ static bool __is_done(struct rxe_task *task) { - if (task->tasklet.state & BIT(TASKLET_STATE_SCHED)) + if (work_pending(&task->work)) return false; if (task->state == TASK_STATE_IDLE || @@ -77,23 +90,23 @@ static bool is_done(struct rxe_task *task) * schedules the task. They must call __reserve_if_idle to * move the task to busy before calling or scheduling. * The task can also be moved to drained or invalid - * by calls to rxe-cleanup_task or rxe_disable_task. + * by calls to rxe_cleanup_task or rxe_disable_task. * In that case tasks which get here are not executed but * just flushed. The tasks are designed to look to see if - * there is work to do and do part of it before returning + * there is work to do and then do part of it before returning * here with a return value of zero until all the work - * has been consumed then it retuens a non-zero value. + * has been consumed then it returns a non-zero value. * The number of times the task can be run is limited by * max iterations so one task cannot hold the cpu forever. + * If the limit is hit and work remains the task is rescheduled. */ -static void do_task(struct tasklet_struct *t) +static void do_task(struct rxe_task *task) { - int cont; - int ret; - struct rxe_task *task = from_tasklet(task, t, tasklet); unsigned int iterations; unsigned long flags; int resched = 0; + int cont; + int ret; WARN_ON(rxe_read(task->qp) <= 0); @@ -115,25 +128,22 @@ static void do_task(struct tasklet_struct *t) } while (ret == 0 && iterations-- > 0); spin_lock_irqsave(&task->lock, flags); + /* we're not done yet but we ran out of iterations. + * yield the cpu and reschedule the task + */ + if (!ret) { + task->state = TASK_STATE_IDLE; + resched = 1; + goto exit; + } + switch (task->state) { case TASK_STATE_BUSY: - if (ret) { - task->state = TASK_STATE_IDLE; - } else { - /* This can happen if the client - * can add work faster than the - * tasklet can finish it. 
- * Reschedule the tasklet and exit - * the loop to give up the cpu - */ - task->state = TASK_STATE_IDLE; - resched = 1; - } + task->state = TASK_STATE_IDLE; break; - /* someone tried to run the task since the last time we called - * func, so we will call one more time regardless of the - * return value + /* someone tried to schedule the task while we + * were running, keep going */ case TASK_STATE_ARMED: task->state = TASK_STATE_BUSY; @@ -141,22 +151,24 @@ static void do_task(struct tasklet_struct *t) break; case TASK_STATE_DRAINING: - if (ret) - task->state = TASK_STATE_DRAINED; - else - cont = 1; + task->state = TASK_STATE_DRAINED; break; default: WARN_ON(1); - rxe_info_qp(task->qp, "unexpected task state = %d", task->state); + rxe_dbg_qp(task->qp, "unexpected task state = %d", + task->state); + task->state = TASK_STATE_IDLE; } +exit: if (!cont) { task->num_done++; if (WARN_ON(task->num_done != task->num_sched)) - rxe_err_qp(task->qp, "%ld tasks scheduled, %ld tasks done", - task->num_sched, task->num_done); + rxe_dbg_qp( + task->qp, + "%ld tasks scheduled, %ld tasks done", + task->num_sched, task->num_done); } spin_unlock_irqrestore(&task->lock, flags); } while (cont); @@ -169,6 +181,12 @@ static void do_task(struct tasklet_struct *t) rxe_put(task->qp); } +/* wrapper around do_task to fix argument for work queue */ +static void do_work(struct work_struct *work) +{ + do_task(container_of(work, struct rxe_task, work)); +} + int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, int (*func)(struct rxe_qp *)) { @@ -176,11 +194,9 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, task->qp = qp; task->func = func; - - tasklet_setup(&task->tasklet, do_task); - task->state = TASK_STATE_IDLE; spin_lock_init(&task->lock); + INIT_WORK(&task->work, do_work); return 0; } @@ -213,8 +229,6 @@ void rxe_cleanup_task(struct rxe_task *task) while (!is_done(task)) cond_resched(); - tasklet_kill(&task->tasklet); - spin_lock_irqsave(&task->lock, flags); task->state = TASK_STATE_INVALID; spin_unlock_irqrestore(&task->lock, flags); @@ -226,7 +240,7 @@ void rxe_cleanup_task(struct rxe_task *task) void rxe_run_task(struct rxe_task *task) { unsigned long flags; - int run; + bool run; WARN_ON(rxe_read(task->qp) <= 0); @@ -235,11 +249,11 @@ void rxe_run_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); if (run) - do_task(&task->tasklet); + do_task(task); } -/* schedule the task to run later as a tasklet. - * the tasklet)schedule call can be called holding +/* schedule the task to run later as a work queue entry. + * the queue_work call can be called holding * the lock. 
*/ void rxe_sched_task(struct rxe_task *task) @@ -250,7 +264,7 @@ void rxe_sched_task(struct rxe_task *task) spin_lock_irqsave(&task->lock, flags); if (__reserve_if_idle(task)) - tasklet_schedule(&task->tasklet); + queue_work(rxe_wq, &task->work); spin_unlock_irqrestore(&task->lock, flags); } @@ -277,7 +291,9 @@ void rxe_disable_task(struct rxe_task *task) while (!is_done(task)) cond_resched(); - tasklet_disable(&task->tasklet); + spin_lock_irqsave(&task->lock, flags); + task->state = TASK_STATE_DRAINED; + spin_unlock_irqrestore(&task->lock, flags); } void rxe_enable_task(struct rxe_task *task) @@ -291,7 +307,7 @@ void rxe_enable_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); return; } + task->state = TASK_STATE_IDLE; - tasklet_enable(&task->tasklet); spin_unlock_irqrestore(&task->lock, flags); } diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index facb7c8e3729..a63e258b3d66 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -22,7 +22,7 @@ enum { * called again. */ struct rxe_task { - struct tasklet_struct tasklet; + struct work_struct work; int state; spinlock_t lock; struct rxe_qp *qp; @@ -32,6 +32,10 @@ struct rxe_task { long num_done; }; +int rxe_alloc_wq(void); + +void rxe_destroy_wq(void); + /* * init rxe_task structure * qp => parameter to pass to func diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index dea605b7f683..903f0b71447e 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -904,10 +904,10 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, if (!err) rxe_sched_task(&qp->req.task); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->comp.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return err; } @@ -917,22 +917,23 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, { struct rxe_qp *qp = to_rqp(ibqp); int err; + unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed"); return -EINVAL; } if (unlikely(qp_state(qp) < IB_QPS_RTS)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_err_qp(qp, "qp not ready to send"); return -EINVAL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->is_user) { /* Utilize process context to do protocol processing */ @@ -1008,22 +1009,22 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, struct rxe_rq *rq = &qp->rq; unsigned long flags; - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed"); return -EINVAL; } /* see C10-97.2.1 */ if (unlikely((qp_state(qp) < IB_QPS_INIT))) { - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_dbg_qp(qp, "qp not ready to post recv"); return -EINVAL; } - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); if 
(unlikely(qp->srq)) { *bad_wr = wr; @@ -1044,10 +1045,10 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, spin_unlock_irqrestore(&rq->producer_lock, flags); - spin_lock_bh(&qp->state_lock); + spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->resp.task); - spin_unlock_bh(&qp->state_lock); + spin_unlock_irqrestore(&qp->state_lock, flags); return err; } @@ -1181,9 +1182,7 @@ static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) unsigned long irq_flags; spin_lock_irqsave(&cq->cq_lock, irq_flags); - if (cq->notify != IB_CQ_NEXT_COMP) - cq->notify = flags & IB_CQ_SOLICITED_MASK; - + cq->notify |= flags & IB_CQ_SOLICITED_MASK; empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) @@ -1260,6 +1259,12 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, struct rxe_mr *mr; int err, cleanup_err; + if (access & ~RXE_ACCESS_SUPPORTED_MR) { + rxe_err_pd(pd, "access = %#x not supported (%#x)", access, + RXE_ACCESS_SUPPORTED_MR); + return ERR_PTR(-EOPNOTSUPP); + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -1293,6 +1298,40 @@ err_free: return ERR_PTR(err); } +static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags, + u64 start, u64 length, u64 iova, + int access, struct ib_pd *ibpd, + struct ib_udata *udata) +{ + struct rxe_mr *mr = to_rmr(ibmr); + struct rxe_pd *old_pd = to_rpd(ibmr->pd); + struct rxe_pd *pd = to_rpd(ibpd); + + /* for now only support the two easy cases: + * rereg_pd and rereg_access + */ + if (flags & ~RXE_MR_REREG_SUPPORTED) { + rxe_err_mr(mr, "flags = %#x not supported", flags); + return ERR_PTR(-EOPNOTSUPP); + } + + if (flags & IB_MR_REREG_PD) { + rxe_put(old_pd); + rxe_get(pd); + mr->ibmr.pd = ibpd; + } + + if (flags & IB_MR_REREG_ACCESS) { + if (access & ~RXE_ACCESS_SUPPORTED_MR) { + rxe_err_mr(mr, "access = %#x not supported", access); + return ERR_PTR(-EOPNOTSUPP); + } + mr->access = access; + } + + return NULL; +} + static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg) { @@ -1356,7 +1395,7 @@ static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err); - kfree_rcu(mr); + kfree_rcu_mightsleep(mr); return 0; err_out: @@ -1445,6 +1484,7 @@ static const struct ib_device_ops rxe_dev_ops = { .query_srq = rxe_query_srq, .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, + .rereg_user_mr = rxe_rereg_user_mr, .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 26a20f088692..ccb9d19ffe8a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -253,6 +253,22 @@ struct rxe_qp { struct execute_work cleanup_work; }; +enum { + RXE_ACCESS_REMOTE = IB_ACCESS_REMOTE_READ + | IB_ACCESS_REMOTE_WRITE + | IB_ACCESS_REMOTE_ATOMIC, + RXE_ACCESS_SUPPORTED_MR = RXE_ACCESS_REMOTE + | IB_ACCESS_LOCAL_WRITE + | IB_ACCESS_MW_BIND + | IB_ACCESS_ON_DEMAND + | IB_ACCESS_FLUSH_GLOBAL + | IB_ACCESS_FLUSH_PERSISTENT + | IB_ACCESS_OPTIONAL, + RXE_ACCESS_SUPPORTED_QP = RXE_ACCESS_SUPPORTED_MR, + RXE_ACCESS_SUPPORTED_MW = RXE_ACCESS_SUPPORTED_MR + | IB_ZERO_BASED, +}; + enum rxe_mr_state { RXE_MR_STATE_INVALID, RXE_MR_STATE_FREE, @@ -269,6 +285,11 @@ enum rxe_mr_lookup_type { RXE_LOOKUP_REMOTE, }; +enum rxe_rereg { + 
RXE_MR_REREG_SUPPORTED = IB_MR_REREG_PD + | IB_MR_REREG_ACCESS, +}; + static inline int rkey_is_mw(u32 rkey) { u32 index = rkey >> 8; diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index f51ab2ccf151..e6e25f15567d 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -422,7 +422,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) umem->page_chunk[i].plist = plist; while (nents) { rv = pin_user_pages(first_page_va, nents, foll_flags, - plist, NULL); + plist); if (rv < 0) goto out_sem_up; diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 4b292e0504f1..7c7a51d36d0c 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -312,7 +312,7 @@ static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, } /* - * 0copy TCP transmit interface: Use do_tcp_sendpages. + * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES. * * Using sendpage to push page by page appears to be less efficient * than using sendmsg, even if data are copied. @@ -323,20 +323,26 @@ static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, size_t size) { + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SPLICE_PAGES), + }; struct sock *sk = s->sk; - int i = 0, rv = 0, sent = 0, - flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST; + int i = 0, rv = 0, sent = 0; while (size) { size_t bytes = min_t(size_t, PAGE_SIZE - offset, size); if (size + offset <= PAGE_SIZE) - flags = MSG_MORE | MSG_DONTWAIT; + msg.msg_flags &= ~MSG_MORE; tcp_rate_check_app_limited(sk); + bvec_set_page(&bvec, page[i], bytes, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); + try_page_again: lock_sock(sk); - rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags); + rv = tcp_sendmsg_locked(sk, &msg, size); release_sock(sk); if (rv > 0) { diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index f290cd49698e..92e1e7587af8 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -657,9 +657,13 @@ static int isert_connect_error(struct rdma_cm_id *cma_id) { struct isert_conn *isert_conn = cma_id->qp->qp_context; + struct isert_np *isert_np = cma_id->context; ib_drain_qp(isert_conn->qp); + + mutex_lock(&isert_np->mutex); list_del_init(&isert_conn->node); + mutex_unlock(&isert_np->mutex); isert_conn->cm_id = NULL; isert_put_conn(isert_conn); @@ -2431,6 +2435,7 @@ isert_free_np(struct iscsi_np *np) { struct isert_np *isert_np = np->np_context; struct isert_conn *isert_conn, *n; + LIST_HEAD(drop_conn_list); if (isert_np->cm_id) rdma_destroy_id(isert_np->cm_id); @@ -2450,7 +2455,7 @@ isert_free_np(struct iscsi_np *np) node) { isert_info("cleaning isert_conn %p state (%d)\n", isert_conn, isert_conn->state); - isert_connect_release(isert_conn); + list_move_tail(&isert_conn->node, &drop_conn_list); } } @@ -2461,11 +2466,16 @@ isert_free_np(struct iscsi_np *np) node) { isert_info("cleaning isert_conn %p state (%d)\n", isert_conn, isert_conn->state); - isert_connect_release(isert_conn); + list_move_tail(&isert_conn->node, &drop_conn_list); } } mutex_unlock(&isert_np->mutex); + list_for_each_entry_safe(isert_conn, n, &drop_conn_list, node) { + list_del_init(&isert_conn->node); + isert_connect_release(isert_conn); + } + np->np_context = NULL; 
kfree(isert_np); } @@ -2560,8 +2570,6 @@ static void isert_wait_conn(struct iscsit_conn *conn) isert_put_unsol_pending_cmds(conn); isert_wait4cmds(conn); isert_wait4logout(isert_conn); - - queue_work(isert_release_wq, &isert_conn->release_work); } static void isert_free_conn(struct iscsit_conn *conn) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index edb2e3a25880..b32941dd67cb 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1710,7 +1710,6 @@ static int create_con_cq_qp(struct rtrs_clt_con *con) return -ENOMEM; con->queue_num = cq_num; } - cq_num = max_send_wr + max_recv_wr; cq_vector = con->cpu % clt_path->s.dev->ib_dev->num_comp_vectors; if (con->c.cid >= clt_path->s.irq_con_num) err = rtrs_cq_qp_create(&clt_path->s, &con->c, max_send_sge, @@ -2040,6 +2039,7 @@ static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id, return 0; } +/* The caller should do the cleanup in case of error */ static int create_cm(struct rtrs_clt_con *con) { struct rtrs_path *s = con->c.path; @@ -2062,14 +2062,14 @@ static int create_cm(struct rtrs_clt_con *con) err = rdma_set_reuseaddr(cm_id, 1); if (err != 0) { rtrs_err(s, "Set address reuse failed, err: %d\n", err); - goto destroy_cm; + return err; } err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr, (struct sockaddr *)&clt_path->s.dst_addr, RTRS_CONNECT_TIMEOUT_MS); if (err) { rtrs_err(s, "Failed to resolve address, err: %d\n", err); - goto destroy_cm; + return err; } /* * Combine connection status and session events. This is needed @@ -2084,29 +2084,15 @@ static int create_cm(struct rtrs_clt_con *con) if (err == 0) err = -ETIMEDOUT; /* Timedout or interrupted */ - goto errr; - } - if (con->cm_err < 0) { - err = con->cm_err; - goto errr; + return err; } - if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING) { + if (con->cm_err < 0) + return con->cm_err; + if (READ_ONCE(clt_path->state) != RTRS_CLT_CONNECTING) /* Device removal */ - err = -ECONNABORTED; - goto errr; - } + return -ECONNABORTED; return 0; - -errr: - stop_cm(con); - mutex_lock(&con->con_mutex); - destroy_con_cq_qp(con); - mutex_unlock(&con->con_mutex); -destroy_cm: - destroy_cm(con); - - return err; } static void rtrs_clt_path_up(struct rtrs_clt_path *clt_path) @@ -2334,7 +2320,7 @@ static void rtrs_clt_close_work(struct work_struct *work) static int init_conns(struct rtrs_clt_path *clt_path) { unsigned int cid; - int err; + int err, i; /* * On every new session connections increase reconnect counter @@ -2350,10 +2336,8 @@ static int init_conns(struct rtrs_clt_path *clt_path) goto destroy; err = create_cm(to_clt_con(clt_path->s.con[cid])); - if (err) { - destroy_con(to_clt_con(clt_path->s.con[cid])); + if (err) goto destroy; - } } err = alloc_path_reqs(clt_path); if (err) @@ -2364,15 +2348,21 @@ static int init_conns(struct rtrs_clt_path *clt_path) return 0; destroy: - while (cid--) { - struct rtrs_clt_con *con = to_clt_con(clt_path->s.con[cid]); + /* Make sure we do the cleanup in the order they are created */ + for (i = 0; i <= cid; i++) { + struct rtrs_clt_con *con; - stop_cm(con); + if (!clt_path->s.con[i]) + break; - mutex_lock(&con->con_mutex); - destroy_con_cq_qp(con); - mutex_unlock(&con->con_mutex); - destroy_cm(con); + con = to_clt_con(clt_path->s.con[i]); + if (con->c.cm_id) { + stop_cm(con); + mutex_lock(&con->con_mutex); + destroy_con_cq_qp(con); + mutex_unlock(&con->con_mutex); + destroy_cm(con); + } destroy_con(con); } /* diff --git 
a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index 4bf9d868cc52..3696f367ff51 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -37,8 +37,10 @@ struct rtrs_iu *rtrs_iu_alloc(u32 iu_num, size_t size, gfp_t gfp_mask, goto err; iu->dma_addr = ib_dma_map_single(dma_dev, iu->buf, size, dir); - if (ib_dma_mapping_error(dma_dev, iu->dma_addr)) + if (ib_dma_mapping_error(dma_dev, iu->dma_addr)) { + kfree(iu->buf); goto err; + } iu->cqe.done = done; iu->size = size;
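Another theme running through the rxe hunks in this patch is the move from per-task tasklets to a single module-wide unbound workqueue: rxe_module_init() now allocates the workqueue before rxe_net_init() and destroys it on the error path and in module exit, while struct rxe_task carries a work_struct driven through a do_work() wrapper. The sketch below shows that shape under the same ordering; rxe_wq, do_work and the alloc_workqueue() flags come from the diff, whereas struct my_task, my_task_init(), my_task_sched() and the demo handler are hypothetical stand-ins for struct rxe_task and its machinery.

/*
 * Sketch: module-scoped unbound workqueue replacing per-task tasklets,
 * keeping the create-first/destroy-last ordering used by rxe_module_init()
 * and rxe_module_exit().
 */
#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

static struct workqueue_struct *rxe_wq;

struct my_task {			/* stand-in for struct rxe_task */
	struct work_struct work;
	int (*func)(void *arg);
	void *arg;
};

static void do_work(struct work_struct *work)
{
	struct my_task *task = container_of(work, struct my_task, work);

	/*
	 * The real driver loops here with an iteration budget and uses the
	 * handler's return value to decide whether to reschedule itself.
	 */
	task->func(task->arg);
}

static void my_task_init(struct my_task *task, int (*func)(void *), void *arg)
{
	task->func = func;
	task->arg = arg;
	INIT_WORK(&task->work, do_work);
}

static void my_task_sched(struct my_task *task)
{
	queue_work(rxe_wq, &task->work);
}

static int demo_handler(void *arg)
{
	pr_info("work item ran: %s\n", (char *)arg);
	return 0;
}

static struct my_task demo_task;
static char demo_msg[] = "demo";

static int __init example_init(void)
{
	/* Create the workqueue before anything that can schedule work. */
	rxe_wq = alloc_workqueue("rxe_wq", WQ_UNBOUND, WQ_MAX_ACTIVE);
	if (!rxe_wq)
		return -ENOMEM;

	my_task_init(&demo_task, demo_handler, demo_msg);
	my_task_sched(&demo_task);
	return 0;
}

static void __exit example_exit(void)
{
	/* destroy_workqueue() drains pending work before freeing it. */
	destroy_workqueue(rxe_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");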