Diffstat (limited to 'net')
72 files changed, 6105 insertions, 1296 deletions
diff --git a/net/ceph/Makefile b/net/ceph/Makefile index 958d9856912c..84cbed630c4b 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o + pagevec.o snapshot.o string_table.o diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 55d2bfee16d7..bddfcf6f09c2 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -747,6 +747,8 @@ out: static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); + WARN_ON(!ceph_strings_empty()); + ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c index 41466ccb972a..7d54e944de5e 100644 --- a/net/ceph/ceph_fs.c +++ b/net/ceph/ceph_fs.c @@ -9,9 +9,9 @@ */ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) { - __u32 su = le32_to_cpu(layout->fl_stripe_unit); - __u32 sc = le32_to_cpu(layout->fl_stripe_count); - __u32 os = le32_to_cpu(layout->fl_object_size); + __u32 su = layout->stripe_unit; + __u32 sc = layout->stripe_count; + __u32 os = layout->object_size; /* stripe unit, object size must be non-zero, 64k increment */ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) @@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) return 1; } +void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit); + fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count); + fl->object_size = le32_to_cpu(legacy->fl_object_size); + fl->pool_id = le32_to_cpu(legacy->fl_pg_pool); + if (fl->pool_id == 0) + fl->pool_id = -1; +} +EXPORT_SYMBOL(ceph_file_layout_from_legacy); + +void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, + struct ceph_file_layout_legacy *legacy) +{ + legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit); + legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count); + legacy->fl_object_size = cpu_to_le32(fl->object_size); + if (fl->pool_id >= 0) + legacy->fl_pg_pool = cpu_to_le32(fl->pool_id); + else + legacy->fl_pg_pool = 0; +} +EXPORT_SYMBOL(ceph_file_layout_to_legacy); int ceph_flags_to_mode(int flags) { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index e77b04ca7802..c62b2b029a6e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) seq_printf(s, "]/%d\t[", t->up.primary); for (i = 0; i < t->acting.size; i++) seq_printf(s, "%s%d", (!i ? 
"" : ","), t->acting.osds[i]); - seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, - t->target_oid.name_len, t->target_oid.name, t->flags); + seq_printf(s, "]/%d\t", t->acting.primary); + if (t->target_oloc.pool_ns) { + seq_printf(s, "%*pE/%*pE\t0x%x", + (int)t->target_oloc.pool_ns->len, + t->target_oloc.pool_ns->str, + t->target_oid.name_len, t->target_oid.name, t->flags); + } else { + seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len, + t->target_oid.name, t->flags); + } if (t->paused) seq_puts(s, "\tP"); } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 37c38a7fb5c5..c83326c5ba58 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc) } const char *ceph_sub_str[] = { - [CEPH_SUB_MDSMAP] = "mdsmap", [CEPH_SUB_MONMAP] = "monmap", [CEPH_SUB_OSDMAP] = "osdmap", + [CEPH_SUB_FSMAP] = "fsmap.user", + [CEPH_SUB_MDSMAP] = "mdsmap", }; /* @@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: + case CEPH_MSG_FS_MAP_USER: m = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!m) return NULL; /* ENOMEM--return skip == 0 */ diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index ddec1c10ac80..aaed59a47b1d 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c @@ -5,6 +5,7 @@ #include <linux/types.h> #include <linux/vmalloc.h> +#include <linux/ceph/messenger.h> #include <linux/ceph/msgpool.h> static void *msgpool_alloc(gfp_t gfp_mask, void *arg) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 89469592076c..b5ec09612ff7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest, static void target_destroy(struct ceph_osd_request_target *t) { ceph_oid_destroy(&t->base_oid); + ceph_oloc_destroy(&t->base_oloc); ceph_oid_destroy(&t->target_oid); + ceph_oloc_destroy(&t->target_oloc); } /* @@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); +static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc) +{ + return 8 + 4 + 4 + 4 + (oloc->pool_ns ? 
oloc->pool_ns->len : 0); +} + int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) { struct ceph_osd_client *osdc = req->r_osdc; @@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) int msg_size; WARN_ON(ceph_oid_empty(&req->r_base_oid)); + WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ - msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += CEPH_ENCODING_START_BLK_LEN + + ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pgid */ msg_size += 4 + req->r_base_oid.name_len; /* oid */ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); @@ -932,7 +941,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); } else { - u32 object_size = le32_to_cpu(layout->fl_object_size); + u32 object_size = layout->object_size; u32 object_base = off - objoff; if (!(truncate_seq == 1 && truncate_size == -1ULL)) { if (truncate_size <= object_base) { @@ -948,7 +957,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } req->r_flags = flags; - req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); + req->r_base_oloc.pool = layout->pool_id; + req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); req->r_snapid = vino.snap; @@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) p += sizeof(req->r_replay_version); /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); + ceph_start_encoding(&p, 5, 4, + ceph_oloc_encoding_size(&req->r_t.target_oloc)); ceph_encode_64(&p, req->r_t.target_oloc.pool); ceph_encode_32(&p, -1); /* preferred */ ceph_encode_32(&p, 0); /* key len */ + if (req->r_t.target_oloc.pool_ns) + ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str, + req->r_t.target_oloc.pool_ns->len); + else + ceph_encode_32(&p, 0); /* pgid */ ceph_encode_8(&p, 1); @@ -2594,9 +2608,22 @@ static int ceph_oloc_decode(void **p, void *end, } if (struct_v >= 5) { + bool changed = false; + len = ceph_decode_32(p); if (len > 0) { - pr_warn("ceph_object_locator::nspace is set\n"); + ceph_decode_need(p, end, len, e_inval); + if (!oloc->pool_ns || + ceph_compare_string(oloc->pool_ns, *p, len)) + changed = true; + *p += len; + } else { + if (oloc->pool_ns) + changed = true; + } + if (changed) { + /* redirect changes namespace */ + pr_warn("ceph_object_locator::nspace is changed\n"); goto e_inval; } } @@ -2806,7 +2833,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) goto out_unlock_session; } + m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns; ret = decode_MOSDOpReply(msg, &m); + m.redirect.oloc.pool_ns = NULL; if (ret) { pr_err("failed to decode MOSDOpReply for tid %llu: %d\n", req->r_tid, ret); @@ -2835,7 +2864,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) unlink_request(osd, req); mutex_unlock(&osd->lock); - ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); + /* + * Not ceph_oloc_copy() - changing pool_ns is not + * supported. 
+ */ + req->r_t.target_oloc.pool = m.redirect.oloc.pool; req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; req->r_tid = 0; __submit_request(req, false); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7e480bf75bcf..d2436880b305 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1510,6 +1510,24 @@ bad: return ERR_PTR(err); } +void ceph_oloc_copy(struct ceph_object_locator *dest, + const struct ceph_object_locator *src) +{ + WARN_ON(!ceph_oloc_empty(dest)); + WARN_ON(dest->pool_ns); /* empty() only covers ->pool */ + + dest->pool = src->pool; + if (src->pool_ns) + dest->pool_ns = ceph_get_string(src->pool_ns); +} +EXPORT_SYMBOL(ceph_oloc_copy); + +void ceph_oloc_destroy(struct ceph_object_locator *oloc) +{ + ceph_put_string(oloc->pool_ns); +} +EXPORT_SYMBOL(ceph_oloc_destroy); + void ceph_oid_copy(struct ceph_object_id *dest, const struct ceph_object_id *src) { @@ -1770,9 +1788,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *ono, u64 *oxoff, u64 *oxlen) { - u32 osize = le32_to_cpu(layout->fl_object_size); - u32 su = le32_to_cpu(layout->fl_stripe_unit); - u32 sc = le32_to_cpu(layout->fl_stripe_count); + u32 osize = layout->object_size; + u32 su = layout->stripe_unit; + u32 sc = layout->stripe_count; u32 bl, stripeno, stripepos, objsetno; u32 su_per_object; u64 t, su_offset; @@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, if (!pi) return -ENOENT; - raw_pgid->pool = oloc->pool; - raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, - oid->name_len); - - dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, - raw_pgid->pool, raw_pgid->seed); + if (!oloc->pool_ns) { + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, + raw_pgid->pool, raw_pgid->seed); + } else { + char stack_buf[256]; + char *buf = stack_buf; + int nsl = oloc->pool_ns->len; + size_t total = nsl + 1 + oid->name_len; + + if (total > sizeof(stack_buf)) { + buf = kmalloc(total, GFP_NOIO); + if (!buf) + return -ENOMEM; + } + memcpy(buf, oloc->pool_ns->str, nsl); + buf[nsl] = '\037'; + memcpy(buf + nsl + 1, oid->name, oid->name_len); + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total); + if (buf != stack_buf) + kfree(buf); + dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__, + oid->name, nsl, oloc->pool_ns->str, + raw_pgid->pool, raw_pgid->seed); + } return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c new file mode 100644 index 000000000000..ca53c8319209 --- /dev/null +++ b/net/ceph/string_table.c @@ -0,0 +1,111 @@ +#include <linux/slab.h> +#include <linux/gfp.h> +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/ceph/string_table.h> + +static DEFINE_SPINLOCK(string_tree_lock); +static struct rb_root string_tree = RB_ROOT; + +struct ceph_string *ceph_find_or_create_string(const char* str, size_t len) +{ + struct ceph_string *cs, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&string_tree_lock); + p = &string_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_string, node); + ret = ceph_compare_string(exist, str, len); + if (ret > 0) + p = &(*p)->rb_left; + else if (ret < 0) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist && !kref_get_unless_zero(&exist->kref)) { + rb_erase(&exist->node, &string_tree); + RB_CLEAR_NODE(&exist->node); + exist = 
NULL; + } + spin_unlock(&string_tree_lock); + if (exist) + return exist; + + cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS); + if (!cs) + return NULL; + + kref_init(&cs->kref); + cs->len = len; + memcpy(cs->str, str, len); + cs->str[len] = 0; + +retry: + exist = NULL; + parent = NULL; + p = &string_tree.rb_node; + spin_lock(&string_tree_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_string, node); + ret = ceph_compare_string(exist, str, len); + if (ret > 0) + p = &(*p)->rb_left; + else if (ret < 0) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + ret = 0; + if (!exist) { + rb_link_node(&cs->node, parent, p); + rb_insert_color(&cs->node, &string_tree); + } else if (!kref_get_unless_zero(&exist->kref)) { + rb_erase(&exist->node, &string_tree); + RB_CLEAR_NODE(&exist->node); + ret = -EAGAIN; + } + spin_unlock(&string_tree_lock); + if (ret == -EAGAIN) + goto retry; + + if (exist) { + kfree(cs); + cs = exist; + } + + return cs; +} +EXPORT_SYMBOL(ceph_find_or_create_string); + +static void ceph_free_string(struct rcu_head *head) +{ + struct ceph_string *cs = container_of(head, struct ceph_string, rcu); + kfree(cs); +} + +void ceph_release_string(struct kref *ref) +{ + struct ceph_string *cs = container_of(ref, struct ceph_string, kref); + + spin_lock(&string_tree_lock); + if (!RB_EMPTY_NODE(&cs->node)) { + rb_erase(&cs->node, &string_tree); + RB_CLEAR_NODE(&cs->node); + } + spin_unlock(&string_tree_lock); + + call_rcu(&cs->rcu, ceph_free_string); +} +EXPORT_SYMBOL(ceph_release_string); + +bool ceph_strings_empty(void) +{ + return RB_EMPTY_ROOT(&string_tree); +} diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 3ff137d9471d..3828f94b234c 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -216,14 +216,17 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req skb = dccp_make_response(sk, dst, req); if (skb != NULL) { struct dccp_hdr *dh = dccp_hdr(skb); + struct ipv6_txoptions *opt; dh->dccph_checksum = dccp_v6_csum_finish(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; rcu_read_lock(); - err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt), - np->tclass); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); + err = ip6_xmit(sk, skb, &fl6, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -236,6 +239,7 @@ done: static void dccp_v6_reqsk_destructor(struct request_sock *req) { dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); + kfree(inet_rsk(req)->ipv6_opt); kfree_skb(inet_rsk(req)->pktopts); } @@ -494,7 +498,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. 
*/ - opt = rcu_dereference(np->opt); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 40d6b87713a1..72d6f056d863 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -135,76 +135,6 @@ int cipso_v4_rbm_strictvalid = 1; */ /** - * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit - * @bitmap: the bitmap - * @bitmap_len: length in bits - * @offset: starting offset - * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit - * - * Description: - * Starting at @offset, walk the bitmap from left to right until either the - * desired bit is found or we reach the end. Return the bit offset, -1 if - * not found, or -2 if error. - */ -static int cipso_v4_bitmap_walk(const unsigned char *bitmap, - u32 bitmap_len, - u32 offset, - u8 state) -{ - u32 bit_spot; - u32 byte_offset; - unsigned char bitmask; - unsigned char byte; - - /* gcc always rounds to zero when doing integer division */ - byte_offset = offset / 8; - byte = bitmap[byte_offset]; - bit_spot = offset; - bitmask = 0x80 >> (offset % 8); - - while (bit_spot < bitmap_len) { - if ((state && (byte & bitmask) == bitmask) || - (state == 0 && (byte & bitmask) == 0)) - return bit_spot; - - bit_spot++; - bitmask >>= 1; - if (bitmask == 0) { - byte = bitmap[++byte_offset]; - bitmask = 0x80; - } - } - - return -1; -} - -/** - * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap - * @bitmap: the bitmap - * @bit: the bit - * @state: if non-zero, set the bit (1) else clear the bit (0) - * - * Description: - * Set a single bit in the bitmask. Returns zero on success, negative values - * on error. - */ -static void cipso_v4_bitmap_setbit(unsigned char *bitmap, - u32 bit, - u8 state) -{ - u32 byte_spot; - u8 bitmask; - - /* gcc always rounds to zero when doing integer division */ - byte_spot = bit / 8; - bitmask = 0x80 >> (bit % 8); - if (state) - bitmap[byte_spot] |= bitmask; - else - bitmap[byte_spot] &= ~bitmask; -} - -/** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free * @@ -840,10 +770,10 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { - cat = cipso_v4_bitmap_walk(bitmap, - bitmap_len_bits, - cat + 1, - 1); + cat = netlbl_bitmap_walk(bitmap, + bitmap_len_bits, + cat + 1, + 1); if (cat < 0) break; if (cat >= cipso_cat_size || @@ -909,7 +839,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, } if (net_spot >= net_clen_bits) return -ENOSPC; - cipso_v4_bitmap_setbit(net_cat, net_spot, 1); + netlbl_bitmap_setbit(net_cat, net_spot, 1); if (net_spot > net_spot_max) net_spot_max = net_spot; @@ -951,10 +881,10 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, } for (;;) { - net_spot = cipso_v4_bitmap_walk(net_cat, - net_clen_bits, - net_spot + 1, - 1); + net_spot = netlbl_bitmap_walk(net_cat, + net_clen_bits, + net_spot + 1, + 1); if (net_spot < 0) { if (net_spot == -2) return -EFAULT; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f9f9e375d7de..3ebf45b38bc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6147,6 +6147,9 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, kmemcheck_annotate_bitfield(ireq, flags); ireq->opt = NULL; +#if IS_ENABLED(CONFIG_IPV6) + ireq->pktopts = NULL; 
+#endif atomic64_set(&ireq->ir_cookie, 0); ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b26aa870adc0..bdaef7fd6e47 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -236,7 +236,8 @@ void tcp_select_initial_window(int __space, __u32 mss, /* Set window scaling on max possible window * See RFC1323 for an explanation of the limit to 14 */ - space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); + space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > 65535 && (*rcv_wscale) < 14) { space >>= 1; diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 6d8ea099213e..c174ccb340a1 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -22,6 +22,7 @@ ipv6-$(CONFIG_NETFILTER) += netfilter.o ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o +ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6287a8b9f428..ab3e796596b1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3624,8 +3624,7 @@ restart: state = ifa->state; ifa->state = INET6_IFADDR_STATE_DEAD; - list_del(&ifa->if_list); - list_add(&ifa->if_list, &del_list); + list_move(&ifa->if_list, &del_list); } spin_unlock_bh(&ifa->lock); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2076c21107d0..b454055ba625 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,7 @@ #ifdef CONFIG_IPV6_TUNNEL #include <net/ip6_tunnel.h> #endif +#include <net/calipso.h> #include <asm/uaccess.h> #include <linux/mroute6.h> @@ -983,6 +984,10 @@ static int __init inet6_init(void) if (err) goto pingv6_fail; + err = calipso_init(); + if (err) + goto calipso_fail; + #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) @@ -993,8 +998,10 @@ out: #ifdef CONFIG_SYSCTL sysctl_fail: - pingv6_exit(); + calipso_exit(); #endif +calipso_fail: + pingv6_exit(); pingv6_fail: ipv6_packet_cleanup(); ipv6_packet_fail: diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c new file mode 100644 index 000000000000..c53b92c617c5 --- /dev/null +++ b/net/ipv6/calipso.c @@ -0,0 +1,1473 @@ +/* + * CALIPSO - Common Architecture Label IPv6 Security Option + * + * This is an implementation of the CALIPSO protocol as specified in + * RFC 5570. + * + * Authors: Paul Moore <paul.moore@hp.com> + * Huw Davies <huw@codeweavers.com> + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 + * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/jhash.h> +#include <linux/audit.h> +#include <linux/slab.h> +#include <net/ip.h> +#include <net/icmp.h> +#include <net/tcp.h> +#include <net/netlabel.h> +#include <net/calipso.h> +#include <linux/atomic.h> +#include <linux/bug.h> +#include <asm/unaligned.h> +#include <linux/crc-ccitt.h> + +/* Maximium size of the calipso option including + * the two-byte TLV header. + */ +#define CALIPSO_OPT_LEN_MAX (2 + 252) + +/* Size of the minimum calipso option including + * the two-byte TLV header. + */ +#define CALIPSO_HDR_LEN (2 + 8) + +/* Maximium size of the calipso option including + * the two-byte TLV header and upto 3 bytes of + * leading pad and 7 bytes of trailing pad. + */ +#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) + + /* Maximium size of u32 aligned buffer required to hold calipso + * option. Max of 3 initial pad bytes starting from buffer + 3. + * i.e. the worst case is when the previous tlv finishes on 4n + 3. + */ +#define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX) + +/* List of available DOI definitions */ +static DEFINE_SPINLOCK(calipso_doi_list_lock); +static LIST_HEAD(calipso_doi_list); + +/* Label mapping cache */ +int calipso_cache_enabled = 1; +int calipso_cache_bucketsize = 10; +#define CALIPSO_CACHE_BUCKETBITS 7 +#define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS) +#define CALIPSO_CACHE_REORDERLIMIT 10 +struct calipso_map_cache_bkt { + spinlock_t lock; + u32 size; + struct list_head list; +}; + +struct calipso_map_cache_entry { + u32 hash; + unsigned char *key; + size_t key_len; + + struct netlbl_lsm_cache *lsm_data; + + u32 activity; + struct list_head list; +}; + +static struct calipso_map_cache_bkt *calipso_cache; + +/* Label Mapping Cache Functions + */ + +/** + * calipso_cache_entry_free - Frees a cache entry + * @entry: the entry to free + * + * Description: + * This function frees the memory associated with a cache entry including the + * LSM cache data if there are no longer any users, i.e. reference count == 0. + * + */ +static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry) +{ + if (entry->lsm_data) + netlbl_secattr_cache_free(entry->lsm_data); + kfree(entry->key); + kfree(entry); +} + +/** + * calipso_map_cache_hash - Hashing function for the CALIPSO cache + * @key: the hash key + * @key_len: the length of the key in bytes + * + * Description: + * The CALIPSO tag hashing function. Returns a 32-bit hash value. + * + */ +static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +/** + * calipso_cache_init - Initialize the CALIPSO cache + * + * Description: + * Initializes the CALIPSO label mapping cache, this function should be called + * before any of the other functions defined in this file. Returns zero on + * success, negative values on error. 
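For illustration only, not part of the patch: the cache above hashes the raw option bytes and masks the result down to one of BIT(CALIPSO_CACHE_BUCKETBITS) buckets, which is why the bucket count is a power of two. A minimal userspace sketch of that bucket selection, with a stand-in FNV-1a hash in place of the kernel's jhash():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CALIPSO_CACHE_BUCKETBITS 7
#define CALIPSO_CACHE_BUCKETS    (1u << CALIPSO_CACHE_BUCKETBITS)

/* stand-in for the kernel's jhash(key, key_len, 0); illustration only */
static uint32_t toy_hash(const unsigned char *key, size_t len)
{
	uint32_t h = 2166136261u;	/* FNV-1a offset basis */

	while (len--)
		h = (h ^ *key++) * 16777619u;
	return h;
}

int main(void)
{
	const unsigned char key[] = { 0x00, 0x00, 0x00, 0x01, 0x01, 0x00 };
	uint32_t hash = toy_hash(key, sizeof(key));
	uint32_t bkt = hash & (CALIPSO_CACHE_BUCKETS - 1);	/* power-of-two mask, no modulo */

	assert(bkt < CALIPSO_CACHE_BUCKETS);
	printf("hash 0x%08x -> bucket %u of %u\n",
	       (unsigned)hash, (unsigned)bkt, (unsigned)CALIPSO_CACHE_BUCKETS);
	return 0;
}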
+ * + */ +static int __init calipso_cache_init(void) +{ + u32 iter; + + calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, + sizeof(struct calipso_map_cache_bkt), + GFP_KERNEL); + if (!calipso_cache) + return -ENOMEM; + + for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { + spin_lock_init(&calipso_cache[iter].lock); + calipso_cache[iter].size = 0; + INIT_LIST_HEAD(&calipso_cache[iter].list); + } + + return 0; +} + +/** + * calipso_cache_invalidate - Invalidates the current CALIPSO cache + * + * Description: + * Invalidates and frees any entries in the CALIPSO cache. Returns zero on + * success and negative values on failure. + * + */ +static void calipso_cache_invalidate(void) +{ + struct calipso_map_cache_entry *entry, *tmp_entry; + u32 iter; + + for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { + spin_lock_bh(&calipso_cache[iter].lock); + list_for_each_entry_safe(entry, + tmp_entry, + &calipso_cache[iter].list, list) { + list_del(&entry->list); + calipso_cache_entry_free(entry); + } + calipso_cache[iter].size = 0; + spin_unlock_bh(&calipso_cache[iter].lock); + } +} + +/** + * calipso_cache_check - Check the CALIPSO cache for a label mapping + * @key: the buffer to check + * @key_len: buffer length in bytes + * @secattr: the security attribute struct to use + * + * Description: + * This function checks the cache to see if a label mapping already exists for + * the given key. If there is a match then the cache is adjusted and the + * @secattr struct is populated with the correct LSM security attributes. The + * cache is adjusted in the following manner if the entry is not already the + * first in the cache bucket: + * + * 1. The cache entry's activity counter is incremented + * 2. The previous (higher ranking) entry's activity counter is decremented + * 3. If the difference between the two activity counters is geater than + * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped + * + * Returns zero on success, -ENOENT for a cache miss, and other negative values + * on error. 
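For illustration only, not part of the patch: a minimal sketch of the reordering rule described above. After the counters are adjusted, the matched entry is swapped ahead of its predecessor only once its activity count has pulled ahead by more than CALIPSO_CACHE_REORDERLIMIT:

#include <assert.h>
#include <stdbool.h>

#define CALIPSO_CACHE_REORDERLIMIT 10

/* Mirrors the comparison made after the hit entry gained one activity
 * point and its predecessor (if warmer than zero) lost one.
 */
static bool should_swap(unsigned int entry_activity, unsigned int prev_activity)
{
	return entry_activity > prev_activity &&
	       entry_activity - prev_activity > CALIPSO_CACHE_REORDERLIMIT;
}

int main(void)
{
	assert(!should_swap(20, 10));	/* difference of exactly 10: keep order */
	assert(should_swap(21, 10));	/* difference of 11: bubble past the predecessor */
	assert(!should_swap(5, 9));	/* colder than the predecessor: keep order */
	return 0;
}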
+ * + */ +static int calipso_cache_check(const unsigned char *key, + u32 key_len, + struct netlbl_lsm_secattr *secattr) +{ + u32 bkt; + struct calipso_map_cache_entry *entry; + struct calipso_map_cache_entry *prev_entry = NULL; + u32 hash; + + if (!calipso_cache_enabled) + return -ENOENT; + + hash = calipso_map_cache_hash(key, key_len); + bkt = hash & (CALIPSO_CACHE_BUCKETS - 1); + spin_lock_bh(&calipso_cache[bkt].lock); + list_for_each_entry(entry, &calipso_cache[bkt].list, list) { + if (entry->hash == hash && + entry->key_len == key_len && + memcmp(entry->key, key, key_len) == 0) { + entry->activity += 1; + atomic_inc(&entry->lsm_data->refcount); + secattr->cache = entry->lsm_data; + secattr->flags |= NETLBL_SECATTR_CACHE; + secattr->type = NETLBL_NLTYPE_CALIPSO; + if (!prev_entry) { + spin_unlock_bh(&calipso_cache[bkt].lock); + return 0; + } + + if (prev_entry->activity > 0) + prev_entry->activity -= 1; + if (entry->activity > prev_entry->activity && + entry->activity - prev_entry->activity > + CALIPSO_CACHE_REORDERLIMIT) { + __list_del(entry->list.prev, entry->list.next); + __list_add(&entry->list, + prev_entry->list.prev, + &prev_entry->list); + } + + spin_unlock_bh(&calipso_cache[bkt].lock); + return 0; + } + prev_entry = entry; + } + spin_unlock_bh(&calipso_cache[bkt].lock); + + return -ENOENT; +} + +/** + * calipso_cache_add - Add an entry to the CALIPSO cache + * @calipso_ptr: the CALIPSO option + * @secattr: the packet's security attributes + * + * Description: + * Add a new entry into the CALIPSO label mapping cache. Add the new entry to + * head of the cache bucket's list, if the cache bucket is out of room remove + * the last entry in the list first. It is important to note that there is + * currently no checking for duplicate keys. Returns zero on success, + * negative values on failure. The key stored starts at calipso_ptr + 2, + * i.e. the type and length bytes are not stored, this corresponds to + * calipso_ptr[1] bytes of data. 
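For illustration only, not part of the patch: a small sketch of the key layout assumed above. The two TLV header bytes are skipped, so the stored key starts at the DOI and runs for calipso_ptr[1] bytes (the option values below are made up and the CRC is left at zero):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	/* type, length, then DOI (4), compartment length, level, CRC-16 */
	const unsigned char opt[] = {
		0x07, 0x08,			/* IPV6_TLV_CALIPSO, option data length 8 */
		0x00, 0x00, 0x00, 0x01,		/* DOI 1 */
		0x00,				/* compartment length (32-bit words) */
		0x2a,				/* sensitivity level */
		0x00, 0x00			/* CRC-16, left zero here */
	};
	const unsigned char *key = opt + 2;	/* TLV header is not part of the key */
	size_t key_len = opt[1];

	assert(key_len == 8);
	assert(key_len == sizeof(opt) - 2);
	printf("cache key is %zu bytes starting at the DOI (first byte 0x%02x)\n",
	       key_len, key[0]);
	return 0;
}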
+ * + */ +static int calipso_cache_add(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -EPERM; + u32 bkt; + struct calipso_map_cache_entry *entry = NULL; + struct calipso_map_cache_entry *old_entry = NULL; + u32 calipso_ptr_len; + + if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0) + return 0; + + calipso_ptr_len = calipso_ptr[1]; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC); + if (!entry->key) { + ret_val = -ENOMEM; + goto cache_add_failure; + } + entry->key_len = calipso_ptr_len; + entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len); + atomic_inc(&secattr->cache->refcount); + entry->lsm_data = secattr->cache; + + bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1); + spin_lock_bh(&calipso_cache[bkt].lock); + if (calipso_cache[bkt].size < calipso_cache_bucketsize) { + list_add(&entry->list, &calipso_cache[bkt].list); + calipso_cache[bkt].size += 1; + } else { + old_entry = list_entry(calipso_cache[bkt].list.prev, + struct calipso_map_cache_entry, list); + list_del(&old_entry->list); + list_add(&entry->list, &calipso_cache[bkt].list); + calipso_cache_entry_free(old_entry); + } + spin_unlock_bh(&calipso_cache[bkt].lock); + + return 0; + +cache_add_failure: + if (entry) + calipso_cache_entry_free(entry); + return ret_val; +} + +/* DOI List Functions + */ + +/** + * calipso_doi_search - Searches for a DOI definition + * @doi: the DOI to search for + * + * Description: + * Search the DOI definition list for a DOI definition with a DOI value that + * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). + * Returns a pointer to the DOI definition on success and NULL on failure. + */ +static struct calipso_doi *calipso_doi_search(u32 doi) +{ + struct calipso_doi *iter; + + list_for_each_entry_rcu(iter, &calipso_doi_list, list) + if (iter->doi == doi && atomic_read(&iter->refcount)) + return iter; + return NULL; +} + +/** + * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine + * @doi_def: the DOI structure + * @audit_info: NetLabel audit information + * + * Description: + * The caller defines a new DOI for use by the CALIPSO engine and calls this + * function to add it to the list of acceptable domains. The caller must + * ensure that the mapping table specified in @doi_def->map meets all of the + * requirements of the mapping type (see calipso.h for details). Returns + * zero on success and non-zero on failure. + * + */ +static int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ + int ret_val = -EINVAL; + u32 doi; + u32 doi_type; + struct audit_buffer *audit_buf; + + doi = doi_def->doi; + doi_type = doi_def->type; + + if (doi_def->doi == CALIPSO_DOI_UNKNOWN) + goto doi_add_return; + + atomic_set(&doi_def->refcount, 1); + + spin_lock(&calipso_doi_list_lock); + if (calipso_doi_search(doi_def->doi)) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -EEXIST; + goto doi_add_return; + } + list_add_tail_rcu(&doi_def->list, &calipso_doi_list); + spin_unlock(&calipso_doi_list_lock); + ret_val = 0; + +doi_add_return: + audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info); + if (audit_buf) { + const char *type_str; + + switch (doi_type) { + case CALIPSO_MAP_PASS: + type_str = "pass"; + break; + default: + type_str = "(unknown)"; + } + audit_log_format(audit_buf, + " calipso_doi=%u calipso_type=%s res=%u", + doi, type_str, ret_val == 0 ? 
1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + +/** + * calipso_doi_free - Frees a DOI definition + * @doi_def: the DOI definition + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +static void calipso_doi_free(struct calipso_doi *doi_def) +{ + kfree(doi_def); +} + +/** + * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer + * @entry: the entry's RCU field + * + * Description: + * This function is designed to be used as a callback to the call_rcu() + * function so that the memory allocated to the DOI definition can be released + * safely. + * + */ +static void calipso_doi_free_rcu(struct rcu_head *entry) +{ + struct calipso_doi *doi_def; + + doi_def = container_of(entry, struct calipso_doi, rcu); + calipso_doi_free(doi_def); +} + +/** + * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine + * @doi: the DOI value + * @audit_secid: the LSM secid to use in the audit message + * + * Description: + * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. + * + */ +static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) +{ + int ret_val; + struct calipso_doi *doi_def; + struct audit_buffer *audit_buf; + + spin_lock(&calipso_doi_list_lock); + doi_def = calipso_doi_search(doi); + if (!doi_def) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -ENOENT; + goto doi_remove_return; + } + if (!atomic_dec_and_test(&doi_def->refcount)) { + spin_unlock(&calipso_doi_list_lock); + ret_val = -EBUSY; + goto doi_remove_return; + } + list_del_rcu(&doi_def->list); + spin_unlock(&calipso_doi_list_lock); + + call_rcu(&doi_def->rcu, calipso_doi_free_rcu); + ret_val = 0; + +doi_remove_return: + audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info); + if (audit_buf) { + audit_log_format(audit_buf, + " calipso_doi=%u res=%u", + doi, ret_val == 0 ? 1 : 0); + audit_log_end(audit_buf); + } + + return ret_val; +} + +/** + * calipso_doi_getdef - Returns a reference to a valid DOI definition + * @doi: the DOI value + * + * Description: + * Searches for a valid DOI definition and if one is found it is returned to + * the caller. Otherwise NULL is returned. The caller must ensure that + * calipso_doi_putdef() is called when the caller is done. + * + */ +static struct calipso_doi *calipso_doi_getdef(u32 doi) +{ + struct calipso_doi *doi_def; + + rcu_read_lock(); + doi_def = calipso_doi_search(doi); + if (!doi_def) + goto doi_getdef_return; + if (!atomic_inc_not_zero(&doi_def->refcount)) + doi_def = NULL; + +doi_getdef_return: + rcu_read_unlock(); + return doi_def; +} + +/** + * calipso_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from calipso_doi_getdef(). 
+ * + */ +static void calipso_doi_putdef(struct calipso_doi *doi_def) +{ + if (!doi_def) + return; + + if (!atomic_dec_and_test(&doi_def->refcount)) + return; + spin_lock(&calipso_doi_list_lock); + list_del_rcu(&doi_def->list); + spin_unlock(&calipso_doi_list_lock); + + call_rcu(&doi_def->rcu, calipso_doi_free_rcu); +} + +/** + * calipso_doi_walk - Iterate through the DOI definitions + * @skip_cnt: skip past this number of DOI definitions, updated + * @callback: callback for each DOI definition + * @cb_arg: argument for the callback function + * + * Description: + * Iterate over the DOI definition list, skipping the first @skip_cnt entries. + * For each entry call @callback, if @callback returns a negative value stop + * 'walking' through the list and return. Updates the value in @skip_cnt upon + * return. Returns zero on success, negative values on failure. + * + */ +static int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, + void *arg), + void *cb_arg) +{ + int ret_val = -ENOENT; + u32 doi_cnt = 0; + struct calipso_doi *iter_doi; + + rcu_read_lock(); + list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list) + if (atomic_read(&iter_doi->refcount) > 0) { + if (doi_cnt++ < *skip_cnt) + continue; + ret_val = callback(iter_doi, cb_arg); + if (ret_val < 0) { + doi_cnt--; + goto doi_walk_return; + } + } + +doi_walk_return: + rcu_read_unlock(); + *skip_cnt = doi_cnt; + return ret_val; +} + +/** + * calipso_validate - Validate a CALIPSO option + * @skb: the packet + * @option: the start of the option + * + * Description: + * This routine is called to validate a CALIPSO option. + * If the option is valid then %true is returned, otherwise + * %false is returned. + * + * The caller should have already checked that the length of the + * option (including the TLV header) is >= 10 and that the catmap + * length is consistent with the option length. + * + * We leave checks on the level and categories to the socket layer. + */ +bool calipso_validate(const struct sk_buff *skb, const unsigned char *option) +{ + struct calipso_doi *doi_def; + bool ret_val; + u16 crc, len = option[1] + 2; + static const u8 zero[2]; + + /* The original CRC runs over the option including the TLV header + * with the CRC-16 field (at offset 8) zeroed out. */ + crc = crc_ccitt(0xffff, option, 8); + crc = crc_ccitt(crc, zero, sizeof(zero)); + if (len > 10) + crc = crc_ccitt(crc, option + 10, len - 10); + crc = ~crc; + if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff)) + return false; + + rcu_read_lock(); + doi_def = calipso_doi_search(get_unaligned_be32(option + 2)); + ret_val = !!doi_def; + rcu_read_unlock(); + + return ret_val; +} + +/** + * calipso_map_cat_hton - Perform a category mapping from host to network + * @doi_def: the DOI definition + * @secattr: the security attributes + * @net_cat: the zero'd out category bitmap in network/CALIPSO format + * @net_cat_len: the length of the CALIPSO bitmap in bytes + * + * Description: + * Perform a label mapping to translate a local MLS category bitmap to the + * correct CALIPSO bitmap using the given DOI definition. Returns the minimum + * size in bytes of the network bitmap on success, negative values otherwise. 
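For illustration only, not part of the patch: the host-to-network mapping reports its length rounded up to whole 32-bit words, i.e. (highest set bit / 32 + 1) * 4 bytes, which is what feeds the option's compartment-length field. A tiny standalone check of that rounding:

#include <assert.h>
#include <stdio.h>

/* Bytes needed for a bitmap whose highest set bit is 'highest_spot',
 * rounded up to a whole number of 32-bit words.
 */
static unsigned int cat_bitmap_bytes(unsigned int highest_spot)
{
	return (highest_spot / 32 + 1) * 4;
}

int main(void)
{
	assert(cat_bitmap_bytes(0) == 4);	/* category 0 still needs one word */
	assert(cat_bitmap_bytes(31) == 4);	/* fits exactly in one word */
	assert(cat_bitmap_bytes(32) == 8);	/* first bit of the second word */
	assert(cat_bitmap_bytes(70) == 12);	/* three words */
	printf("compartment lengths round to whole 32-bit words\n");
	return 0;
}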
+ * + */ +static int calipso_map_cat_hton(const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr, + unsigned char *net_cat, + u32 net_cat_len) +{ + int spot = -1; + u32 net_spot_max = 0; + u32 net_clen_bits = net_cat_len * 8; + + for (;;) { + spot = netlbl_catmap_walk(secattr->attr.mls.cat, + spot + 1); + if (spot < 0) + break; + if (spot >= net_clen_bits) + return -ENOSPC; + netlbl_bitmap_setbit(net_cat, spot, 1); + + if (spot > net_spot_max) + net_spot_max = spot; + } + + return (net_spot_max / 32 + 1) * 4; +} + +/** + * calipso_map_cat_ntoh - Perform a category mapping from network to host + * @doi_def: the DOI definition + * @net_cat: the category bitmap in network/CALIPSO format + * @net_cat_len: the length of the CALIPSO bitmap in bytes + * @secattr: the security attributes + * + * Description: + * Perform a label mapping to translate a CALIPSO bitmap to the correct local + * MLS category bitmap using the given DOI definition. Returns zero on + * success, negative values on failure. + * + */ +static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, + const unsigned char *net_cat, + u32 net_cat_len, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + int spot = -1; + u32 net_clen_bits = net_cat_len * 8; + + for (;;) { + spot = netlbl_bitmap_walk(net_cat, + net_clen_bits, + spot + 1, + 1); + if (spot < 0) { + if (spot == -2) + return -EFAULT; + return 0; + } + + ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, + spot, + GFP_ATOMIC); + if (ret_val != 0) + return ret_val; + } + + return -EINVAL; +} + +/** + * calipso_pad_write - Writes pad bytes in TLV format + * @buf: the buffer + * @offset: offset from start of buffer to write padding + * @count: number of pad bytes to write + * + * Description: + * Write @count bytes of TLV padding into @buffer starting at offset @offset. + * @count should be less than 8 - see RFC 4942. + * + */ +static int calipso_pad_write(unsigned char *buf, unsigned int offset, + unsigned int count) +{ + if (WARN_ON_ONCE(count >= 8)) + return -EINVAL; + + switch (count) { + case 0: + break; + case 1: + buf[offset] = IPV6_TLV_PAD1; + break; + default: + buf[offset] = IPV6_TLV_PADN; + buf[offset + 1] = count - 2; + if (count > 2) + memset(buf + offset + 2, 0, count - 2); + break; + } + return 0; +} + +/** + * calipso_genopt - Generate a CALIPSO option + * @buf: the option buffer + * @start: offset from which to write + * @buf_len: the size of opt_buf + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Generate a CALIPSO option using the DOI definition and security attributes + * passed to the function. This also generates upto three bytes of leading + * padding that ensures that the option is 4n + 2 aligned. It returns the + * number of bytes written (including any initial padding). 
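For illustration only, not part of the patch: a CALIPSO option must start on a 4n + 2 boundary, so the number of leading pad bytes depends only on the low two bits of the write offset; that is the padding[] = {2, 1, 0, 3} lookup used by the generator. A quick userspace check of the invariant:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	static const unsigned char padding[4] = {2, 1, 0, 3};
	unsigned int start;

	for (start = 0; start < 8; start++) {
		unsigned int pad = padding[start & 3];

		/* after the pad bytes, the option type byte sits at 4n + 2 */
		assert((start + pad) % 4 == 2);
		printf("start %u -> %u leading pad byte(s)\n", start, pad);
	}
	return 0;
}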
+ */ +static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + u32 len, pad; + u16 crc; + static const unsigned char padding[4] = {2, 1, 0, 3}; + unsigned char *calipso; + + /* CALIPSO has 4n + 2 alignment */ + pad = padding[start & 3]; + if (buf_len <= start + pad + CALIPSO_HDR_LEN) + return -ENOSPC; + + if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) + return -EPERM; + + len = CALIPSO_HDR_LEN; + + if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { + ret_val = calipso_map_cat_hton(doi_def, + secattr, + buf + start + pad + len, + buf_len - start - pad - len); + if (ret_val < 0) + return ret_val; + len += ret_val; + } + + calipso_pad_write(buf, start, pad); + calipso = buf + start + pad; + + calipso[0] = IPV6_TLV_CALIPSO; + calipso[1] = len - 2; + *(__be32 *)(calipso + 2) = htonl(doi_def->doi); + calipso[6] = (len - CALIPSO_HDR_LEN) / 4; + calipso[7] = secattr->attr.mls.lvl, + crc = ~crc_ccitt(0xffff, calipso, len); + calipso[8] = crc & 0xff; + calipso[9] = (crc >> 8) & 0xff; + return pad + len; +} + +/* Hop-by-hop hdr helper functions + */ + +/** + * calipso_opt_update - Replaces socket's hop options with a new set + * @sk: the socket + * @hop: new hop options + * + * Description: + * Replaces @sk's hop options with @hop. @hop may be NULL to leave + * the socket with no hop options. + * + */ +static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) +{ + struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; + + txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS, + hop, hop ? ipv6_optlen(hop) : 0); + txopt_put(old); + if (IS_ERR(txopts)) + return PTR_ERR(txopts); + + txopts = ipv6_update_options(sk, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + + return 0; +} + +/** + * calipso_tlv_len - Returns the length of the TLV + * @opt: the option header + * @offset: offset of the TLV within the header + * + * Description: + * Returns the length of the TLV option at offset @offset within + * the option header @opt. Checks that the entire TLV fits inside + * the option header, returns a negative value if this is not the case. + */ +static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset) +{ + unsigned char *tlv = (unsigned char *)opt; + unsigned int opt_len = ipv6_optlen(opt), tlv_len; + + if (offset < sizeof(*opt) || offset >= opt_len) + return -EINVAL; + if (tlv[offset] == IPV6_TLV_PAD1) + return 1; + if (offset + 1 >= opt_len) + return -EINVAL; + tlv_len = tlv[offset + 1] + 2; + if (offset + tlv_len > opt_len) + return -EINVAL; + return tlv_len; +} + +/** + * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header + * @hop: the hop options header + * @start: on return holds the offset of any leading padding + * @end: on return holds the offset of the first non-pad TLV after CALIPSO + * + * Description: + * Finds the space occupied by a CALIPSO option (including any leading and + * trailing padding). + * + * If a CALIPSO option exists set @start and @end to the + * offsets within @hop of the start of padding before the first + * CALIPSO option and the end of padding after the first CALIPSO + * option. In this case the function returns 0. + * + * In the absence of a CALIPSO option, @start and @end will be + * set to the start and end of any trailing padding in the header. 
+ * This is useful when appending a new option, as the caller may want + * to overwrite some of this padding. In this case the function will + * return -ENOENT. + */ +static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start, + unsigned int *end) +{ + int ret_val = -ENOENT, tlv_len; + unsigned int opt_len, offset, offset_s = 0, offset_e = 0; + unsigned char *opt = (unsigned char *)hop; + + opt_len = ipv6_optlen(hop); + offset = sizeof(*hop); + + while (offset < opt_len) { + tlv_len = calipso_tlv_len(hop, offset); + if (tlv_len < 0) + return tlv_len; + + switch (opt[offset]) { + case IPV6_TLV_PAD1: + case IPV6_TLV_PADN: + if (offset_e) + offset_e = offset; + break; + case IPV6_TLV_CALIPSO: + ret_val = 0; + offset_e = offset; + break; + default: + if (offset_e == 0) + offset_s = offset; + else + goto out; + } + offset += tlv_len; + } + +out: + if (offset_s) + *start = offset_s + calipso_tlv_len(hop, offset_s); + else + *start = sizeof(*hop); + if (offset_e) + *end = offset_e + calipso_tlv_len(hop, offset_e); + else + *end = opt_len; + + return ret_val; +} + +/** + * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr + * @hop: the original hop options header + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Creates a new hop options header based on @hop with a + * CALIPSO option added to it. If @hop already contains a CALIPSO + * option this is overwritten, otherwise the new option is appended + * after any existing options. If @hop is NULL then the new header + * will contain just the CALIPSO option and any needed padding. + * + */ +static struct ipv6_opt_hdr * +calipso_opt_insert(struct ipv6_opt_hdr *hop, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + unsigned int start, end, buf_len, pad, hop_len; + struct ipv6_opt_hdr *new; + int ret_val; + + if (hop) { + hop_len = ipv6_optlen(hop); + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val && ret_val != -ENOENT) + return ERR_PTR(ret_val); + } else { + hop_len = 0; + start = sizeof(*hop); + end = 0; + } + + buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD; + new = kzalloc(buf_len, GFP_ATOMIC); + if (!new) + return ERR_PTR(-ENOMEM); + + if (start > sizeof(*hop)) + memcpy(new, hop, start); + ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def, + secattr); + if (ret_val < 0) + return ERR_PTR(ret_val); + + buf_len = start + ret_val; + /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */ + pad = ((buf_len & 4) + (end & 7)) & 7; + calipso_pad_write((unsigned char *)new, buf_len, pad); + buf_len += pad; + + if (end != hop_len) { + memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end); + buf_len += hop_len - end; + } + new->nexthdr = 0; + new->hdrlen = buf_len / 8 - 1; + + return new; +} + +/** + * calipso_opt_del - Removes the CALIPSO option from an option header + * @hop: the original header + * @new: the new header + * + * Description: + * Creates a new header based on @hop without any CALIPSO option. If @hop + * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains + * no other non-padding options, it returns zero with @new set to NULL. + * Otherwise it returns zero, creates a new header without the CALIPSO + * option (and removing as much padding as possible) and returns with + * @new set to that header. 
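For illustration only, not part of the patch: when the option is deleted, the hop-by-hop header can only shrink by whole 8-byte units, so the freed span is split into a multiple of 8 that is dropped (delta) and a remainder that is rewritten as padding (pad). A small sketch of that split:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int span;	/* bytes freed by deleting the option plus its padding */

	for (span = 0; span < 32; span++) {
		unsigned int delta = span & ~7u;	/* removed from the header */
		unsigned int pad = span & 7u;		/* rewritten as Pad1/PadN */

		assert(delta + pad == span);
		assert(delta % 8 == 0);	/* hdrlen shrinks only by whole 8-byte units */
		assert(pad < 8);
	}
	printf("every freed span splits into 8-byte units plus fewer than 8 pad bytes\n");
	return 0;
}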
+ * + */ +static int calipso_opt_del(struct ipv6_opt_hdr *hop, + struct ipv6_opt_hdr **new) +{ + int ret_val; + unsigned int start, end, delta, pad, hop_len; + + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val) + return ret_val; + + hop_len = ipv6_optlen(hop); + if (start == sizeof(*hop) && end == hop_len) { + /* There's no other option in the header so return NULL */ + *new = NULL; + return 0; + } + + delta = (end - start) & ~7; + *new = kzalloc(hop_len - delta, GFP_ATOMIC); + if (!*new) + return -ENOMEM; + + memcpy(*new, hop, start); + (*new)->hdrlen -= delta / 8; + pad = (end - start) & 7; + calipso_pad_write((unsigned char *)*new, start, pad); + if (end != hop_len) + memcpy((char *)*new + start + pad, (char *)hop + end, + hop_len - end); + + return 0; +} + +/** + * calipso_opt_getattr - Get the security attributes from a memory block + * @calipso: the CALIPSO option + * @secattr: the security attributes + * + * Description: + * Inspect @calipso and return the security attributes in @secattr. + * Returns zero on success and negative values on failure. + * + */ +static int calipso_opt_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + u32 doi, len = calipso[1], cat_len = calipso[6] * 4; + struct calipso_doi *doi_def; + + if (cat_len + 8 > len) + return -EINVAL; + + if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0) + return 0; + + doi = get_unaligned_be32(calipso + 2); + rcu_read_lock(); + doi_def = calipso_doi_search(doi); + if (!doi_def) + goto getattr_return; + + secattr->attr.mls.lvl = calipso[7]; + secattr->flags |= NETLBL_SECATTR_MLS_LVL; + + if (cat_len) { + ret_val = calipso_map_cat_ntoh(doi_def, + calipso + 10, + cat_len, + secattr); + if (ret_val != 0) { + netlbl_catmap_free(secattr->attr.mls.cat); + goto getattr_return; + } + + secattr->flags |= NETLBL_SECATTR_MLS_CAT; + } + + secattr->type = NETLBL_NLTYPE_CALIPSO; + +getattr_return: + rcu_read_unlock(); + return ret_val; +} + +/* sock functions. + */ + +/** + * calipso_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CALIPSO option attached to the sock and if + * there is return the CALIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself. Returns zero on success and negative values on failure. 
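For illustration only, not part of the patch: the socket and header walks above rely on the TLV length rule that a Pad1 TLV is a single byte while every other TLV occupies its data length plus the two header bytes. A standalone version of that rule with the bounds checks reduced to the essentials (the sample non-pad TLV type below is made up):

#include <assert.h>
#include <stddef.h>

#define IPV6_TLV_PAD1 0
#define IPV6_TLV_PADN 1

/* Length in bytes of the TLV starting at tlv[offset], or -1 if it would
 * run past opt_len.
 */
static int tlv_len(const unsigned char *tlv, size_t opt_len, size_t offset)
{
	size_t len;

	if (offset >= opt_len)
		return -1;
	if (tlv[offset] == IPV6_TLV_PAD1)
		return 1;			/* Pad1 has no length byte */
	if (offset + 1 >= opt_len)
		return -1;
	len = tlv[offset + 1] + 2;		/* data length + type + length bytes */
	if (offset + len > opt_len)
		return -1;
	return (int)len;
}

int main(void)
{
	const unsigned char body[] = {
		IPV6_TLV_PAD1,				/* 1 byte */
		IPV6_TLV_PADN, 0x02, 0x00, 0x00,	/* 4 bytes */
		0x1e, 0x01, 0x00			/* made-up TLV with 1 data byte */
	};

	assert(tlv_len(body, sizeof(body), 0) == 1);
	assert(tlv_len(body, sizeof(body), 1) == 4);
	assert(tlv_len(body, sizeof(body), 5) == 3);
	return 0;
}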
+ * + */ +static int calipso_sock_getattr(struct sock *sk, + struct netlbl_lsm_secattr *secattr) +{ + struct ipv6_opt_hdr *hop; + int opt_len, len, ret_val = -ENOMSG, offset; + unsigned char *opt; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + if (!txopts || !txopts->hopopt) + goto done; + + hop = txopts->hopopt; + opt = (unsigned char *)hop; + opt_len = ipv6_optlen(hop); + offset = sizeof(*hop); + while (offset < opt_len) { + len = calipso_tlv_len(hop, offset); + if (len < 0) { + ret_val = len; + goto done; + } + switch (opt[offset]) { + case IPV6_TLV_CALIPSO: + if (len < CALIPSO_HDR_LEN) + ret_val = -EINVAL; + else + ret_val = calipso_opt_getattr(&opt[offset], + secattr); + goto done; + default: + offset += len; + break; + } + } +done: + txopt_put(txopts); + return ret_val; +} + +/** + * calipso_sock_setattr - Add a CALIPSO option to a socket + * @sk: the socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +static int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct ipv6_opt_hdr *old, *new; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + old = NULL; + if (txopts) + old = txopts->hopopt; + + new = calipso_opt_insert(old, doi_def, secattr); + txopt_put(txopts); + if (IS_ERR(new)) + return PTR_ERR(new); + + ret_val = calipso_opt_update(sk, new); + + kfree(new); + return ret_val; +} + +/** + * calipso_sock_delattr - Delete the CALIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CALIPSO option from a socket, if present. + * + */ +static void calipso_sock_delattr(struct sock *sk) +{ + struct ipv6_opt_hdr *new_hop; + struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + + if (!txopts || !txopts->hopopt) + goto done; + + if (calipso_opt_del(txopts->hopopt, &new_hop)) + goto done; + + calipso_opt_update(sk, new_hop); + kfree(new_hop); + +done: + txopt_put(txopts); +} + +/* request sock functions. + */ + +/** + * calipso_req_setattr - Add a CALIPSO option to a connection request socket + * @req: the connection request socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. Returns zero on success and + * negative values on failure. + * + */ +static int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + struct ipv6_txoptions *txopts; + struct inet_request_sock *req_inet = inet_rsk(req); + struct ipv6_opt_hdr *old, *new; + struct sock *sk = sk_to_full_sk(req_to_sk(req)); + + if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) + old = req_inet->ipv6_opt->hopopt; + else + old = NULL; + + new = calipso_opt_insert(old, doi_def, secattr); + if (IS_ERR(new)) + return PTR_ERR(new); + + txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, + new, new ? 
ipv6_optlen(new) : 0); + + kfree(new); + + if (IS_ERR(txopts)) + return PTR_ERR(txopts); + + txopts = xchg(&req_inet->ipv6_opt, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + + return 0; +} + +/** + * calipso_req_delattr - Delete the CALIPSO option from a request socket + * @reg: the request socket + * + * Description: + * Removes the CALIPSO option from a request socket, if present. + * + */ +static void calipso_req_delattr(struct request_sock *req) +{ + struct inet_request_sock *req_inet = inet_rsk(req); + struct ipv6_opt_hdr *new; + struct ipv6_txoptions *txopts; + struct sock *sk = sk_to_full_sk(req_to_sk(req)); + + if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) + return; + + if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) + return; /* Nothing to do */ + + txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, + new, new ? ipv6_optlen(new) : 0); + + if (!IS_ERR(txopts)) { + txopts = xchg(&req_inet->ipv6_opt, txopts); + if (txopts) { + atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + txopt_put(txopts); + } + } + kfree(new); +} + +/* skbuff functions. + */ + +/** + * calipso_skbuff_optptr - Find the CALIPSO option in the packet + * @skb: the packet + * + * Description: + * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer + * to the start of the CALIPSO option on success, NULL if one if not found. + * + */ +static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb); + int offset; + + if (ip6_hdr->nexthdr != NEXTHDR_HOP) + return NULL; + + offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO); + if (offset >= 0) + return (unsigned char *)ip6_hdr + offset; + + return NULL; +} + +/** + * calipso_skbuff_setattr - Set the CALIPSO option on a packet + * @skb: the packet + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Set the CALIPSO option on the given packet based on the security attributes. + * Returns a pointer to the IP header on success and NULL on failure. 
+ * + */ +static int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val; + struct ipv6hdr *ip6_hdr; + struct ipv6_opt_hdr *hop; + unsigned char buf[CALIPSO_MAX_BUFFER]; + int len_delta, new_end, pad; + unsigned int start, end; + + ip6_hdr = ipv6_hdr(skb); + if (ip6_hdr->nexthdr == NEXTHDR_HOP) { + hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + ret_val = calipso_opt_find(hop, &start, &end); + if (ret_val && ret_val != -ENOENT) + return ret_val; + } else { + start = 0; + end = 0; + } + + memset(buf, 0, sizeof(buf)); + ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr); + if (ret_val < 0) + return ret_val; + + new_end = start + ret_val; + /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */ + pad = ((new_end & 4) + (end & 7)) & 7; + len_delta = new_end - (int)end + pad; + ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + if (ret_val < 0) + return ret_val; + + if (len_delta) { + if (len_delta > 0) + skb_push(skb, len_delta); + else + skb_pull(skb, -len_delta); + memmove((char *)ip6_hdr - len_delta, ip6_hdr, + sizeof(*ip6_hdr) + start); + skb_reset_network_header(skb); + ip6_hdr = ipv6_hdr(skb); + } + + hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + if (start == 0) { + struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf; + + new_hop->nexthdr = ip6_hdr->nexthdr; + new_hop->hdrlen = len_delta / 8 - 1; + ip6_hdr->nexthdr = NEXTHDR_HOP; + } else { + hop->hdrlen += len_delta / 8; + } + memcpy((char *)hop + start, buf + (start & 3), new_end - start); + calipso_pad_write((unsigned char *)hop, new_end, pad); + + return 0; +} + +/** + * calipso_skbuff_delattr - Delete any CALIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CALIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +static int calipso_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val; + struct ipv6hdr *ip6_hdr; + struct ipv6_opt_hdr *old_hop; + u32 old_hop_len, start = 0, end = 0, delta, size, pad; + + if (!calipso_skbuff_optptr(skb)) + return 0; + + /* since we are changing the packet we should make a copy */ + ret_val = skb_cow(skb, skb_headroom(skb)); + if (ret_val < 0) + return ret_val; + + ip6_hdr = ipv6_hdr(skb); + old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); + old_hop_len = ipv6_optlen(old_hop); + + ret_val = calipso_opt_find(old_hop, &start, &end); + if (ret_val) + return ret_val; + + if (start == sizeof(*old_hop) && end == old_hop_len) { + /* There's no other option in the header so we delete + * the whole thing. 
*/ + delta = old_hop_len; + size = sizeof(*ip6_hdr); + ip6_hdr->nexthdr = old_hop->nexthdr; + } else { + delta = (end - start) & ~7; + if (delta) + old_hop->hdrlen -= delta / 8; + pad = (end - start) & 7; + size = sizeof(*ip6_hdr) + start + pad; + calipso_pad_write((unsigned char *)old_hop, start, pad); + } + + if (delta) { + skb_pull(skb, delta); + memmove((char *)ip6_hdr + delta, ip6_hdr, size); + skb_reset_network_header(skb); + } + + return 0; +} + +static const struct netlbl_calipso_ops ops = { + .doi_add = calipso_doi_add, + .doi_free = calipso_doi_free, + .doi_remove = calipso_doi_remove, + .doi_getdef = calipso_doi_getdef, + .doi_putdef = calipso_doi_putdef, + .doi_walk = calipso_doi_walk, + .sock_getattr = calipso_sock_getattr, + .sock_setattr = calipso_sock_setattr, + .sock_delattr = calipso_sock_delattr, + .req_setattr = calipso_req_setattr, + .req_delattr = calipso_req_delattr, + .opt_getattr = calipso_opt_getattr, + .skbuff_optptr = calipso_skbuff_optptr, + .skbuff_setattr = calipso_skbuff_setattr, + .skbuff_delattr = calipso_skbuff_delattr, + .cache_invalidate = calipso_cache_invalidate, + .cache_add = calipso_cache_add +}; + +/** + * calipso_init - Initialize the CALIPSO module + * + * Description: + * Initialize the CALIPSO module and prepare it for use. Returns zero on + * success and negative values on failure. + * + */ +int __init calipso_init(void) +{ + int ret_val; + + ret_val = calipso_cache_init(); + if (!ret_val) + netlbl_calipso_ops_register(&ops); + return ret_val; +} + +void calipso_exit(void) +{ + netlbl_calipso_ops_register(NULL); + calipso_cache_invalidate(); + kfree(calipso_cache); +} diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 8de5dd7aaa05..139ceb68bd37 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -43,6 +43,7 @@ #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> +#include <net/calipso.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/xfrm.h> #endif @@ -603,6 +604,28 @@ drop: return false; } +/* CALIPSO RFC 5570 */ + +static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff) +{ + const unsigned char *nh = skb_network_header(skb); + + if (nh[optoff + 1] < 8) + goto drop; + + if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1]) + goto drop; + + if (!calipso_validate(skb, nh + optoff)) + goto drop; + + return true; + +drop: + kfree_skb(skb); + return false; +} + static const struct tlvtype_proc tlvprochopopt_lst[] = { { .type = IPV6_TLV_ROUTERALERT, @@ -612,6 +635,10 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = { .type = IPV6_TLV_JUMBO, .func = ipv6_hop_jumbo, }, + { + .type = IPV6_TLV_CALIPSO, + .func = ipv6_hop_calipso, + }, { -1, } }; @@ -758,6 +785,27 @@ static int ipv6_renew_option(void *ohdr, return 0; } +/** + * ipv6_renew_options - replace a specific ext hdr with a new one. + * + * @sk: sock from which to allocate memory + * @opt: original options + * @newtype: option type to replace in @opt + * @newopt: new option of type @newtype to replace (user-mem) + * @newoptlen: length of @newopt + * + * Returns a new set of options which is a copy of @opt with the + * option type @newtype replaced with @newopt. + * + * @opt may be NULL, in which case a new set of options is returned + * containing just @newopt. + * + * @newopt may be NULL, in which case the specified option type is + * not copied into the new set of options. + * + * The new set of options is allocated from the socket option memory + * buffer of @sk. 
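+ * + * [Editorial illustration, not part of the original patch.] A sketch of how + * the kernel-memory wrapper ipv6_renew_options_kern(), documented below, + * might be called from process context, assuming @sk is locked and "hop" + * points to a kernel-memory hop-by-hop header; the request-socket variant + * of the same pattern appears in calipso_req_setattr() above: + * + * struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; + * + * txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS, hop, + * hop ? ipv6_optlen(hop) : 0); + * txopt_put(old); + * if (IS_ERR(txopts)) + * return PTR_ERR(txopts); + * txopts = ipv6_update_options(sk, txopts); + * if (txopts) { + * atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); + * txopt_put(txopts); + * }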
+ */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, @@ -830,6 +878,34 @@ out: return ERR_PTR(err); } +/** + * ipv6_renew_options_kern - replace a specific ext hdr with a new one. + * + * @sk: sock from which to allocate memory + * @opt: original options + * @newtype: option type to replace in @opt + * @newopt: new option of type @newtype to replace (kernel-mem) + * @newoptlen: length of @newopt + * + * See ipv6_renew_options(). The difference is that @newopt is + * kernel memory, rather than user memory. + */ +struct ipv6_txoptions * +ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt, + int newtype, struct ipv6_opt_hdr *newopt, + int newoptlen) +{ + struct ipv6_txoptions *ret_val; + const mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret_val = ipv6_renew_options(sk, opt, newtype, + (struct ipv6_opt_hdr __user *)newopt, + newoptlen); + set_fs(old_fs); + return ret_val; +} + struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 9508a20fbf61..305e2ed730bf 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -112,7 +112,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, } EXPORT_SYMBOL(ipv6_skip_exthdr); -int ipv6_find_tlv(struct sk_buff *skb, int offset, int type) +int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a9895e15ee9c..5330262ab673 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -98,7 +98,6 @@ int ip6_ra_control(struct sock *sk, int sel) return 0; } -static struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 45243bbe5253..69c50e737c54 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -15,6 +15,9 @@ #include <net/ipv6.h> #include <net/addrconf.h> #include <net/inet_frag.h> +#ifdef CONFIG_NETLABEL +#include <net/calipso.h> +#endif static int one = 1; static int auto_flowlabels_min; @@ -106,6 +109,22 @@ static struct ctl_table ipv6_rotable[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one }, +#ifdef CONFIG_NETLABEL + { + .procname = "calipso_cache_enable", + .data = &calipso_cache_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "calipso_cache_bucket_size", + .data = &calipso_cache_bucketsize, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NETLABEL */ { } }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 37cf91323319..33df8b8575cc 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -443,6 +443,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; @@ -463,8 +464,10 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); rcu_read_lock(); - err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), - np->tclass); + opt = ireq->ipv6_opt; + if (!opt) + opt = 
rcu_dereference(np->opt); + err = ip6_xmit(sk, skb, fl6, opt, np->tclass); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -476,6 +479,7 @@ done: static void tcp_v6_reqsk_destructor(struct request_sock *req) { + kfree(inet_rsk(req)->ipv6_opt); kfree_skb(inet_rsk(req)->pktopts); } @@ -1112,7 +1116,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * but we make one more one thing there: reattach optmem to newsk. */ - opt = rcu_dereference(np->opt); + opt = ireq->ipv6_opt; + if (!opt) + opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 37d674e6f8a9..02b45a8e8b35 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -22,6 +22,7 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/poll.h> +#include <linux/security.h> #include <net/sock.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> @@ -530,8 +531,10 @@ static void iucv_sock_close(struct sock *sk) static void iucv_sock_init(struct sock *sk, struct sock *parent) { - if (parent) + if (parent) { sk->sk_type = parent->sk_type; + security_sk_clone(parent, sk); + } } static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern) diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig index 56958c85f2b4..d9eaa30ffe3f 100644 --- a/net/netlabel/Kconfig +++ b/net/netlabel/Kconfig @@ -5,6 +5,7 @@ config NETLABEL bool "NetLabel subsystem support" depends on SECURITY + select CRC_CCITT if IPV6 default n ---help--- NetLabel provides support for explicit network packet labeling diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile index d2732fc952e2..d341ede0dca5 100644 --- a/net/netlabel/Makefile +++ b/net/netlabel/Makefile @@ -12,4 +12,4 @@ obj-y += netlabel_mgmt.o # protocol modules obj-y += netlabel_unlabeled.o obj-y += netlabel_cipso_v4.o - +obj-$(subst m,y,$(CONFIG_IPV6)) += netlabel_calipso.o diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c new file mode 100644 index 000000000000..2ec93c5e77bb --- /dev/null +++ b/net/netlabel/netlabel_calipso.c @@ -0,0 +1,740 @@ +/* + * NetLabel CALIPSO/IPv6 Support + * + * This file defines the CALIPSO/IPv6 functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and CALIPSO. + * + * Authors: Paul Moore <paul@paul-moore.com> + * Huw Davies <huw@codeweavers.com> + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ * + */ + +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/audit.h> +#include <linux/slab.h> +#include <net/sock.h> +#include <net/netlink.h> +#include <net/genetlink.h> +#include <net/netlabel.h> +#include <net/calipso.h> +#include <linux/atomic.h> + +#include "netlabel_user.h" +#include "netlabel_calipso.h" +#include "netlabel_mgmt.h" +#include "netlabel_domainhash.h" + +/* Argument struct for calipso_doi_walk() */ +struct netlbl_calipso_doiwalk_arg { + struct netlink_callback *nl_cb; + struct sk_buff *skb; + u32 seq; +}; + +/* Argument struct for netlbl_domhsh_walk() */ +struct netlbl_domhsh_walk_arg { + struct netlbl_audit *audit_info; + u32 doi; +}; + +/* NetLabel Generic NETLINK CALIPSO family */ +static struct genl_family netlbl_calipso_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = NETLBL_NLTYPE_CALIPSO_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_CALIPSO_A_MAX, +}; + +/* NetLabel Netlink attribute policy */ +static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { + [NLBL_CALIPSO_A_DOI] = { .type = NLA_U32 }, + [NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 }, +}; + +/* NetLabel Command Handlers + */ +/** + * netlbl_calipso_add_pass - Adds a CALIPSO pass DOI definition + * @info: the Generic NETLINK info block + * @audit_info: NetLabel audit information + * + * Description: + * Create a new CALIPSO_MAP_PASS DOI definition based on the given ADD message + * and add it to the CALIPSO engine. Return zero on success and non-zero on + * error. + * + */ +static int netlbl_calipso_add_pass(struct genl_info *info, + struct netlbl_audit *audit_info) +{ + int ret_val; + struct calipso_doi *doi_def = NULL; + + doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + if (!doi_def) + return -ENOMEM; + doi_def->type = CALIPSO_MAP_PASS; + doi_def->doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + ret_val = calipso_doi_add(doi_def, audit_info); + if (ret_val != 0) + calipso_doi_free(doi_def); + + return ret_val; +} + +/** + * netlbl_calipso_add - Handle an ADD message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Create a new DOI definition based on the given ADD message and add it to the + * CALIPSO engine. Returns zero on success, negative values on failure. + * + */ +static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) + +{ + int ret_val = -EINVAL; + struct netlbl_audit audit_info; + + if (!info->attrs[NLBL_CALIPSO_A_DOI] || + !info->attrs[NLBL_CALIPSO_A_MTYPE]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { + case CALIPSO_MAP_PASS: + ret_val = netlbl_calipso_add_pass(info, &audit_info); + break; + } + if (ret_val == 0) + atomic_inc(&netlabel_mgmt_protocount); + + return ret_val; +} + +/** + * netlbl_calipso_list - Handle a LIST message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated LIST message and respond accordingly. + * Returns zero on success and negative values on error. 
+ * + */ +static int netlbl_calipso_list(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val; + struct sk_buff *ans_skb = NULL; + void *data; + u32 doi; + struct calipso_doi *doi_def; + + if (!info->attrs[NLBL_CALIPSO_A_DOI]) { + ret_val = -EINVAL; + goto list_failure; + } + + doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + + doi_def = calipso_doi_getdef(doi); + if (!doi_def) { + ret_val = -EINVAL; + goto list_failure; + } + + ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ans_skb) { + ret_val = -ENOMEM; + goto list_failure_put; + } + data = genlmsg_put_reply(ans_skb, info, &netlbl_calipso_gnl_family, + 0, NLBL_CALIPSO_C_LIST); + if (!data) { + ret_val = -ENOMEM; + goto list_failure_put; + } + + ret_val = nla_put_u32(ans_skb, NLBL_CALIPSO_A_MTYPE, doi_def->type); + if (ret_val != 0) + goto list_failure_put; + + calipso_doi_putdef(doi_def); + + genlmsg_end(ans_skb, data); + return genlmsg_reply(ans_skb, info); + +list_failure_put: + calipso_doi_putdef(doi_def); +list_failure: + kfree_skb(ans_skb); + return ret_val; +} + +/** + * netlbl_calipso_listall_cb - calipso_doi_walk() callback for LISTALL + * @doi_def: the CALIPSO DOI definition + * @arg: the netlbl_calipso_doiwalk_arg structure + * + * Description: + * This function is designed to be used as a callback to the + * calipso_doi_walk() function for use in generating a response for a LISTALL + * message. Returns the size of the message on success, negative values on + * failure. + * + */ +static int netlbl_calipso_listall_cb(struct calipso_doi *doi_def, void *arg) +{ + int ret_val = -ENOMEM; + struct netlbl_calipso_doiwalk_arg *cb_arg = arg; + void *data; + + data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, + cb_arg->seq, &netlbl_calipso_gnl_family, + NLM_F_MULTI, NLBL_CALIPSO_C_LISTALL); + if (!data) + goto listall_cb_failure; + + ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_DOI, doi_def->doi); + if (ret_val != 0) + goto listall_cb_failure; + ret_val = nla_put_u32(cb_arg->skb, + NLBL_CALIPSO_A_MTYPE, + doi_def->type); + if (ret_val != 0) + goto listall_cb_failure; + + genlmsg_end(cb_arg->skb, data); + return 0; + +listall_cb_failure: + genlmsg_cancel(cb_arg->skb, data); + return ret_val; +} + +/** + * netlbl_calipso_listall - Handle a LISTALL message + * @skb: the NETLINK buffer + * @cb: the NETLINK callback + * + * Description: + * Process a user generated LISTALL message and respond accordingly. Returns + * zero on success and negative values on error. + * + */ +static int netlbl_calipso_listall(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct netlbl_calipso_doiwalk_arg cb_arg; + u32 doi_skip = cb->args[0]; + + cb_arg.nl_cb = cb; + cb_arg.skb = skb; + cb_arg.seq = cb->nlh->nlmsg_seq; + + calipso_doi_walk(&doi_skip, netlbl_calipso_listall_cb, &cb_arg); + + cb->args[0] = doi_skip; + return skb->len; +} + +/** + * netlbl_calipso_remove_cb - netlbl_calipso_remove() callback for REMOVE + * @entry: LSM domain mapping entry + * @arg: the netlbl_domhsh_walk_arg structure + * + * Description: + * This function is intended for use by netlbl_calipso_remove() as the callback + * for the netlbl_domhsh_walk() function; it removes LSM domain map entries + * which are associated with the CALIPSO DOI specified in @arg. Returns zero on + * success, negative values on failure. 
+ * + */ +static int netlbl_calipso_remove_cb(struct netlbl_dom_map *entry, void *arg) +{ + struct netlbl_domhsh_walk_arg *cb_arg = arg; + + if (entry->def.type == NETLBL_NLTYPE_CALIPSO && + entry->def.calipso->doi == cb_arg->doi) + return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); + + return 0; +} + +/** + * netlbl_calipso_remove - Handle a REMOVE message + * @skb: the NETLINK buffer + * @info: the Generic NETLINK info block + * + * Description: + * Process a user generated REMOVE message and respond accordingly. Returns + * zero on success, negative values on failure. + * + */ +static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) +{ + int ret_val = -EINVAL; + struct netlbl_domhsh_walk_arg cb_arg; + struct netlbl_audit audit_info; + u32 skip_bkt = 0; + u32 skip_chain = 0; + + if (!info->attrs[NLBL_CALIPSO_A_DOI]) + return -EINVAL; + + netlbl_netlink_auditinfo(skb, &audit_info); + cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); + cb_arg.audit_info = &audit_info; + ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, + netlbl_calipso_remove_cb, &cb_arg); + if (ret_val == 0 || ret_val == -ENOENT) { + ret_val = calipso_doi_remove(cb_arg.doi, &audit_info); + if (ret_val == 0) + atomic_dec(&netlabel_mgmt_protocount); + } + + return ret_val; +} + +/* NetLabel Generic NETLINK Command Definitions + */ + +static const struct genl_ops netlbl_calipso_ops[] = { + { + .cmd = NLBL_CALIPSO_C_ADD, + .flags = GENL_ADMIN_PERM, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_add, + .dumpit = NULL, + }, + { + .cmd = NLBL_CALIPSO_C_REMOVE, + .flags = GENL_ADMIN_PERM, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_remove, + .dumpit = NULL, + }, + { + .cmd = NLBL_CALIPSO_C_LIST, + .flags = 0, + .policy = calipso_genl_policy, + .doit = netlbl_calipso_list, + .dumpit = NULL, + }, + { + .cmd = NLBL_CALIPSO_C_LISTALL, + .flags = 0, + .policy = calipso_genl_policy, + .doit = NULL, + .dumpit = netlbl_calipso_listall, + }, +}; + +/* NetLabel Generic NETLINK Protocol Functions + */ + +/** + * netlbl_calipso_genl_init - Register the CALIPSO NetLabel component + * + * Description: + * Register the CALIPSO packet NetLabel component with the Generic NETLINK + * mechanism. Returns zero on success, negative values on failure. + * + */ +int __init netlbl_calipso_genl_init(void) +{ + return genl_register_family_with_ops(&netlbl_calipso_gnl_family, + netlbl_calipso_ops); +} + +static const struct netlbl_calipso_ops *calipso_ops; + +/** + * netlbl_calipso_ops_register - Register the CALIPSO operations + * + * Description: + * Register the CALIPSO packet engine operations. + * + */ +const struct netlbl_calipso_ops * +netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) +{ + return xchg(&calipso_ops, ops); +} +EXPORT_SYMBOL(netlbl_calipso_ops_register); + +static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) +{ + return ACCESS_ONCE(calipso_ops); +} + +/** + * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine + * @doi_def: the DOI structure + * @audit_info: NetLabel audit information + * + * Description: + * The caller defines a new DOI for use by the CALIPSO engine and calls this + * function to add it to the list of acceptable domains. The caller must + * ensure that the mapping table specified in @doi_def->map meets all of the + * requirements of the mapping type (see calipso.h for details). Returns + * zero on success and non-zero on failure. 
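+ * + * [Editorial illustration, not part of the original patch.] The simplest + * caller builds a pass-through DOI, mirroring netlbl_calipso_add_pass() + * above (the DOI value of 16 is purely hypothetical): + * + * struct calipso_doi *doi_def; + * + * doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); + * if (!doi_def) + * return -ENOMEM; + * doi_def->type = CALIPSO_MAP_PASS; + * doi_def->doi = 16; + * ret_val = calipso_doi_add(doi_def, audit_info); + * if (ret_val != 0) + * calipso_doi_free(doi_def);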
+ * + */ +int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_add(doi_def, audit_info); + return ret_val; +} + +/** + * calipso_doi_free - Frees a DOI definition + * @doi_def: the DOI definition + * + * Description: + * This function frees all of the memory associated with a DOI definition. + * + */ +void calipso_doi_free(struct calipso_doi *doi_def) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->doi_free(doi_def); +} + +/** + * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine + * @doi: the DOI value + * @audit_info: NetLabel audit information + * + * Description: + * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will + * be called to release their own LSM domain mappings as well as our own + * domain list. Returns zero on success and negative values on failure. + * + */ +int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_remove(doi, audit_info); + return ret_val; +} + +/** + * calipso_doi_getdef - Returns a reference to a valid DOI definition + * @doi: the DOI value + * + * Description: + * Searches for a valid DOI definition and if one is found it is returned to + * the caller. Otherwise NULL is returned. The caller must ensure that + * calipso_doi_putdef() is called when the caller is done. + * + */ +struct calipso_doi *calipso_doi_getdef(u32 doi) +{ + struct calipso_doi *ret_val = NULL; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_getdef(doi); + return ret_val; +} + +/** + * calipso_doi_putdef - Releases a reference for the given DOI definition + * @doi_def: the DOI definition + * + * Description: + * Releases a DOI definition reference obtained from calipso_doi_getdef(). + * + */ +void calipso_doi_putdef(struct calipso_doi *doi_def) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->doi_putdef(doi_def); +} + +/** + * calipso_doi_walk - Iterate through the DOI definitions + * @skip_cnt: skip past this number of DOI definitions, updated + * @callback: callback for each DOI definition + * @cb_arg: argument for the callback function + * + * Description: + * Iterate over the DOI definition list, skipping the first @skip_cnt entries. + * For each entry call @callback, if @callback returns a negative value stop + * 'walking' through the list and return. Updates the value in @skip_cnt upon + * return. Returns zero on success, negative values on failure. + * + */ +int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, void *arg), + void *cb_arg) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->doi_walk(skip_cnt, callback, cb_arg); + return ret_val; +} + +/** + * calipso_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CALIPSO option attached to the sock and if + * there is return the CALIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself.
Returns zero on success and negative values on failure. + * + */ +int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->sock_getattr(sk, secattr); + return ret_val; +} + +/** + * calipso_sock_setattr - Add a CALIPSO option to a socket + * @sk: the socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. This function requires + * exclusive access to @sk, which means it either needs to be in the + * process of being created or locked. Returns zero on success and negative + * values on failure. + * + */ +int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->sock_setattr(sk, doi_def, secattr); + return ret_val; +} + +/** + * calipso_sock_delattr - Delete the CALIPSO option from a socket + * @sk: the socket + * + * Description: + * Removes the CALIPSO option from a socket, if present. + * + */ +void calipso_sock_delattr(struct sock *sk) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->sock_delattr(sk); +} + +/** + * calipso_req_setattr - Add a CALIPSO option to a connection request socket + * @req: the connection request socket + * @doi_def: the CALIPSO DOI to use + * @secattr: the specific security attributes of the socket + * + * Description: + * Set the CALIPSO option on the given socket using the DOI definition and + * security attributes passed to the function. Returns zero on success and + * negative values on failure. + * + */ +int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->req_setattr(req, doi_def, secattr); + return ret_val; +} + +/** + * calipso_req_delattr - Delete the CALIPSO option from a request socket + * @req: the request socket + * + * Description: + * Removes the CALIPSO option from a request socket, if present. + * + */ +void calipso_req_delattr(struct request_sock *req) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->req_delattr(req); +} + +/** + * calipso_optptr - Find the CALIPSO option in the packet + * @skb: the packet + * + * Description: + * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer + * to the start of the CALIPSO option on success, NULL if one is not found. + * + */ +unsigned char *calipso_optptr(const struct sk_buff *skb) +{ + unsigned char *ret_val = NULL; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_optptr(skb); + return ret_val; +} + +/** + * calipso_getattr - Get the security attributes from a memory block. + * @calipso: the CALIPSO option + * @secattr: the security attributes + * + * Description: + * Inspect @calipso and return the security attributes in @secattr. + * Returns zero on success and negative values on failure.
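+ * + * [Editorial illustration, not part of the original patch.] Typical use + * pairs this with calipso_optptr() when labelling data from a received + * packet; on success @secattr holds the packet's CALIPSO attributes: + * + * unsigned char *ptr = calipso_optptr(skb); + * int ret_val = -ENOMSG; + * + * if (ptr) + * ret_val = calipso_getattr(ptr, secattr);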
+ * + */ +int calipso_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->opt_getattr(calipso, secattr); + return ret_val; +} + +/** + * calipso_skbuff_setattr - Set the CALIPSO option on a packet + * @skb: the packet + * @doi_def: the CALIPSO DOI to use + * @secattr: the security attributes + * + * Description: + * Set the CALIPSO option on the given packet based on the security attributes. + * Returns zero on success and negative values on failure. + * + */ +int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_setattr(skb, doi_def, secattr); + return ret_val; +} + +/** + * calipso_skbuff_delattr - Delete any CALIPSO options from a packet + * @skb: the packet + * + * Description: + * Removes any and all CALIPSO options from the given packet. Returns zero on + * success, negative values on failure. + * + */ +int calipso_skbuff_delattr(struct sk_buff *skb) +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->skbuff_delattr(skb); + return ret_val; +} + +/** + * calipso_cache_invalidate - Invalidates the current CALIPSO cache + * + * Description: + * Invalidates and frees any entries in the CALIPSO cache. + * + */ +void calipso_cache_invalidate(void) +{ + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ops->cache_invalidate(); +} + +/** + * calipso_cache_add - Add an entry to the CALIPSO cache + * @calipso_ptr: the CALIPSO option + * @secattr: the packet's security attributes + * + * Description: + * Add a new entry into the CALIPSO label mapping cache. + * Returns zero on success, negative values on failure. + * + */ +int calipso_cache_add(const unsigned char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr) + +{ + int ret_val = -ENOMSG; + const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); + + if (ops) + ret_val = ops->cache_add(calipso_ptr, secattr); + return ret_val; +} diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h new file mode 100644 index 000000000000..9fd291cd0fc5 --- /dev/null +++ b/net/netlabel/netlabel_calipso.h @@ -0,0 +1,151 @@ +/* + * NetLabel CALIPSO Support + * + * This file defines the CALIPSO functions for the NetLabel system. The + * NetLabel system manages static and dynamic label mappings for network + * protocols such as CIPSO and CALIPSO. + * + * Authors: Paul Moore <paul@paul-moore.com> + * Huw Davies <huw@codeweavers.com> + * + */ + +/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006 + * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#ifndef _NETLABEL_CALIPSO +#define _NETLABEL_CALIPSO + +#include <net/netlabel.h> +#include <net/calipso.h> + +/* The following NetLabel payloads are supported by the CALIPSO subsystem. + * + * o ADD: + * Sent by an application to add a new DOI mapping table. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * NLBL_CALIPSO_A_MTYPE + * + * If using CALIPSO_MAP_PASS no additional attributes are required. + * + * o REMOVE: + * Sent by an application to remove a specific DOI mapping table from the + * CALIPSO system. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * + * o LIST: + * Sent by an application to list the details of a DOI definition. On + * success the kernel should send a response using the following format. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * + * The valid response message format depends on the type of the DOI mapping, + * the defined formats are shown below. + * + * Required attributes: + * + * NLBL_CALIPSO_A_MTYPE + * + * If using CALIPSO_MAP_PASS no additional attributes are required. + * + * o LISTALL: + * This message is sent by an application to list the valid DOIs on the + * system. When sent by an application there is no payload and the + * NLM_F_DUMP flag should be set. The kernel should respond with a series of + * the following messages. + * + * Required attributes: + * + * NLBL_CALIPSO_A_DOI + * NLBL_CALIPSO_A_MTYPE + * + */ + +/* NetLabel CALIPSO commands */ +enum { + NLBL_CALIPSO_C_UNSPEC, + NLBL_CALIPSO_C_ADD, + NLBL_CALIPSO_C_REMOVE, + NLBL_CALIPSO_C_LIST, + NLBL_CALIPSO_C_LISTALL, + __NLBL_CALIPSO_C_MAX, +}; + +/* NetLabel CALIPSO attributes */ +enum { + NLBL_CALIPSO_A_UNSPEC, + NLBL_CALIPSO_A_DOI, + /* (NLA_U32) + * the DOI value */ + NLBL_CALIPSO_A_MTYPE, + /* (NLA_U32) + * the mapping table type (defined in the calipso.h header as + * CALIPSO_MAP_*) */ + __NLBL_CALIPSO_A_MAX, +}; + +#define NLBL_CALIPSO_A_MAX (__NLBL_CALIPSO_A_MAX - 1) + +/* NetLabel protocol functions */ +#if IS_ENABLED(CONFIG_IPV6) +int netlbl_calipso_genl_init(void); +#else +static inline int netlbl_calipso_genl_init(void) +{ + return 0; +} +#endif + +int calipso_doi_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info); +void calipso_doi_free(struct calipso_doi *doi_def); +int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info); +struct calipso_doi *calipso_doi_getdef(u32 doi); +void calipso_doi_putdef(struct calipso_doi *doi_def); +int calipso_doi_walk(u32 *skip_cnt, + int (*callback)(struct calipso_doi *doi_def, void *arg), + void *cb_arg); +int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); +int calipso_sock_setattr(struct sock *sk, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +void calipso_sock_delattr(struct sock *sk); +int calipso_req_setattr(struct request_sock *req, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +void calipso_req_delattr(struct request_sock *req); +unsigned char *calipso_optptr(const struct sk_buff *skb); +int calipso_getattr(const unsigned char *calipso, + struct netlbl_lsm_secattr *secattr); +int calipso_skbuff_setattr(struct sk_buff *skb, + const struct calipso_doi *doi_def, + const struct netlbl_lsm_secattr *secattr); +int calipso_skbuff_delattr(struct sk_buff *skb); +void calipso_cache_invalidate(void); +int calipso_cache_add(const unsigned 
char *calipso_ptr, + const struct netlbl_lsm_secattr *secattr); + +#endif diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index ada67422234b..41d0e95d171e 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -37,10 +37,12 @@ #include <linux/slab.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> +#include <net/calipso.h> #include <asm/bug.h> #include "netlabel_mgmt.h" #include "netlabel_addrlist.h" +#include "netlabel_calipso.h" #include "netlabel_domainhash.h" #include "netlabel_user.h" @@ -55,8 +57,9 @@ struct netlbl_domhsh_tbl { static DEFINE_SPINLOCK(netlbl_domhsh_lock); #define netlbl_domhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) -static struct netlbl_domhsh_tbl *netlbl_domhsh; -static struct netlbl_dom_map *netlbl_domhsh_def; +static struct netlbl_domhsh_tbl __rcu *netlbl_domhsh; +static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv4; +static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv6; /* * Domain Hash Table Helper Functions @@ -126,18 +129,26 @@ static u32 netlbl_domhsh_hash(const char *key) return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1); } +static bool netlbl_family_match(u16 f1, u16 f2) +{ + return (f1 == f2) || (f1 == AF_UNSPEC) || (f2 == AF_UNSPEC); +} + /** * netlbl_domhsh_search - Search for a domain entry * @domain: the domain + * @family: the address family * * Description: * Searches the domain hash table and returns a pointer to the hash table - * entry if found, otherwise NULL is returned. The caller is responsible for + * entry if found, otherwise NULL is returned. @family may be %AF_UNSPEC + * which matches any address family entries. The caller is responsible for * ensuring that the hash table is protected with either a RCU read lock or the * hash table lock. * */ -static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) +static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, + u16 family) { u32 bkt; struct list_head *bkt_list; @@ -147,7 +158,9 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) bkt = netlbl_domhsh_hash(domain); bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; list_for_each_entry_rcu(iter, bkt_list, list) - if (iter->valid && strcmp(iter->domain, domain) == 0) + if (iter->valid && + netlbl_family_match(iter->family, family) && + strcmp(iter->domain, domain) == 0) return iter; } @@ -157,28 +170,37 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain) /** * netlbl_domhsh_search_def - Search for a domain entry * @domain: the domain - * @def: return default if no match is found + * @family: the address family * * Description: * Searches the domain hash table and returns a pointer to the hash table * entry if an exact match is found, if an exact match is not present in the * hash table then the default entry is returned if valid otherwise NULL is - * returned. The caller is responsible ensuring that the hash table is + * returned. @family may be %AF_UNSPEC which matches any address family + * entries. The caller is responsible ensuring that the hash table is * protected with either a RCU read lock or the hash table lock. 
* */ -static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain) +static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain, + u16 family) { struct netlbl_dom_map *entry; - entry = netlbl_domhsh_search(domain); - if (entry == NULL) { - entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def); - if (entry != NULL && !entry->valid) - entry = NULL; + entry = netlbl_domhsh_search(domain, family); + if (entry != NULL) + return entry; + if (family == AF_INET || family == AF_UNSPEC) { + entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv4); + if (entry != NULL && entry->valid) + return entry; + } + if (family == AF_INET6 || family == AF_UNSPEC) { + entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv6); + if (entry != NULL && entry->valid) + return entry; } - return entry; + return NULL; } /** @@ -203,6 +225,7 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, { struct audit_buffer *audit_buf; struct cipso_v4_doi *cipsov4 = NULL; + struct calipso_doi *calipso = NULL; u32 type; audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info); @@ -221,12 +244,14 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, struct netlbl_domaddr6_map *map6; map6 = netlbl_domhsh_addr6_entry(addr6); type = map6->def.type; + calipso = map6->def.calipso; netlbl_af6list_audit_addr(audit_buf, 0, NULL, &addr6->addr, &addr6->mask); #endif /* IPv6 */ } else { type = entry->def.type; cipsov4 = entry->def.cipso; + calipso = entry->def.calipso; } switch (type) { case NETLBL_NLTYPE_UNLABELED: @@ -238,6 +263,12 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, " nlbl_protocol=cipsov4 cipso_doi=%u", cipsov4->doi); break; + case NETLBL_NLTYPE_CALIPSO: + BUG_ON(calipso == NULL); + audit_log_format(audit_buf, + " nlbl_protocol=calipso calipso_doi=%u", + calipso->doi); + break; } audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0); audit_log_end(audit_buf); @@ -264,13 +295,25 @@ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry) if (entry == NULL) return -EINVAL; + if (entry->family != AF_INET && entry->family != AF_INET6 && + (entry->family != AF_UNSPEC || + entry->def.type != NETLBL_NLTYPE_UNLABELED)) + return -EINVAL; + switch (entry->def.type) { case NETLBL_NLTYPE_UNLABELED: - if (entry->def.cipso != NULL || entry->def.addrsel != NULL) + if (entry->def.cipso != NULL || entry->def.calipso != NULL || + entry->def.addrsel != NULL) return -EINVAL; break; case NETLBL_NLTYPE_CIPSOV4: - if (entry->def.cipso == NULL) + if (entry->family != AF_INET || + entry->def.cipso == NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_CALIPSO: + if (entry->family != AF_INET6 || + entry->def.calipso == NULL) return -EINVAL; break; case NETLBL_NLTYPE_ADDRSELECT: @@ -294,6 +337,12 @@ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry) map6 = netlbl_domhsh_addr6_entry(iter6); switch (map6->def.type) { case NETLBL_NLTYPE_UNLABELED: + if (map6->def.calipso != NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_CALIPSO: + if (map6->def.calipso == NULL) + return -EINVAL; break; default: return -EINVAL; @@ -358,15 +407,18 @@ int __init netlbl_domhsh_init(u32 size) * * Description: * Adds a new entry to the domain hash table and handles any updates to the - * lower level protocol handler (i.e. CIPSO). Returns zero on success, - * negative on failure. + * lower level protocol handler (i.e. CIPSO). @entry->family may be set to + * %AF_UNSPEC which will add an entry that matches all address families. 
This + * is only useful for the unlabelled type and will only succeed if there is no + * existing entry for any address family with the same domain. Returns zero + * on success, negative on failure. * */ int netlbl_domhsh_add(struct netlbl_dom_map *entry, struct netlbl_audit *audit_info) { int ret_val = 0; - struct netlbl_dom_map *entry_old; + struct netlbl_dom_map *entry_old, *entry_b; struct netlbl_af4list *iter4; struct netlbl_af4list *tmp4; #if IS_ENABLED(CONFIG_IPV6) @@ -385,9 +437,10 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, rcu_read_lock(); spin_lock(&netlbl_domhsh_lock); if (entry->domain != NULL) - entry_old = netlbl_domhsh_search(entry->domain); + entry_old = netlbl_domhsh_search(entry->domain, entry->family); else - entry_old = netlbl_domhsh_search_def(entry->domain); + entry_old = netlbl_domhsh_search_def(entry->domain, + entry->family); if (entry_old == NULL) { entry->valid = 1; @@ -397,7 +450,41 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, &rcu_dereference(netlbl_domhsh)->tbl[bkt]); } else { INIT_LIST_HEAD(&entry->list); - rcu_assign_pointer(netlbl_domhsh_def, entry); + switch (entry->family) { + case AF_INET: + rcu_assign_pointer(netlbl_domhsh_def_ipv4, + entry); + break; + case AF_INET6: + rcu_assign_pointer(netlbl_domhsh_def_ipv6, + entry); + break; + case AF_UNSPEC: + if (entry->def.type != + NETLBL_NLTYPE_UNLABELED) { + ret_val = -EINVAL; + goto add_return; + } + entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC); + if (entry_b == NULL) { + ret_val = -ENOMEM; + goto add_return; + } + entry_b->family = AF_INET6; + entry_b->def.type = NETLBL_NLTYPE_UNLABELED; + entry_b->valid = 1; + entry->family = AF_INET; + rcu_assign_pointer(netlbl_domhsh_def_ipv4, + entry); + rcu_assign_pointer(netlbl_domhsh_def_ipv6, + entry_b); + break; + default: + /* Already checked in + * netlbl_domhsh_validate(). 
*/ + ret_val = -EINVAL; + goto add_return; + } } if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) { @@ -513,10 +600,12 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, spin_lock(&netlbl_domhsh_lock); if (entry->valid) { entry->valid = 0; - if (entry != rcu_dereference(netlbl_domhsh_def)) - list_del_rcu(&entry->list); + if (entry == rcu_dereference(netlbl_domhsh_def_ipv4)) + RCU_INIT_POINTER(netlbl_domhsh_def_ipv4, NULL); + else if (entry == rcu_dereference(netlbl_domhsh_def_ipv6)) + RCU_INIT_POINTER(netlbl_domhsh_def_ipv6, NULL); else - RCU_INIT_POINTER(netlbl_domhsh_def, NULL); + list_del_rcu(&entry->list); } else ret_val = -ENOENT; spin_unlock(&netlbl_domhsh_lock); @@ -533,6 +622,10 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, if (ret_val == 0) { struct netlbl_af4list *iter4; struct netlbl_domaddr4_map *map4; +#if IS_ENABLED(CONFIG_IPV6) + struct netlbl_af6list *iter6; + struct netlbl_domaddr6_map *map6; +#endif /* IPv6 */ switch (entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: @@ -541,12 +634,22 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry, map4 = netlbl_domhsh_addr4_entry(iter4); cipso_v4_doi_putdef(map4->def.cipso); } - /* no need to check the IPv6 list since we currently - * support only unlabeled protocols for IPv6 */ +#if IS_ENABLED(CONFIG_IPV6) + netlbl_af6list_foreach_rcu(iter6, + &entry->def.addrsel->list6) { + map6 = netlbl_domhsh_addr6_entry(iter6); + calipso_doi_putdef(map6->def.calipso); + } +#endif /* IPv6 */ break; case NETLBL_NLTYPE_CIPSOV4: cipso_v4_doi_putdef(entry->def.cipso); break; +#if IS_ENABLED(CONFIG_IPV6) + case NETLBL_NLTYPE_CALIPSO: + calipso_doi_putdef(entry->def.calipso); + break; +#endif /* IPv6 */ } call_rcu(&entry->rcu, netlbl_domhsh_free_entry); } @@ -583,9 +686,9 @@ int netlbl_domhsh_remove_af4(const char *domain, rcu_read_lock(); if (domain) - entry_map = netlbl_domhsh_search(domain); + entry_map = netlbl_domhsh_search(domain, AF_INET); else - entry_map = netlbl_domhsh_search_def(domain); + entry_map = netlbl_domhsh_search_def(domain, AF_INET); if (entry_map == NULL || entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT) goto remove_af4_failure; @@ -622,28 +725,114 @@ remove_af4_failure: return -ENOENT; } +#if IS_ENABLED(CONFIG_IPV6) +/** + * netlbl_domhsh_remove_af6 - Removes an address selector entry + * @domain: the domain + * @addr: IPv6 address + * @mask: IPv6 address mask + * @audit_info: NetLabel audit information + * + * Description: + * Removes an individual address selector from a domain mapping and potentially + * the entire mapping if it is empty. Returns zero on success, negative values + * on failure. 
+ * + */ +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info) +{ + struct netlbl_dom_map *entry_map; + struct netlbl_af6list *entry_addr; + struct netlbl_af4list *iter4; + struct netlbl_af6list *iter6; + struct netlbl_domaddr6_map *entry; + + rcu_read_lock(); + + if (domain) + entry_map = netlbl_domhsh_search(domain, AF_INET6); + else + entry_map = netlbl_domhsh_search_def(domain, AF_INET6); + if (entry_map == NULL || + entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT) + goto remove_af6_failure; + + spin_lock(&netlbl_domhsh_lock); + entry_addr = netlbl_af6list_remove(addr, mask, + &entry_map->def.addrsel->list6); + spin_unlock(&netlbl_domhsh_lock); + + if (entry_addr == NULL) + goto remove_af6_failure; + netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4) + goto remove_af6_single_addr; + netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6) + goto remove_af6_single_addr; + /* the domain mapping is empty so remove it from the mapping table */ + netlbl_domhsh_remove_entry(entry_map, audit_info); + +remove_af6_single_addr: + rcu_read_unlock(); + /* yick, we can't use call_rcu here because we don't have a rcu head + * pointer but hopefully this should be a rare case so the pause + * shouldn't be a problem */ + synchronize_rcu(); + entry = netlbl_domhsh_addr6_entry(entry_addr); + calipso_doi_putdef(entry->def.calipso); + kfree(entry); + return 0; + +remove_af6_failure: + rcu_read_unlock(); + return -ENOENT; +} +#endif /* IPv6 */ + /** * netlbl_domhsh_remove - Removes an entry from the domain hash table * @domain: the domain to remove + * @family: address family * @audit_info: NetLabel audit information * * Description: * Removes an entry from the domain hash table and handles any updates to the - * lower level protocol handler (i.e. CIPSO). Returns zero on success, - * negative on failure. + * lower level protocol handler (i.e. CIPSO). @family may be %AF_UNSPEC which + * removes all address family entries. Returns zero on success, negative on + * failure. 
* */ -int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) +int netlbl_domhsh_remove(const char *domain, u16 family, + struct netlbl_audit *audit_info) { - int ret_val; + int ret_val = -EINVAL; struct netlbl_dom_map *entry; rcu_read_lock(); - if (domain) - entry = netlbl_domhsh_search(domain); - else - entry = netlbl_domhsh_search_def(domain); - ret_val = netlbl_domhsh_remove_entry(entry, audit_info); + + if (family == AF_INET || family == AF_UNSPEC) { + if (domain) + entry = netlbl_domhsh_search(domain, AF_INET); + else + entry = netlbl_domhsh_search_def(domain, AF_INET); + ret_val = netlbl_domhsh_remove_entry(entry, audit_info); + if (ret_val && ret_val != -ENOENT) + goto done; + } + if (family == AF_INET6 || family == AF_UNSPEC) { + int ret_val2; + + if (domain) + entry = netlbl_domhsh_search(domain, AF_INET6); + else + entry = netlbl_domhsh_search_def(domain, AF_INET6); + ret_val2 = netlbl_domhsh_remove_entry(entry, audit_info); + if (ret_val2 != -ENOENT) + ret_val = ret_val2; + } +done: rcu_read_unlock(); return ret_val; @@ -651,32 +840,38 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info) /** * netlbl_domhsh_remove_default - Removes the default entry from the table + * @family: address family * @audit_info: NetLabel audit information * * Description: - * Removes/resets the default entry for the domain hash table and handles any - * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on - * success, non-zero on failure. + * Removes/resets the default entry corresponding to @family from the domain + * hash table and handles any updates to the lower level protocol handler + * (i.e. CIPSO). @family may be %AF_UNSPEC which removes all address family + * entries. Returns zero on success, negative on failure. * */ -int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info) +int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info) { - return netlbl_domhsh_remove(NULL, audit_info); + return netlbl_domhsh_remove(NULL, family, audit_info); } /** * netlbl_domhsh_getentry - Get an entry from the domain hash table * @domain: the domain name to search for + * @family: address family * * Description: * Look through the domain hash table searching for an entry to match @domain, - * return a pointer to a copy of the entry or NULL. The caller is responsible - * for ensuring that rcu_read_[un]lock() is called. + * with address family @family, return a pointer to a copy of the entry or + * NULL. The caller is responsible for ensuring that rcu_read_[un]lock() is + * called. 
* */ -struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain) +struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family) { - return netlbl_domhsh_search_def(domain); + if (family == AF_UNSPEC) + return NULL; + return netlbl_domhsh_search_def(domain, family); } /** @@ -696,7 +891,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain, struct netlbl_dom_map *dom_iter; struct netlbl_af4list *addr_iter; - dom_iter = netlbl_domhsh_search_def(domain); + dom_iter = netlbl_domhsh_search_def(domain, AF_INET); if (dom_iter == NULL) return NULL; @@ -726,7 +921,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, struct netlbl_dom_map *dom_iter; struct netlbl_af6list *addr_iter; - dom_iter = netlbl_domhsh_search_def(domain); + dom_iter = netlbl_domhsh_search_def(domain, AF_INET6); if (dom_iter == NULL) return NULL; diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h index 680caf4dff56..1f9247781927 100644 --- a/net/netlabel/netlabel_domainhash.h +++ b/net/netlabel/netlabel_domainhash.h @@ -51,6 +51,7 @@ struct netlbl_dommap_def { union { struct netlbl_domaddr_map *addrsel; struct cipso_v4_doi *cipso; + struct calipso_doi *calipso; }; }; #define netlbl_domhsh_addr4_entry(iter) \ @@ -70,6 +71,7 @@ struct netlbl_domaddr6_map { struct netlbl_dom_map { char *domain; + u16 family; struct netlbl_dommap_def def; u32 valid; @@ -91,14 +93,23 @@ int netlbl_domhsh_remove_af4(const char *domain, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info); -int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info); -int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info); -struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain); +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info); +int netlbl_domhsh_remove(const char *domain, u16 family, + struct netlbl_audit *audit_info); +int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info); +struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family); struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain, __be32 addr); #if IS_ENABLED(CONFIG_IPV6) struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain, const struct in6_addr *addr); +int netlbl_domhsh_remove_af6(const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info); #endif /* IPv6 */ int netlbl_domhsh_walk(u32 *skip_bkt, diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 1325776daa27..28c56b95fb7f 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -37,12 +37,14 @@ #include <net/ipv6.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> +#include <net/calipso.h> #include <asm/bug.h> #include <linux/atomic.h> #include "netlabel_domainhash.h" #include "netlabel_unlabeled.h" #include "netlabel_cipso_v4.h" +#include "netlabel_calipso.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" #include "netlabel_addrlist.h" @@ -72,12 +74,17 @@ int netlbl_cfg_map_del(const char *domain, struct netlbl_audit *audit_info) { if (addr == NULL && mask == NULL) { - return netlbl_domhsh_remove(domain, audit_info); + return netlbl_domhsh_remove(domain, family, audit_info); } else if (addr != NULL && mask != NULL) { switch (family) { case AF_INET: return 
netlbl_domhsh_remove_af4(domain, addr, mask, audit_info); +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return netlbl_domhsh_remove_af6(domain, addr, mask, + audit_info); +#endif /* IPv6 */ default: return -EPFNOSUPPORT; } @@ -119,6 +126,7 @@ int netlbl_cfg_unlbl_map_add(const char *domain, if (entry->domain == NULL) goto cfg_unlbl_map_add_failure; } + entry->family = family; if (addr == NULL && mask == NULL) entry->def.type = NETLBL_NLTYPE_UNLABELED; @@ -345,6 +353,7 @@ int netlbl_cfg_cipsov4_map_add(u32 doi, entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (entry == NULL) goto out_entry; + entry->family = AF_INET; if (domain != NULL) { entry->domain = kstrdup(domain, GFP_ATOMIC); if (entry->domain == NULL) @@ -399,6 +408,139 @@ out_entry: return ret_val; } +/** + * netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition + * @doi_def: CALIPSO DOI definition + * @audit_info: NetLabel audit information + * + * Description: + * Add a new CALIPSO DOI definition as defined by @doi_def. Returns zero on + * success and negative values on failure. + * + */ +int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, + struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + return calipso_doi_add(doi_def, audit_info); +#else /* IPv6 */ + return -ENOSYS; +#endif /* IPv6 */ +} + +/** + * netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition + * @doi: CALIPSO DOI + * @audit_info: NetLabel audit information + * + * Description: + * Remove an existing CALIPSO DOI definition matching @doi. Returns zero on + * success and negative values on failure. + * + */ +void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + calipso_doi_remove(doi, audit_info); +#endif /* IPv6 */ +} + +/** + * netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping + * @doi: the CALIPSO DOI + * @domain: the domain mapping to add + * @addr: IP address + * @mask: IP address mask + * @audit_info: NetLabel audit information + * + * Description: + * Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the + * NetLabel subsystem. A @domain value of NULL adds a new default domain + * mapping. Returns zero on success, negative values on failure. 
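+ * + * [Editorial illustration, not part of the original patch.] An LSM wiring up + * a default IPv6 mapping for a previously defined DOI might do something + * like the following (error handling kept minimal): + * + * ret_val = netlbl_cfg_calipso_add(doi_def, audit_info); + * if (ret_val != 0) + * return ret_val; + * ret_val = netlbl_cfg_calipso_map_add(doi_def->doi, NULL, NULL, NULL, + * audit_info); + * if (ret_val != 0) + * netlbl_cfg_calipso_del(doi_def->doi, audit_info);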
+ * + */ +int netlbl_cfg_calipso_map_add(u32 doi, + const char *domain, + const struct in6_addr *addr, + const struct in6_addr *mask, + struct netlbl_audit *audit_info) +{ +#if IS_ENABLED(CONFIG_IPV6) + int ret_val = -ENOMEM; + struct calipso_doi *doi_def; + struct netlbl_dom_map *entry; + struct netlbl_domaddr_map *addrmap = NULL; + struct netlbl_domaddr6_map *addrinfo = NULL; + + doi_def = calipso_doi_getdef(doi); + if (doi_def == NULL) + return -ENOENT; + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) + goto out_entry; + entry->family = AF_INET6; + if (domain != NULL) { + entry->domain = kstrdup(domain, GFP_ATOMIC); + if (entry->domain == NULL) + goto out_domain; + } + + if (addr == NULL && mask == NULL) { + entry->def.calipso = doi_def; + entry->def.type = NETLBL_NLTYPE_CALIPSO; + } else if (addr != NULL && mask != NULL) { + addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC); + if (addrmap == NULL) + goto out_addrmap; + INIT_LIST_HEAD(&addrmap->list4); + INIT_LIST_HEAD(&addrmap->list6); + + addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC); + if (addrinfo == NULL) + goto out_addrinfo; + addrinfo->def.calipso = doi_def; + addrinfo->def.type = NETLBL_NLTYPE_CALIPSO; + addrinfo->list.addr = *addr; + addrinfo->list.addr.s6_addr32[0] &= mask->s6_addr32[0]; + addrinfo->list.addr.s6_addr32[1] &= mask->s6_addr32[1]; + addrinfo->list.addr.s6_addr32[2] &= mask->s6_addr32[2]; + addrinfo->list.addr.s6_addr32[3] &= mask->s6_addr32[3]; + addrinfo->list.mask = *mask; + addrinfo->list.valid = 1; + ret_val = netlbl_af6list_add(&addrinfo->list, &addrmap->list6); + if (ret_val != 0) + goto cfg_calipso_map_add_failure; + + entry->def.addrsel = addrmap; + entry->def.type = NETLBL_NLTYPE_ADDRSELECT; + } else { + ret_val = -EINVAL; + goto out_addrmap; + } + + ret_val = netlbl_domhsh_add(entry, audit_info); + if (ret_val != 0) + goto cfg_calipso_map_add_failure; + + return 0; + +cfg_calipso_map_add_failure: + kfree(addrinfo); +out_addrinfo: + kfree(addrmap); +out_addrmap: + kfree(entry->domain); +out_domain: + kfree(entry); +out_entry: + calipso_doi_putdef(doi_def); + return ret_val; +#else /* IPv6 */ + return -ENOSYS; +#endif /* IPv6 */ +} + /* * Security Attribute Functions */ @@ -519,6 +661,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) return -ENOENT; } +EXPORT_SYMBOL(netlbl_catmap_walk); /** * netlbl_catmap_walkrng - Find the end of a string of set bits @@ -609,20 +752,19 @@ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, off = catmap->startbit; *offset = off; } - iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_NONE, 0); + iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_WALK, 0); if (iter == NULL) { *offset = (u32)-1; return 0; } if (off < iter->startbit) { - off = iter->startbit; - *offset = off; + *offset = iter->startbit; + off = 0; } else off -= iter->startbit; - idx = off / NETLBL_CATMAP_MAPSIZE; - *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_SIZE); + *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_MAPSIZE); return 0; } @@ -655,6 +797,7 @@ int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap, return 0; } +EXPORT_SYMBOL(netlbl_catmap_setbit); /** * netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap @@ -727,6 +870,76 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, return 0; } +/* Bitmap functions + */ + +/** + * netlbl_bitmap_walk - Walk a bitmap looking for a bit + * @bitmap: the bitmap + * @bitmap_len: length in bits + * @offset: starting offset + * @state: if non-zero, look for a set (1) 
bit else look for a cleared (0) bit + * + * Description: + * Starting at @offset, walk the bitmap from left to right until either the + * desired bit is found or we reach the end. Return the bit offset, -1 if + * not found, or -2 if error. + */ +int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, + u32 offset, u8 state) +{ + u32 bit_spot; + u32 byte_offset; + unsigned char bitmask; + unsigned char byte; + + byte_offset = offset / 8; + byte = bitmap[byte_offset]; + bit_spot = offset; + bitmask = 0x80 >> (offset % 8); + + while (bit_spot < bitmap_len) { + if ((state && (byte & bitmask) == bitmask) || + (state == 0 && (byte & bitmask) == 0)) + return bit_spot; + + bit_spot++; + bitmask >>= 1; + if (bitmask == 0) { + byte = bitmap[++byte_offset]; + bitmask = 0x80; + } + } + + return -1; +} +EXPORT_SYMBOL(netlbl_bitmap_walk); + +/** + * netlbl_bitmap_setbit - Sets a single bit in a bitmap + * @bitmap: the bitmap + * @bit: the bit + * @state: if non-zero, set the bit (1) else clear the bit (0) + * + * Description: + * Set a single bit in the bitmask. Returns zero on success, negative values + * on error. + */ +void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state) +{ + u32 byte_spot; + u8 bitmask; + + /* gcc always rounds to zero when doing integer division */ + byte_spot = bit / 8; + bitmask = 0x80 >> (bit % 8); + if (state) + bitmap[byte_spot] |= bitmask; + else + bitmap[byte_spot] &= ~bitmask; +} +EXPORT_SYMBOL(netlbl_bitmap_setbit); + /* * LSM Functions */ @@ -774,7 +987,7 @@ int netlbl_sock_setattr(struct sock *sk, struct netlbl_dom_map *dom_entry; rcu_read_lock(); - dom_entry = netlbl_domhsh_getentry(secattr->domain); + dom_entry = netlbl_domhsh_getentry(secattr->domain, family); if (dom_entry == NULL) { ret_val = -ENOENT; goto socket_setattr_return; @@ -799,9 +1012,21 @@ int netlbl_sock_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + switch (dom_entry->def.type) { + case NETLBL_NLTYPE_ADDRSELECT: + ret_val = -EDESTADDRREQ; + break; + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_sock_setattr(sk, + dom_entry->def.calipso, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -824,7 +1049,16 @@ socket_setattr_return: */ void netlbl_sock_delattr(struct sock *sk) { - cipso_v4_sock_delattr(sk); + switch (sk->sk_family) { + case AF_INET: + cipso_v4_sock_delattr(sk); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + calipso_sock_delattr(sk); + break; +#endif /* IPv6 */ + } } /** @@ -850,7 +1084,7 @@ int netlbl_sock_getattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - ret_val = -ENOMSG; + ret_val = calipso_sock_getattr(sk, secattr); break; #endif /* IPv6 */ default: @@ -878,6 +1112,9 @@ int netlbl_conn_setattr(struct sock *sk, { int ret_val; struct sockaddr_in *addr4; +#if IS_ENABLED(CONFIG_IPV6) + struct sockaddr_in6 *addr6; +#endif struct netlbl_dommap_def *entry; rcu_read_lock(); @@ -898,7 +1135,7 @@ int netlbl_conn_setattr(struct sock *sk, case NETLBL_NLTYPE_UNLABELED: /* just delete the protocols we support for right now * but we could remove other protocols if needed */ - cipso_v4_sock_delattr(sk); + netlbl_sock_delattr(sk); ret_val = 0; break; default: @@ -907,9 +1144,27 @@ int netlbl_conn_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we 
don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + addr6 = (struct sockaddr_in6 *)addr; + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &addr6->sin6_addr); + if (entry == NULL) { + ret_val = -ENOENT; + goto conn_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_sock_setattr(sk, + entry->calipso, secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for right now + * but we could remove other protocols if needed */ + netlbl_sock_delattr(sk); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -936,12 +1191,13 @@ int netlbl_req_setattr(struct request_sock *req, { int ret_val; struct netlbl_dommap_def *entry; + struct inet_request_sock *ireq = inet_rsk(req); rcu_read_lock(); switch (req->rsk_ops->family) { case AF_INET: entry = netlbl_domhsh_getentry_af4(secattr->domain, - inet_rsk(req)->ir_rmt_addr); + ireq->ir_rmt_addr); if (entry == NULL) { ret_val = -ENOENT; goto req_setattr_return; @@ -952,9 +1208,7 @@ int netlbl_req_setattr(struct request_sock *req, entry->cipso, secattr); break; case NETLBL_NLTYPE_UNLABELED: - /* just delete the protocols we support for right now - * but we could remove other protocols if needed */ - cipso_v4_req_delattr(req); + netlbl_req_delattr(req); ret_val = 0; break; default: @@ -963,9 +1217,24 @@ int netlbl_req_setattr(struct request_sock *req, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &ireq->ir_v6_rmt_addr); + if (entry == NULL) { + ret_val = -ENOENT; + goto req_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_req_setattr(req, + entry->calipso, secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + netlbl_req_delattr(req); + ret_val = 0; + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -987,7 +1256,16 @@ req_setattr_return: */ void netlbl_req_delattr(struct request_sock *req) { - cipso_v4_req_delattr(req); + switch (req->rsk_ops->family) { + case AF_INET: + cipso_v4_req_delattr(req); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + calipso_req_delattr(req); + break; +#endif /* IPv6 */ + } } /** @@ -1007,13 +1285,17 @@ int netlbl_skbuff_setattr(struct sk_buff *skb, { int ret_val; struct iphdr *hdr4; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *hdr6; +#endif struct netlbl_dommap_def *entry; rcu_read_lock(); switch (family) { case AF_INET: hdr4 = ip_hdr(skb); - entry = netlbl_domhsh_getentry_af4(secattr->domain,hdr4->daddr); + entry = netlbl_domhsh_getentry_af4(secattr->domain, + hdr4->daddr); if (entry == NULL) { ret_val = -ENOENT; goto skbuff_setattr_return; @@ -1034,9 +1316,26 @@ int netlbl_skbuff_setattr(struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - /* since we don't support any IPv6 labeling protocols right - * now we can optimize everything away until we do */ - ret_val = 0; + hdr6 = ipv6_hdr(skb); + entry = netlbl_domhsh_getentry_af6(secattr->domain, + &hdr6->daddr); + if (entry == NULL) { + ret_val = -ENOENT; + goto skbuff_setattr_return; + } + switch (entry->type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = calipso_skbuff_setattr(skb, entry->calipso, + secattr); + break; + case NETLBL_NLTYPE_UNLABELED: + /* just delete the protocols we support for 
right now + * but we could remove other protocols if needed */ + ret_val = calipso_skbuff_delattr(skb); + break; + default: + ret_val = -ENOENT; + } break; #endif /* IPv6 */ default: @@ -1075,6 +1374,9 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + ptr = calipso_optptr(skb); + if (ptr && calipso_getattr(ptr, secattr) == 0) + return 0; break; #endif /* IPv6 */ } @@ -1085,6 +1387,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, /** * netlbl_skbuff_err - Handle a LSM error on a sk_buff * @skb: the packet + * @family: the family * @error: the error code * @gateway: true if host is acting as a gateway, false otherwise * @@ -1094,10 +1397,14 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb, * according to the packet's labeling protocol. * */ -void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway) +void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway) { - if (cipso_v4_optptr(skb)) - cipso_v4_error(skb, error, gateway); + switch (family) { + case AF_INET: + if (cipso_v4_optptr(skb)) + cipso_v4_error(skb, error, gateway); + break; + } } /** @@ -1112,11 +1419,15 @@ void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway) void netlbl_cache_invalidate(void) { cipso_v4_cache_invalidate(); +#if IS_ENABLED(CONFIG_IPV6) + calipso_cache_invalidate(); +#endif /* IPv6 */ } /** * netlbl_cache_add - Add an entry to a NetLabel protocol cache * @skb: the packet + * @family: the family * @secattr: the packet's security attributes * * Description: @@ -1125,7 +1436,7 @@ void netlbl_cache_invalidate(void) * values on error. * */ -int netlbl_cache_add(const struct sk_buff *skb, +int netlbl_cache_add(const struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr) { unsigned char *ptr; @@ -1133,10 +1444,20 @@ int netlbl_cache_add(const struct sk_buff *skb, if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0) return -ENOMSG; - ptr = cipso_v4_optptr(skb); - if (ptr) - return cipso_v4_cache_add(ptr, secattr); - + switch (family) { + case AF_INET: + ptr = cipso_v4_optptr(skb); + if (ptr) + return cipso_v4_cache_add(ptr, secattr); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ptr = calipso_optptr(skb); + if (ptr) + return calipso_cache_add(ptr, secattr); + break; +#endif /* IPv6 */ + } return -ENOMSG; } @@ -1161,6 +1482,7 @@ struct audit_buffer *netlbl_audit_start(int type, { return netlbl_audit_start_common(type, audit_info); } +EXPORT_SYMBOL(netlbl_audit_start); /* * Setup Functions diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 13f777f20995..f85d0e07af2d 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -41,8 +41,10 @@ #include <net/ipv6.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> +#include <net/calipso.h> #include <linux/atomic.h> +#include "netlabel_calipso.h" #include "netlabel_domainhash.h" #include "netlabel_user.h" #include "netlabel_mgmt.h" @@ -72,6 +74,8 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { [NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 }, [NLBL_MGMT_A_VERSION] = { .type = NLA_U32 }, [NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 }, + [NLBL_MGMT_A_FAMILY] = { .type = NLA_U16 }, + [NLBL_MGMT_A_CLPDOI] = { .type = NLA_U32 }, }; /* @@ -95,6 +99,9 @@ static int netlbl_mgmt_add_common(struct genl_info *info, int ret_val = -EINVAL; struct netlbl_domaddr_map *addrmap = NULL; struct cipso_v4_doi *cipsov4 = NULL; +#if IS_ENABLED(CONFIG_IPV6) + struct calipso_doi 
*calipso = NULL; +#endif u32 tmp_val; struct netlbl_dom_map *entry = kzalloc(sizeof(*entry), GFP_KERNEL); @@ -119,6 +126,11 @@ static int netlbl_mgmt_add_common(struct genl_info *info, switch (entry->def.type) { case NETLBL_NLTYPE_UNLABELED: + if (info->attrs[NLBL_MGMT_A_FAMILY]) + entry->family = + nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]); + else + entry->family = AF_UNSPEC; break; case NETLBL_NLTYPE_CIPSOV4: if (!info->attrs[NLBL_MGMT_A_CV4DOI]) @@ -128,12 +140,30 @@ static int netlbl_mgmt_add_common(struct genl_info *info, cipsov4 = cipso_v4_doi_getdef(tmp_val); if (cipsov4 == NULL) goto add_free_domain; + entry->family = AF_INET; entry->def.cipso = cipsov4; break; +#if IS_ENABLED(CONFIG_IPV6) + case NETLBL_NLTYPE_CALIPSO: + if (!info->attrs[NLBL_MGMT_A_CLPDOI]) + goto add_free_domain; + + tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CLPDOI]); + calipso = calipso_doi_getdef(tmp_val); + if (calipso == NULL) + goto add_free_domain; + entry->family = AF_INET6; + entry->def.calipso = calipso; + break; +#endif /* IPv6 */ default: goto add_free_domain; } + if ((entry->family == AF_INET && info->attrs[NLBL_MGMT_A_IPV6ADDR]) || + (entry->family == AF_INET6 && info->attrs[NLBL_MGMT_A_IPV4ADDR])) + goto add_doi_put_def; + if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) { struct in_addr *addr; struct in_addr *mask; @@ -178,6 +208,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, goto add_free_addrmap; } + entry->family = AF_INET; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; entry->def.addrsel = addrmap; #if IS_ENABLED(CONFIG_IPV6) @@ -220,6 +251,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info, map->list.mask = *mask; map->list.valid = 1; map->def.type = entry->def.type; + if (calipso) + map->def.calipso = calipso; ret_val = netlbl_af6list_add(&map->list, &addrmap->list6); if (ret_val != 0) { @@ -227,6 +260,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, goto add_free_addrmap; } + entry->family = AF_INET6; entry->def.type = NETLBL_NLTYPE_ADDRSELECT; entry->def.addrsel = addrmap; #endif /* IPv6 */ @@ -242,6 +276,9 @@ add_free_addrmap: kfree(addrmap); add_doi_put_def: cipso_v4_doi_putdef(cipsov4); +#if IS_ENABLED(CONFIG_IPV6) + calipso_doi_putdef(calipso); +#endif add_free_domain: kfree(entry->domain); add_free_entry: @@ -278,6 +315,10 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, return ret_val; } + ret_val = nla_put_u16(skb, NLBL_MGMT_A_FAMILY, entry->family); + if (ret_val != 0) + return ret_val; + switch (entry->def.type) { case NETLBL_NLTYPE_ADDRSELECT: nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST); @@ -340,6 +381,15 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, if (ret_val != 0) return ret_val; + switch (map6->def.type) { + case NETLBL_NLTYPE_CALIPSO: + ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI, + map6->def.calipso->doi); + if (ret_val != 0) + return ret_val; + break; + } + nla_nest_end(skb, nla_b); } #endif /* IPv6 */ @@ -347,15 +397,25 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb, nla_nest_end(skb, nla_a); break; case NETLBL_NLTYPE_UNLABELED: - ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type); + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + entry->def.type); break; case NETLBL_NLTYPE_CIPSOV4: - ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type); + ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + entry->def.type); if (ret_val != 0) return ret_val; ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI, entry->def.cipso->doi); break; + case NETLBL_NLTYPE_CALIPSO: + ret_val = 
nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL, + entry->def.type); + if (ret_val != 0) + return ret_val; + ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI, + entry->def.calipso->doi); + break; } return ret_val; @@ -418,7 +478,7 @@ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info) netlbl_netlink_auditinfo(skb, &audit_info); domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]); - return netlbl_domhsh_remove(domain, &audit_info); + return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info); } /** @@ -536,7 +596,7 @@ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info) netlbl_netlink_auditinfo(skb, &audit_info); - return netlbl_domhsh_remove_default(&audit_info); + return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info); } /** @@ -556,6 +616,12 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info) struct sk_buff *ans_skb = NULL; void *data; struct netlbl_dom_map *entry; + u16 family; + + if (info->attrs[NLBL_MGMT_A_FAMILY]) + family = nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]); + else + family = AF_INET; ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) @@ -566,7 +632,7 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info) goto listdef_failure; rcu_read_lock(); - entry = netlbl_domhsh_getentry(NULL); + entry = netlbl_domhsh_getentry(NULL, family); if (entry == NULL) { ret_val = -ENOENT; goto listdef_failure_lock; @@ -651,6 +717,15 @@ static int netlbl_mgmt_protocols(struct sk_buff *skb, goto protocols_return; protos_sent++; } +#if IS_ENABLED(CONFIG_IPV6) + if (protos_sent == 2) { + if (netlbl_mgmt_protocols_cb(skb, + cb, + NETLBL_NLTYPE_CALIPSO) < 0) + goto protocols_return; + protos_sent++; + } +#endif protocols_return: cb->args[0] = protos_sent; diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h index 8b6e1ab62b48..ea01e42bca78 100644 --- a/net/netlabel/netlabel_mgmt.h +++ b/net/netlabel/netlabel_mgmt.h @@ -58,7 +58,10 @@ * * NLBL_MGMT_A_CV4DOI * - * If using NETLBL_NLTYPE_UNLABELED no other attributes are required. + * If using NETLBL_NLTYPE_UNLABELED no other attributes are required, + * however the following attribute may optionally be sent: + * + * NLBL_MGMT_A_FAMILY * * o REMOVE: * Sent by an application to remove a domain mapping from the NetLabel @@ -77,6 +80,7 @@ * Required attributes: * * NLBL_MGMT_A_DOMAIN + * NLBL_MGMT_A_FAMILY * * If the IP address selectors are not used the following attribute is * required: @@ -108,7 +112,10 @@ * * NLBL_MGMT_A_CV4DOI * - * If using NETLBL_NLTYPE_UNLABELED no other attributes are required. + * If using NETLBL_NLTYPE_UNLABELED no other attributes are required, + * however the following attribute may optionally be sent: + * + * NLBL_MGMT_A_FAMILY * * o REMOVEDEF: * Sent by an application to remove the default domain mapping from the @@ -117,13 +124,17 @@ * o LISTDEF: * This message can be sent either from an application or by the kernel in * response to an application generated LISTDEF message. When sent by an - * application there is no payload. On success the kernel should send a - * response using the following format. + * application there may be an optional payload. 
* - * If the IP address selectors are not used the following attribute is + * NLBL_MGMT_A_FAMILY + * + * On success the kernel should send a response using the following format: + * + * If the IP address selectors are not used the following attributes are * required: * * NLBL_MGMT_A_PROTOCOL + * NLBL_MGMT_A_FAMILY * * If the IP address selectors are used then the following attritbute is * required: @@ -209,6 +220,12 @@ enum { /* (NLA_NESTED) * the selector list, there must be at least one * NLBL_MGMT_A_ADDRSELECTOR attribute */ + NLBL_MGMT_A_FAMILY, + /* (NLA_U16) + * The address family */ + NLBL_MGMT_A_CLPDOI, + /* (NLA_U32) + * the CALIPSO DOI value */ __NLBL_MGMT_A_MAX, }; #define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 9eaa9a1e8629..4528cff9138b 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -116,8 +116,8 @@ struct netlbl_unlhsh_walk_arg { static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) -static struct netlbl_unlhsh_tbl *netlbl_unlhsh; -static struct netlbl_unlhsh_iface *netlbl_unlhsh_def; +static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh; +static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ static u8 netlabel_unlabel_acceptflg; @@ -1537,6 +1537,7 @@ int __init netlbl_unlabel_defconf(void) entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) return -ENOMEM; + entry->family = AF_UNSPEC; entry->def.type = NETLBL_NLTYPE_UNLABELED; ret_val = netlbl_domhsh_add_default(entry, &audit_info); if (ret_val != 0) diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index adf8b7900da2..58495f44c62a 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -44,6 +44,7 @@ #include "netlabel_mgmt.h" #include "netlabel_unlabeled.h" #include "netlabel_cipso_v4.h" +#include "netlabel_calipso.h" #include "netlabel_user.h" /* @@ -71,6 +72,10 @@ int __init netlbl_netlink_init(void) if (ret_val != 0) return ret_val; + ret_val = netlbl_calipso_genl_init(); + if (ret_val != 0) + return ret_val; + return netlbl_unlabel_genl_init(); } diff --git a/net/sctp/output.c b/net/sctp/output.c index 7425f6c23888..1f1682b9a6a8 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -610,7 +610,8 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) /* We will generate more packets, so re-queue * auth chunk. */ - list_add(&chunk->list, &packet->chunk_list); + list_add(&packet->auth->list, + &packet->chunk_list); } else { sctp_chunk_free(packet->auth); packet->auth = NULL; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 8812e1bf6c1c..9fc417a8b476 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2079,7 +2079,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, lock_sock(sk); if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED) && - !sctp_sstate(sk, CLOSING)) { + !sctp_sstate(sk, CLOSING) && !sctp_sstate(sk, CLOSED)) { err = -ENOTCONN; goto out; } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index ec166d2bd2d9..877e55066f89 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -204,7 +204,9 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) /* If the socket is just going to throw this away, do not * even try to deliver it. 
*/ - if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) + if (sk->sk_shutdown & RCV_SHUTDOWN && + (sk->sk_shutdown & SEND_SHUTDOWN || + !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 040ff627c18a..a7e42f9a405c 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) ret = kstrtoul(val, 0, &num); if (ret == -EINVAL) goto out_inval; - nbits = fls(num); - if (num > (1U << nbits)) - nbits++; + nbits = fls(num - 1); if (nbits > MAX_HASHTABLE_BITS || nbits < 2) goto out_inval; *(unsigned int *)kp->arg = nbits; @@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); bool -rpcauth_cred_key_to_expire(struct rpc_cred *cred) +rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) { + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) + return false; if (!cred->cr_ops->crkey_to_expire) return false; return cred->cr_ops->crkey_to_expire(cred); diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 54dd3fdead54..168219535a34 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) /* Fast track for non crkey_timeout (no key) underlying credentials */ - if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) return 0; /* Fast track for the normal case */ @@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) if (IS_ERR(tcred)) return -EACCES; - if (!tcred->cr_ops->crkey_timeout) { - set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags); - ret = 0; - goto out_put; - } - /* Test for the almost error case */ ret = tcred->cr_ops->crkey_timeout(tcred); if (ret != 0) { @@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); } -out_put: put_rpccred(tcred); return ret; } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index e64ae93d5b4f..23c8e7c39656 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_flags = 0; auth->au_ops = &authgss_ops; auth->au_flavor = flavor; + if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) + auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; atomic_set(&auth->au_count, 1); kref_init(&gss_auth->kref); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 65427492b1c9..60595835317a 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = { .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", + .datatouch = true, }, [2] = { .pseudoflavor = RPC_AUTH_GSS_KRB5P, .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_PRIVACY, .name = "krb5p", + .datatouch = true, }, }; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 7063d856a598..5fec3abbe19b 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ 
b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) } EXPORT_SYMBOL(gss_pseudoflavor_to_service); +bool +gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return gm->gm_pfs[i].datatouch; + } + return false; +} + char * gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) { diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index e085f5ae1548..1d281816f2bf 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1230,8 +1230,9 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, if (status) goto out; - dprintk("RPC: svcauth_gss: gss major status = %d\n", - ud.major_status); + dprintk("RPC: svcauth_gss: gss major status = %d " + "minor status = %d\n", + ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 8d9eb4d5ddd8..4d17376b2acb 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -115,6 +115,7 @@ static struct rpc_auth null_auth = { .au_cslack = NUL_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 9f65452b7cbc..a99278c984e8 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -228,6 +228,7 @@ static struct rpc_auth unix_auth = { .au_cslack = UNX_CALLSLACK, .au_rslack = NUL_REPLYSLACK, + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 553bf95f7003..4d8e11f94a35 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -362,7 +362,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) cache_purge(cd); spin_lock(&cache_list_lock); write_lock(&cd->hash_lock); - if (cd->entries || atomic_read(&cd->inuse)) { + if (cd->entries) { write_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); goto out; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2808d550d273..cb49898a5a58 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata) kfree(data); } -const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { +static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { .rpc_call_done = rpc_cb_add_xprt_done, .rpc_release = rpc_cb_add_xprt_release, }; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index fc48eca21fd2..84f98cbe31c3 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1386,7 +1386,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root, *gssd_dentry; - struct net *net = data; + struct net *net = get_net(sb->s_fs_info); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1419,7 +1419,6 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb); if (err) goto err_depopulate; - sb->s_fs_info = get_net(net); mutex_unlock(&sn->pipefs_sb_lock); return 0; @@ -1448,7 +1447,8 @@ static struct dentry * rpc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, 
rpc_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super); } static void rpc_kill_sb(struct super_block *sb) @@ -1468,9 +1468,9 @@ static void rpc_kill_sb(struct super_block *sb) RPC_PIPEFS_UMOUNT, sb); mutex_unlock(&sn->pipefs_sb_lock); - put_net(net); out: kill_litter_super(sb); + put_net(net); } static struct file_system_type rpc_pipe_fs_type = { diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index fcfd48d263f6..9ae588511aaf 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue; /* * rpciod-related stuff */ -struct workqueue_struct *rpciod_workqueue; +struct workqueue_struct *rpciod_workqueue __read_mostly; +struct workqueue_struct *xprtiod_workqueue __read_mostly; /* * Disable the timer for a given RPC task. Should be called with @@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); * lockless RPC_IS_QUEUED() test) before we've had a chance to test * the RPC_TASK_RUNNING flag. */ -static void rpc_make_runnable(struct rpc_task *task) +static void rpc_make_runnable(struct workqueue_struct *wq, + struct rpc_task *task) { bool need_wakeup = !rpc_test_and_set_running(task); @@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task) return; if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); - queue_work(rpciod_workqueue, &task->u.tk_work); + queue_work(wq, &task->u.tk_work); } else wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); } @@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); /** - * __rpc_do_wake_up_task - wake up a single rpc_task + * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task + * @wq: workqueue on which to run task * @queue: wait queue * @task: task to be woken up * * Caller must hold queue->lock, and have cleared the task queued flag. 
*/ -static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) +static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, + struct rpc_task *task) { dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", task->tk_pid, jiffies); @@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task __rpc_remove_wait_queue(queue, task); - rpc_make_runnable(task); + rpc_make_runnable(wq, task); dprintk("RPC: __rpc_wake_up_task done\n"); } @@ -436,16 +441,25 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task) { if (RPC_IS_QUEUED(task)) { smp_rmb(); if (task->tk_waitqueue == queue) - __rpc_do_wake_up_task(queue, task); + __rpc_do_wake_up_task_on_wq(wq, queue, task); } } /* + * Wake up a queued task while the queue lock is being held + */ +static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); +} + +/* * Wake up a task on a specific queue */ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) @@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue) /* * Wake up the first task on the wait queue. */ -struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, +struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, bool (*func)(struct rpc_task *, void *), void *data) { struct rpc_task *task = NULL; @@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, task = __rpc_find_next_queued(queue); if (task != NULL) { if (func(task, data)) - rpc_wake_up_task_queue_locked(queue, task); + rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); else task = NULL; } @@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, return task; } + +/* + * Wake up the first task on the wait queue. + */ +struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, + bool (*func)(struct rpc_task *, void *), void *data) +{ + return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data); +} EXPORT_SYMBOL_GPL(rpc_wake_up_first); static bool rpc_wake_up_next_func(struct rpc_task *task, void *data) @@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task) bool is_async = RPC_IS_ASYNC(task); rpc_set_active(task); - rpc_make_runnable(task); + rpc_make_runnable(rpciod_workqueue, task); if (!is_async) __rpc_execute(task); } @@ -1071,10 +1095,22 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. 
*/ dprintk("RPC: creating workqueue rpciod\n"); - /* Note: highpri because network receive is latency sensitive */ - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); + if (!wq) + goto out_failed; rpciod_workqueue = wq; - return rpciod_workqueue != NULL; + /* Note: highpri because network receive is latency sensitive */ + wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!wq) + goto free_rpciod; + xprtiod_workqueue = wq; + return 1; +free_rpciod: + wq = rpciod_workqueue; + rpciod_workqueue = NULL; + destroy_workqueue(wq); +out_failed: + return 0; } static void rpciod_stop(void) @@ -1088,6 +1124,9 @@ static void rpciod_stop(void) wq = rpciod_workqueue; rpciod_workqueue = NULL; destroy_workqueue(wq); + wq = xprtiod_workqueue; + xprtiod_workqueue = NULL; + destroy_workqueue(wq); } void diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index cc9852897395..c5b0cb4f4056 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); /* Encode reply */ - if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { + if (*statp == rpc_drop_reply || + test_bit(RQ_DROPME, &rqstp->rq_flags)) { if (procp->pc_release) procp->pc_release(rqstp, NULL, rqstp->rq_resp); goto dropit; } + if (*statp == rpc_autherr_badcred) { + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + goto err_bad_auth; + } if (*statp == rpc_success && (xdr = procp->pc_encode) && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 4f01f63102ee..c3f652395a80 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -21,6 +21,10 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static unsigned int svc_rpc_per_connection_limit __read_mostly; +module_param(svc_rpc_per_connection_limit, uint, 0644); + + static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); @@ -329,12 +333,45 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) } EXPORT_SYMBOL_GPL(svc_print_addr); +static bool svc_xprt_slots_in_range(struct svc_xprt *xprt) +{ + unsigned int limit = svc_rpc_per_connection_limit; + int nrqsts = atomic_read(&xprt->xpt_nr_rqsts); + + return limit == 0 || (nrqsts >= 0 && nrqsts < limit); +} + +static bool svc_xprt_reserve_slot(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + if (!test_bit(RQ_DATA, &rqstp->rq_flags)) { + if (!svc_xprt_slots_in_range(xprt)) + return false; + atomic_inc(&xprt->xpt_nr_rqsts); + set_bit(RQ_DATA, &rqstp->rq_flags); + } + return true; +} + +static void svc_xprt_release_slot(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) { + atomic_dec(&xprt->xpt_nr_rqsts); + svc_xprt_enqueue(xprt); + } +} + static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) { if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE))) return true; - if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) - return xprt->xpt_ops->xpo_has_wspace(xprt); + if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) { + if (xprt->xpt_ops->xpo_has_wspace(xprt) && + svc_xprt_slots_in_range(xprt)) + return true; + trace_svc_xprt_no_write_space(xprt); + return false; + } return false; } @@ -480,8 
+517,6 @@ void svc_reserve(struct svc_rqst *rqstp, int space) atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; - if (xprt->xpt_ops->xpo_adjust_wspace) - xprt->xpt_ops->xpo_adjust_wspace(xprt); svc_xprt_enqueue(xprt); } } @@ -512,8 +547,8 @@ static void svc_xprt_release(struct svc_rqst *rqstp) rqstp->rq_res.head[0].iov_len = 0; svc_reserve(rqstp, 0); + svc_xprt_release_slot(rqstp); rqstp->rq_xprt = NULL; - svc_xprt_put(xprt); } @@ -781,7 +816,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) svc_add_new_temp_xprt(serv, newxpt); else module_put(xprt->xpt_class->xcl_owner); - } else { + } else if (svc_xprt_reserve_slot(rqstp, xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", rqstp, rqstp->rq_pool->sp_id, xprt, @@ -871,6 +906,7 @@ EXPORT_SYMBOL_GPL(svc_recv); */ void svc_drop(struct svc_rqst *rqstp) { + trace_svc_drop(rqstp); dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); svc_xprt_release(rqstp); } @@ -1148,6 +1184,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) spin_unlock(&xprt->xpt_lock); dprintk("revisit canceled\n"); svc_xprt_put(xprt); + trace_svc_drop_deferred(dr); kfree(dr); return; } @@ -1205,6 +1242,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; + trace_svc_defer(rqstp); return &dr->handle; } @@ -1245,6 +1283,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) struct svc_deferred_req, handle.recent); list_del_init(&dr->handle.recent); + trace_svc_revisit_deferred(dr); } else clear_bit(XPT_DEFERRED, &xprt->xpt_flags); spin_unlock(&xprt->xpt_lock); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index dadfec66dbd8..57625f64efd5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -60,7 +60,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int flags); -static void svc_udp_data_ready(struct sock *); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); static void svc_sock_detach(struct svc_xprt *); @@ -398,48 +397,21 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp) return svc_port_is_privileged(svc_addr(rqstp)); } -static bool sunrpc_waitqueue_active(wait_queue_head_t *wq) -{ - if (!wq) - return false; - /* - * There should normally be a memory * barrier here--see - * wq_has_sleeper(). - * - * It appears that isn't currently necessary, though, basically - * because callers all appear to have sufficient memory barriers - * between the time the relevant change is made and the - * time they call these callbacks. - * - * The nfsd code itself doesn't actually explicitly wait on - * these waitqueues, but it may wait on them for example in - * sendpage() or sendmsg() calls. (And those may be the only - * places, since it it uses nonblocking reads.) - * - * Maybe we should add the memory barriers anyway, but these are - * hot paths so we'd need to be convinced there's no sigificant - * penalty. - */ - return waitqueue_active(wq); -} - /* * INET callback when data has been received on the socket. 
*/ -static void svc_udp_data_ready(struct sock *sk) +static void svc_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); if (svsk) { dprintk("svc: socket %p(inet %p), busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); + svsk->sk_odata(sk); + if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) + svc_xprt_enqueue(&svsk->sk_xprt); } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); } /* @@ -448,56 +420,22 @@ static void svc_udp_data_ready(struct sock *sk) static void svc_write_space(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); - wait_queue_head_t *wq = sk_sleep(sk); if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + svsk->sk_owspace(sk); svc_xprt_enqueue(&svsk->sk_xprt); } - - if (sunrpc_waitqueue_active(wq)) { - dprintk("RPC svc_write_space: someone sleeping on %p\n", - svsk); - wake_up_interruptible(wq); - } } static int svc_tcp_has_wspace(struct svc_xprt *xprt) { - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_xprt.xpt_server; - int required; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) return 1; - required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg; - if (sk_stream_wspace(svsk->sk_sk) >= required || - (sk_stream_min_wspace(svsk->sk_sk) == 0 && - atomic_read(&xprt->xpt_reserved) == 0)) - return 1; - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - return 0; -} - -static void svc_tcp_write_space(struct sock *sk) -{ - struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); - struct socket *sock = sk->sk_socket; - - if (!sk_stream_is_writeable(sk) || !sock) - return; - if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt)) - clear_bit(SOCK_NOSPACE, &sock->flags); - svc_write_space(sk); -} - -static void svc_tcp_adjust_wspace(struct svc_xprt *xprt) -{ - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - - if (svc_tcp_has_wspace(xprt)) - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); } /* @@ -746,7 +684,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); - svsk->sk_sk->sk_data_ready = svc_udp_data_ready; + svsk->sk_sk->sk_data_ready = svc_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; /* initialise setting must have enough space to @@ -786,11 +724,12 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) static void svc_tcp_listen_data_ready(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq; dprintk("svc: socket %p TCP (listen) state change %d\n", sk, sk->sk_state); + if (svsk) + svsk->sk_odata(sk); /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -808,10 +747,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) } else printk("svc: socket %p: no user data\n", sk); } - - wq = sk_sleep(sk); - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible_all(wq); } /* @@ -820,7 +755,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) static void 
svc_tcp_state_change(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", sk, sk->sk_state, sk->sk_user_data); @@ -828,26 +762,12 @@ static void svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible_all(wq); -} - -static void svc_tcp_data_ready(struct sock *sk) -{ - struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; - wait_queue_head_t *wq = sk_sleep(sk); - - dprintk("svc: socket %p TCP data ready (svsk %p)\n", - sk, sk->sk_user_data); - if (svsk) { - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); + svsk->sk_ostate(sk); + if (sk->sk_state != TCP_ESTABLISHED) { + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); + } } - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); } /* @@ -901,6 +821,11 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) dprintk("%s: connect from %s\n", serv->sv_name, __svc_print_addr(sin, buf, sizeof(buf))); + /* Reset the inherited callbacks before calling svc_setup_socket */ + newsock->sk->sk_state_change = svsk->sk_ostate; + newsock->sk->sk_data_ready = svsk->sk_odata; + newsock->sk->sk_write_space = svsk->sk_owspace; + /* make sure that a write doesn't block forever when * low on memory */ @@ -1317,7 +1242,6 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, - .xpo_adjust_wspace = svc_tcp_adjust_wspace, }; static struct svc_xprt_class svc_tcp_class = { @@ -1357,8 +1281,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) } else { dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; - sk->sk_data_ready = svc_tcp_data_ready; - sk->sk_write_space = svc_tcp_write_space; + sk->sk_data_ready = svc_data_ready; + sk->sk_write_space = svc_write_space; svsk->sk_reclen = 0; svsk->sk_tcplen = 0; @@ -1368,8 +1292,13 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - if (sk->sk_state != TCP_ESTABLISHED) + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + break; + default: set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + } } } @@ -1428,17 +1357,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else { - /* initialise setting must have enough space to - * receive and respond to one request. 
- */ - svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, - 4 * serv->sv_max_mesg); + else svc_tcp_init(svsk, serv); - } - dprintk("svc: svc_setup_socket created %p (inet %p)\n", - svsk, svsk->sk_sk); + dprintk("svc: svc_setup_socket created %p (inet %p), " + "listen %d close %d\n", + svsk, svsk->sk_sk, + test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); return svsk; } @@ -1606,18 +1532,16 @@ static void svc_sock_detach(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sock *sk = svsk->sk_sk; - wait_queue_head_t *wq; dprintk("svc: svc_sock_detach(%p)\n", svsk); /* put back the old socket callbacks */ + lock_sock(sk); sk->sk_state_change = svsk->sk_ostate; sk->sk_data_ready = svsk->sk_odata; sk->sk_write_space = svsk->sk_owspace; - - wq = sk_sleep(sk); - if (sunrpc_waitqueue_active(wq)) - wake_up_interruptible(wq); + sk->sk_user_data = NULL; + release_sock(sk); } /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 216a1385718a..8313960cac52 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_atomic(); } else - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); } /* @@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, + __xprt_lock_write_func, xprt)) return; xprt_clear_locked(xprt); } @@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) return; if (RPCXPRT_CONGESTED(xprt)) goto out_unlock; - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, + __xprt_lock_write_cong_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); @@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) set_bit(XPRT_CLOSE_WAIT, &xprt->state); /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); xprt_wake_pending_tasks(xprt, -EAGAIN); out: spin_unlock_bh(&xprt->transport_lock); @@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - queue_work(rpciod_workqueue, &xprt->task_cleanup); + queue_work(xprtiod_workqueue, &xprt->task_cleanup); return; out_abort: spin_unlock(&xprt->transport_lock); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index e7fd76975d86..66c9d63f4797 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi, xprt_switch_find_xprt_t find_next) { struct 
rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); - struct list_head *head; if (xps == NULL) return NULL; - head = &xps->xps_xprt_list; - if (xps->xps_nxprts < 2) - return xprt_switch_find_first_entry(head); - return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next); + return xprt_switch_set_next_cursor(&xps->xps_xprt_list, + &xpi->xpi_cursor, + find_next); } static diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index dc9f3b513a05..ef19fa42c50f 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ - fmr_ops.o frwr_ops.o physical_ops.o \ + fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 6326ebe8b595..21cb3b150b37 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -19,13 +19,6 @@ * verb (fmr_op_unmap). */ -/* Transport recovery - * - * After a transport reconnect, fmr_op_map re-uses the MR already - * allocated for the RPC, but generates a fresh rkey then maps the - * MR again. This process is synchronous. - */ - #include "xprt_rdma.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -35,62 +28,132 @@ /* Maximum scatter/gather per FMR */ #define RPCRDMA_MAX_FMR_SGES (64) -static struct workqueue_struct *fmr_recovery_wq; - -#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) +/* Access mode of externally registered pages */ +enum { + RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ, +}; -int -fmr_alloc_recovery_wq(void) +bool +fmr_is_supported(struct rpcrdma_ia *ia) { - fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); - return !fmr_recovery_wq ? -ENOMEM : 0; + if (!ia->ri_device->alloc_fmr) { + pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; + } + return true; } -void -fmr_destroy_recovery_wq(void) +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) { - struct workqueue_struct *wq; + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; - if (!fmr_recovery_wq) - return; + mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mw->fmr.fm_physaddrs) + goto out_free; - wq = fmr_recovery_wq; - fmr_recovery_wq = NULL; - destroy_workqueue(wq); + mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mw->mw_sg), GFP_KERNEL); + if (!mw->mw_sg) + goto out_free; + + sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); + + mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mw->fmr.fm_mr)) + goto out_fmr_err; + + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mw->fmr.fm_mr)); + +out_free: + kfree(mw->mw_sg); + kfree(mw->fmr.fm_physaddrs); + return -ENOMEM; } static int __fmr_unmap(struct rpcrdma_mw *mw) { LIST_HEAD(l); + int rc; - list_add(&mw->fmr.fmr->list, &l); - return ib_unmap_fmr(&l); + list_add(&mw->fmr.fm_mr->list, &l); + rc = ib_unmap_fmr(&l); + list_del_init(&mw->fmr.fm_mr->list); + return rc; } -/* Deferred reset of a single FMR. Generate a fresh rkey by - * replacing the MR. There's no recovery if this fails. 
- */ static void -__fmr_recovery_worker(struct work_struct *work) +fmr_op_release_mr(struct rpcrdma_mw *r) { - struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, - mw_work); - struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + LIST_HEAD(unmap_list); + int rc; - __fmr_unmap(mw); - rpcrdma_put_mw(r_xprt, mw); - return; + /* Ensure MW is not on any rl_registered list */ + if (!list_empty(&r->mw_list)) + list_del(&r->mw_list); + + kfree(r->fmr.fm_physaddrs); + kfree(r->mw_sg); + + /* In case this one was left mapped, try to unmap it + * to prevent dealloc_fmr from failing with EBUSY + */ + rc = __fmr_unmap(r); + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + r, rc); + + rc = ib_dealloc_fmr(r->fmr.fm_mr); + if (rc) + pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", + r, rc); + + kfree(r); } -/* A broken MR was discovered in a context that can't sleep. - * Defer recovery to the recovery worker. +/* Reset of a single FMR. */ static void -__fmr_queue_recovery(struct rpcrdma_mw *mw) +fmr_op_recover_mr(struct rpcrdma_mw *mw) { - INIT_WORK(&mw->mw_work, __fmr_recovery_worker); - queue_work(fmr_recovery_wq, &mw->mw_work); + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + int rc; + + /* ORDER: invalidate first */ + rc = __fmr_unmap(mw); + + /* ORDER: then DMA unmap */ + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (rc) + goto out_release; + + rpcrdma_put_mw(r_xprt, mw); + r_xprt->rx_stats.mrs_recovered++; + return; + +out_release: + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); + r_xprt->rx_stats.mrs_orphaned++; + + spin_lock(&r_xprt->rx_buf.rb_mwlock); + list_del(&mw->mw_all); + spin_unlock(&r_xprt->rx_buf.rb_mwlock); + + fmr_op_release_mr(mw); } static int @@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); } -static int -fmr_op_init(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - struct rpcrdma_mw *r; - int i, rc; - - spin_lock_init(&buf->rb_mwlock); - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - - i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1); - i += 2; /* head + tail */ - i *= buf->rb_max_requests; /* one set for each RPC slot */ - dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); - - rc = -ENOMEM; - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (!r) - goto out; - - r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * - sizeof(u64), GFP_KERNEL); - if (!r->fmr.physaddrs) - goto out_free; - - r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->fmr.fmr)) - goto out_fmr_err; - - r->mw_xprt = r_xprt; - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_fmr_err: - rc = PTR_ERR(r->fmr.fmr); - dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); - kfree(r->fmr.physaddrs); -out_free: - kfree(r); -out: - return rc; -} - /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. 
*/ static int fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing) + int nsegs, bool writing, struct rpcrdma_mw **out) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct ib_device *device = ia->ri_device; - enum dma_data_direction direction = rpcrdma_data_dir(writing); struct rpcrdma_mr_seg *seg1 = seg; int len, pageoff, i, rc; struct rpcrdma_mw *mw; + u64 *dma_pages; - mw = seg1->rl_mw; - seg1->rl_mw = NULL; - if (!mw) { - mw = rpcrdma_get_mw(r_xprt); - if (!mw) - return -ENOMEM; - } else { - /* this is a retransmit; generate a fresh rkey */ - rc = __fmr_unmap(mw); - if (rc) - return rc; - } + mw = rpcrdma_get_mw(r_xprt); + if (!mw) + return -ENOBUFS; pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (nsegs > RPCRDMA_MAX_FMR_SGES) nsegs = RPCRDMA_MAX_FMR_SGES; for (i = 0; i < nsegs;) { - rpcrdma_map_one(device, seg, direction); - mw->fmr.physaddrs[i] = seg->mr_dma; + if (seg->mr_page) + sg_set_page(&mw->mw_sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); + else + sg_set_buf(&mw->mw_sg[i], seg->mr_offset, + seg->mr_len); len += seg->mr_len; ++seg; ++i; @@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - - rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, - i, seg1->mr_dma); + mw->mw_nents = i; + mw->mw_dir = rpcrdma_data_dir(writing); + if (i == 0) + goto out_dmamap_err; + + if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir)) + goto out_dmamap_err; + + for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) + dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); + rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, + dma_pages[0]); if (rc) goto out_maperr; - seg1->rl_mw = mw; - seg1->mr_rkey = mw->fmr.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - return i; + mw->mw_handle = mw->fmr.fm_mr->rkey; + mw->mw_length = len; + mw->mw_offset = dma_pages[0] + pageoff; -out_maperr: - dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", - __func__, len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(device, --seg); - return rc; -} + *out = mw; + return mw->mw_nents; -static void -__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) -{ - struct ib_device *device = r_xprt->rx_ia.ri_device; - int nsegs = seg->mr_nsegs; +out_dmamap_err: + pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", + mw->mw_sg, mw->mw_nents); + rpcrdma_defer_mr_recovery(mw); + return -EIO; - while (nsegs--) - rpcrdma_unmap_one(device, seg++); +out_maperr: + pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + len, (unsigned long long)dma_pages[0], + pageoff, mw->mw_nents, rc); + rpcrdma_defer_mr_recovery(mw); + return -EIO; } /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. + * + * Caller ensures that req->rl_registered is not empty. 
*/ static void fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct rpcrdma_mr_seg *seg; - unsigned int i, nchunks; - struct rpcrdma_mw *mw; + struct rpcrdma_mw *mw, *tmp; LIST_HEAD(unmap_list); int rc; @@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* ORDER: Invalidate all of the req's MRs first * * ib_unmap_fmr() is slow, so use a single call instead - * of one call per mapped MR. + * of one call per mapped FMR. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - - list_add(&mw->fmr.fmr->list, &unmap_list); - - i += seg->mr_nsegs; - } + list_for_each_entry(mw, &req->rl_registered, mw_list) + list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); rc = ib_unmap_fmr(&unmap_list); if (rc) - pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); + goto out_reset; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->mw_list); + list_del_init(&mw->fmr.fm_mr->list); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + rpcrdma_put_mw(r_xprt, mw); + } - __fmr_dma_unmap(r_xprt, seg); - rpcrdma_put_mw(r_xprt, seg->rl_mw); + return; - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } +out_reset: + pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - req->rl_nchunks = 0; + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->fmr.fm_mr->list); + fmr_op_recover_mr(mw); + } } /* Use a slow, safe mechanism to invalidate all memory regions * that were registered for "req". - * - * In the asynchronous case, DMA unmapping occurs first here - * because the rpcrdma_mr_seg is released immediately after this - * call. It's contents won't be available in __fmr_dma_unmap later. - * FIXME. 
*/ static void fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, bool sync) { - struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - unsigned int i; - - for (i = 0; req->rl_nchunks; req->rl_nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - - if (sync) { - /* ORDER */ - __fmr_unmap(mw); - __fmr_dma_unmap(r_xprt, seg); - rpcrdma_put_mw(r_xprt, mw); - } else { - __fmr_dma_unmap(r_xprt, seg); - __fmr_queue_recovery(mw); - } - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } -} - -static void -fmr_op_destroy(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - kfree(r->fmr.physaddrs); - rc = ib_dealloc_fmr(r->fmr.fmr); - if (rc) - dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", - __func__, rc); + while (!list_empty(&req->rl_registered)) { + mw = list_first_entry(&req->rl_registered, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); - kfree(r); + if (sync) + fmr_op_recover_mr(mw); + else + rpcrdma_defer_mr_recovery(mw); } } @@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap_safe = fmr_op_unmap_safe, + .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, - .ro_init = fmr_op_init, - .ro_destroy = fmr_op_destroy, + .ro_init_mr = fmr_op_init_mr, + .ro_release_mr = fmr_op_release_mr, .ro_displayname = "fmr", }; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c0947544babe..892b5e1d9b09 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -73,29 +73,71 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static struct workqueue_struct *frwr_recovery_wq; - -#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) +bool +frwr_is_supported(struct rpcrdma_ia *ia) +{ + struct ib_device_attr *attrs = &ia->ri_device->attrs; + + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + goto out_not_supported; + if (attrs->max_fast_reg_page_list_len == 0) + goto out_not_supported; + return true; + +out_not_supported: + pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; +} -int -frwr_alloc_recovery_wq(void) +static int +frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) { - frwr_recovery_wq = alloc_workqueue("frwr_recovery", - FRWR_RECOVERY_WQ_FLAGS, 0); - return !frwr_recovery_wq ? 
-ENOMEM : 0; + unsigned int depth = ia->ri_max_frmr_depth; + struct rpcrdma_frmr *f = &r->frmr; + int rc; + + f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + + r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); + if (!r->mw_sg) + goto out_list_err; + + sg_init_table(r->mw_sg, depth); + init_completion(&f->fr_linv_done); + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = -ENOMEM; + dprintk("RPC: %s: sg allocation failure\n", + __func__); + ib_dereg_mr(f->fr_mr); + return rc; } -void -frwr_destroy_recovery_wq(void) +static void +frwr_op_release_mr(struct rpcrdma_mw *r) { - struct workqueue_struct *wq; + int rc; - if (!frwr_recovery_wq) - return; + /* Ensure MW is not on any rl_registered list */ + if (!list_empty(&r->mw_list)) + list_del(&r->mw_list); - wq = frwr_recovery_wq; - frwr_recovery_wq = NULL; - destroy_workqueue(wq); + rc = ib_dereg_mr(r->frmr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + r, rc); + kfree(r->mw_sg); + kfree(r); } static int @@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) return 0; } -static void -__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_frmr *f = &mw->frmr; - int rc; - - rc = __frwr_reset_mr(ia, mw); - ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir); - if (rc) - return; - - rpcrdma_put_mw(r_xprt, mw); -} - -/* Deferred reset of a single FRMR. Generate a fresh rkey by - * replacing the MR. +/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. * * There's no recovery if this fails. The FRMR is abandoned, but * remains in rb_all. It will be cleaned up when the transport is * destroyed. */ static void -__frwr_recovery_worker(struct work_struct *work) -{ - struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, - mw_work); - - __frwr_reset_and_unmap(r->mw_xprt, r); - return; -} - -/* A broken MR was discovered in a context that can't sleep. - * Defer recovery to the recovery worker. 
- */ -static void -__frwr_queue_recovery(struct rpcrdma_mw *r) -{ - INIT_WORK(&r->mw_work, __frwr_recovery_worker); - queue_work(frwr_recovery_wq, &r->mw_work); -} - -static int -__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, - unsigned int depth) +frwr_op_recover_mr(struct rpcrdma_mw *mw) { - struct rpcrdma_frmr *f = &r->frmr; + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; - f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); - if (IS_ERR(f->fr_mr)) - goto out_mr_err; - - f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL); - if (!f->fr_sg) - goto out_list_err; - - sg_init_table(f->fr_sg, depth); - - init_completion(&f->fr_linv_done); - - return 0; + rc = __frwr_reset_mr(ia, mw); + ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (rc) + goto out_release; -out_mr_err: - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_mr status %i\n", - __func__, rc); - return rc; + rpcrdma_put_mw(r_xprt, mw); + r_xprt->rx_stats.mrs_recovered++; + return; -out_list_err: - rc = -ENOMEM; - dprintk("RPC: %s: sg allocation failure\n", - __func__); - ib_dereg_mr(f->fr_mr); - return rc; -} +out_release: + pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); + r_xprt->rx_stats.mrs_orphaned++; -static void -__frwr_release(struct rpcrdma_mw *r) -{ - int rc; + spin_lock(&r_xprt->rx_buf.rb_mwlock); + list_del(&mw->mw_all); + spin_unlock(&r_xprt->rx_buf.rb_mwlock); - rc = ib_dereg_mr(r->frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr status %i\n", - __func__, rc); - kfree(r->frmr.fr_sg); + frwr_op_release_mr(mw); } static int @@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) complete_all(&frmr->fr_linv_done); } -static int -frwr_op_init(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct ib_device *device = r_xprt->rx_ia.ri_device; - unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - int i; - - spin_lock_init(&buf->rb_mwlock); - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - - i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1); - i += 2; /* head + tail */ - i *= buf->rb_max_requests; /* one set for each RPC slot */ - dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); - - while (i--) { - struct rpcrdma_mw *r; - int rc; - - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (!r) - return -ENOMEM; - - rc = __frwr_init(r, pd, device, depth); - if (rc) { - kfree(r); - return rc; - } - - r->mw_xprt = r_xprt; - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; -} - -/* Post a FAST_REG Work Request to register a memory region +/* Post a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. 
*/ static int frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing) + int nsegs, bool writing, struct rpcrdma_mw **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct ib_device *device = ia->ri_device; - enum dma_data_direction direction = rpcrdma_data_dir(writing); - struct rpcrdma_mr_seg *seg1 = seg; struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; @@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int rc, i, n, dma_nents; u8 key; - mw = seg1->rl_mw; - seg1->rl_mw = NULL; + mw = NULL; do { if (mw) - __frwr_queue_recovery(mw); + rpcrdma_defer_mr_recovery(mw); mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOMEM; + return -ENOBUFS; } while (mw->frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->frmr; frmr->fr_state = FRMR_IS_VALID; @@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; - for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&frmr->fr_sg[i], + sg_set_page(&mw->mw_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, + sg_set_buf(&mw->mw_sg[i], seg->mr_offset, seg->mr_len); ++seg; @@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - frmr->fr_nents = i; - frmr->fr_dir = direction; - - dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); - if (!dma_nents) { - pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", - __func__, frmr->fr_sg, frmr->fr_nents); - return -ENOMEM; - } + mw->mw_nents = i; + mw->mw_dir = rpcrdma_data_dir(writing); + if (i == 0) + goto out_dmamap_err; - n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); - if (unlikely(n != frmr->fr_nents)) { - pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", - __func__, frmr->fr_mr, n, frmr->fr_nents); - rc = n < 0 ? 
n : -EINVAL; - goto out_senderr; - } + dma_nents = ib_dma_map_sg(ia->ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (!dma_nents) + goto out_dmamap_err; + + n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); + if (unlikely(n != mw->mw_nents)) + goto out_mapmr_err; dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", - __func__, mw, frmr->fr_nents, mr->length); + __func__, mw, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); @@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (rc) goto out_senderr; - seg1->rl_mw = mw; - seg1->mr_rkey = mr->rkey; - seg1->mr_base = mr->iova; - seg1->mr_nsegs = frmr->fr_nents; - seg1->mr_len = mr->length; + mw->mw_handle = mr->rkey; + mw->mw_length = mr->length; + mw->mw_offset = mr->iova; + + *out = mw; + return mw->mw_nents; - return frmr->fr_nents; +out_dmamap_err: + pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", + mw->mw_sg, mw->mw_nents); + rpcrdma_defer_mr_recovery(mw); + return -EIO; + +out_mapmr_err: + pr_err("rpcrdma: failed to map mr %p (%u/%u)\n", + frmr->fr_mr, n, mw->mw_nents); + rpcrdma_defer_mr_recovery(mw); + return -EIO; out_senderr: - dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - __frwr_queue_recovery(mw); - return rc; + pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); + rpcrdma_defer_mr_recovery(mw); + return -ENOTCONN; } static struct ib_send_wr * -__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) +__frwr_prepare_linv_wr(struct rpcrdma_mw *mw) { - struct rpcrdma_mw *mw = seg->rl_mw; struct rpcrdma_frmr *f = &mw->frmr; struct ib_send_wr *invalidate_wr; @@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. + * + * Caller ensures that req->rl_registered is not empty. */ static void frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg; - unsigned int i, nchunks; + struct rpcrdma_mw *mw, *tmp; struct rpcrdma_frmr *f; - struct rpcrdma_mw *mw; int rc; dprintk("RPC: %s: req %p\n", __func__, req); @@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ + f = NULL; invalidate_wrs = pos = prev = NULL; - seg = NULL; - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - - pos = __frwr_prepare_linv_wr(seg); + list_for_each_entry(mw, &req->rl_registered, mw_list) { + pos = __frwr_prepare_linv_wr(mw); if (!invalidate_wrs) invalidate_wrs = pos; else prev->next = pos; prev = pos; - - i += seg->mr_nsegs; + f = &mw->frmr; } - f = &seg->rl_mw->frmr; /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain @@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * them to the free MW list. 
*/ unmap: - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; - seg->rl_mw = NULL; - - ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, - f->fr_dir); + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + list_del_init(&mw->mw_list); + ib_dma_unmap_sg(ia->ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; } - - req->rl_nchunks = 0; return; reset_mrs: - pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); + rdma_disconnect(ia->ri_id); /* Find and reset the MRs in the LOCAL_INV WRs that did not * get posted. This is synchronous, and slow. */ - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; + list_for_each_entry(mw, &req->rl_registered, mw_list) { f = &mw->frmr; - if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { __frwr_reset_mr(ia, mw); bad_wr = bad_wr->next; } - - i += seg->mr_nsegs; } goto unmap; } @@ -621,38 +552,17 @@ static void frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, bool sync) { - struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - unsigned int i; - for (i = 0; req->rl_nchunks; req->rl_nchunks--) { - seg = &req->rl_segments[i]; - mw = seg->rl_mw; + while (!list_empty(&req->rl_registered)) { + mw = list_first_entry(&req->rl_registered, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); if (sync) - __frwr_reset_and_unmap(r_xprt, mw); + frwr_op_recover_mr(mw); else - __frwr_queue_recovery(mw); - - i += seg->mr_nsegs; - seg->mr_nsegs = 0; - seg->rl_mw = NULL; - } -} - -static void -frwr_op_destroy(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - - /* Ensure stale MWs for "buf" are no longer in flight */ - flush_workqueue(frwr_recovery_wq); - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - __frwr_release(r); - kfree(r); + rpcrdma_defer_mr_recovery(mw); } } @@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap_safe = frwr_op_unmap_safe, + .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, - .ro_init = frwr_op_init, - .ro_destroy = frwr_op_destroy, + .ro_init_mr = frwr_op_init_mr, + .ro_release_mr = frwr_op_release_mr, .ro_displayname = "frwr", }; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c deleted file mode 100644 index 3750596cc432..000000000000 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2015 Oracle. All rights reserved. - * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. - */ - -/* No-op chunk preparation. All client memory is pre-registered. - * Sometimes referred to as ALLPHYSICAL mode. - * - * Physical registration is simple because all client memory is - * pre-registered and never deregistered. This mode is good for - * adapter bring up, but is considered not safe: the server is - * trusted not to abuse its access to client memory not involved - * in RDMA I/O. 
- */ - -#include "xprt_rdma.h" - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - -static int -physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata) -{ - struct ib_mr *mr; - - /* Obtain an rkey to use for RPC data payloads. - */ - mr = ib_get_dma_mr(ia->ri_pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - if (IS_ERR(mr)) { - pr_err("%s: ib_get_dma_mr for failed with %lX\n", - __func__, PTR_ERR(mr)); - return -ENOMEM; - } - ia->ri_dma_mr = mr; - - rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int, - RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS)); - return 0; -} - -/* PHYSICAL memory registration conveys one page per chunk segment. - */ -static size_t -physical_op_maxpages(struct rpcrdma_xprt *r_xprt) -{ - return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS); -} - -static int -physical_op_init(struct rpcrdma_xprt *r_xprt) -{ - return 0; -} - -/* The client's physical memory is already exposed for - * remote access via RDMA READ or RDMA WRITE. - */ -static int -physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); - seg->mr_rkey = ia->ri_dma_mr->rkey; - seg->mr_base = seg->mr_dma; - return 1; -} - -/* DMA unmap all memory regions that were mapped for "req". - */ -static void -physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) -{ - struct ib_device *device = r_xprt->rx_ia.ri_device; - unsigned int i; - - for (i = 0; req->rl_nchunks; --req->rl_nchunks) - rpcrdma_unmap_one(device, &req->rl_segments[i++]); -} - -/* Use a slow, safe mechanism to invalidate all memory regions - * that were registered for "req". - * - * For physical memory registration, there is no good way to - * fence a single MR that has been advertised to the server. The - * client has already handed the server an R_key that cannot be - * invalidated and is shared by all MRs on this connection. - * Tearing down the PD might be the only safe choice, but it's - * not clear that a freshly acquired DMA R_key would be different - * than the one used by the PD that was just destroyed. - * FIXME. - */ -static void -physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - bool sync) -{ - physical_op_unmap_sync(r_xprt, req); -} - -static void -physical_op_destroy(struct rpcrdma_buffer *buf) -{ -} - -const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { - .ro_map = physical_op_map, - .ro_unmap_sync = physical_op_unmap_sync, - .ro_unmap_safe = physical_op_unmap_safe, - .ro_open = physical_op_open, - .ro_maxpages = physical_op_maxpages, - .ro_init = physical_op_init, - .ro_destroy = physical_op_destroy, - .ro_displayname = "physical", -}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 35a81096e83d..a47f170b20ef 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf) * MR when they can. 
*/ static int -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, - int n, int nsegs) +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) { size_t page_offset; u32 remaining; @@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, base = vec->iov_base; page_offset = offset_in_page(base); remaining = vec->iov_len; - while (remaining && n < nsegs) { + while (remaining && n < RPCRDMA_MAX_SEGS) { seg[n].mr_page = NULL; seg[n].mr_offset = base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); @@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, static int rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) { - int len, n = 0, p; - int page_base; + int len, n, p, page_base; struct page **ppages; + n = 0; if (pos == 0) { - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); - if (n == nsegs) - return -EIO; + n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); + if (n == RPCRDMA_MAX_SEGS) + goto out_overflow; } len = xdrbuf->page_len; ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = xdrbuf->page_base & ~PAGE_MASK; p = 0; - while (len && n < nsegs) { + while (len && n < RPCRDMA_MAX_SEGS) { if (!ppages[p]) { /* alloc the pagelist for receiving buffer */ ppages[p] = alloc_page(GFP_ATOMIC); if (!ppages[p]) - return -ENOMEM; + return -EAGAIN; } seg[n].mr_page = ppages[p]; seg[n].mr_offset = (void *)(unsigned long) page_base; seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); if (seg[n].mr_len > PAGE_SIZE) - return -EIO; + goto out_overflow; len -= seg[n].mr_len; ++n; ++p; @@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, } /* Message overflows the seg array */ - if (len && n == nsegs) - return -EIO; + if (len && n == RPCRDMA_MAX_SEGS) + goto out_overflow; /* When encoding the read list, the tail is always sent inline */ if (type == rpcrdma_readch) @@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, * xdr pad bytes, saving the server an RDMA operation. */ if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) return n; - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); - if (n == nsegs) - return -EIO; + n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); + if (n == RPCRDMA_MAX_SEGS) + goto out_overflow; } return n; + +out_overflow: + pr_err("rpcrdma: segment array overflow\n"); + return -EIO; } static inline __be32 * -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) { - *iptr++ = cpu_to_be32(seg->mr_rkey); - *iptr++ = cpu_to_be32(seg->mr_len); - return xdr_encode_hyper(iptr, seg->mr_base); + *iptr++ = cpu_to_be32(mw->mw_handle); + *iptr++ = cpu_to_be32(mw->mw_length); + return xdr_encode_hyper(iptr, mw->mw_offset); } /* XDR-encode the Read list. 
Supports encoding a list of read @@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype rtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; unsigned int pos; int n, nsegs; @@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) pos = 0; - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + false, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); *iptr++ = xdr_one; /* item present */ @@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, * have the same "position". */ *iptr++ = cpu_to_be32(pos); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: read segment pos %u " - "%d@0x%016llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, pos, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? "more" : "last"); r_xprt->rx_stats.read_chunk_count++; - req->rl_nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Finish Read list */ *iptr++ = xdr_zero; /* Next item not present */ @@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype wtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; int n, nsegs, nchunks; __be32 *segcount; @@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return iptr; } + seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: write segment " - "%d@0x016%llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? 
"more" : "last"); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; - req->rl_nchunks++; nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); @@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpc_rqst *rqst, __be32 *iptr, enum rpcrdma_chunktype wtype) { - struct rpcrdma_mr_seg *seg = req->rl_nextseg; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; int n, nsegs, nchunks; __be32 *segcount; @@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, return iptr; } - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, - RPCRDMA_MAX_SEGS - req->rl_nchunks); + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); - if (n <= 0) + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (n < 0) return ERR_PTR(n); + list_add(&mw->mw_list, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, seg); + iptr = xdr_encode_rdma_segment(iptr, mw); - dprintk("RPC: %5u %s: reply segment " - "%d@0x%016llx:0x%08x (%s)\n", + dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); + mw->mw_length, (unsigned long long)mw->mw_offset, + mw->mw_handle, n < nsegs ? "more" : "last"); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; - req->rl_nchunks++; nchunks++; seg += n; nsegs -= n; } while (nsegs); - req->rl_nextseg = seg; /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); @@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; + bool ddp_allowed; ssize_t hdrlen; size_t rpclen; __be32 *iptr; @@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); headerp->rm_type = rdma_msg; + /* When the ULP employs a GSS flavor that guarantees integrity + * or privacy, direct data placement of individual data items + * is not allowed. + */ + ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & + RPCAUTH_AUTH_DATATOUCH); + /* * Chunks needed for results? * @@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) */ if (rpcrdma_results_inline(r_xprt, rqst)) wtype = rpcrdma_noch; - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) wtype = rpcrdma_writech; else wtype = rpcrdma_replych; @@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) rtype = rpcrdma_noch; rpcrdma_inline_pullup(rqst); rpclen = rqst->rq_svec[0].iov_len; - } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; rpclen = rqst->rq_svec[0].iov_len; rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); @@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * send a Call message with a Position Zero Read chunk and a * regular Read chunk at the same time. 
*/ - req->rl_nchunks = 0; - req->rl_nextseg = req->rl_segments; iptr = headerp->rm_body.rm_chunks; iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); if (IS_ERR(iptr)) @@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) out_overflow: pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); - /* Terminate this RPC. Chunks registered above will be - * released by xprt_release -> xprt_rmda_free . - */ - return -EIO; + iptr = ERR_PTR(-EIO); out_unmap: r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); @@ -705,15 +711,13 @@ out_unmap: * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) */ static int -rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) +rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) { unsigned int i, total_len; struct rpcrdma_write_chunk *cur_wchunk; char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); i = be32_to_cpu(**iptrp); - if (i > max) - return -1; cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); total_len = 0; while (i--) { @@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b return total_len; } -/* - * Scatter inline received data back into provided iov's. +/** + * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs + * @rqst: controlling RPC request + * @srcp: points to RPC message payload in receive buffer + * @copy_len: remaining length of receive buffer content + * @pad: Write chunk pad bytes needed (zero for pure inline) + * + * The upper layer has set the maximum number of bytes it can + * receive in each component of rq_rcv_buf. These values are set in + * the head.iov_len, page_len, tail.iov_len, and buflen fields. + * + * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in + * many cases this function simply updates iov_base pointers in + * rq_rcv_buf to point directly to the received reply data, to + * avoid copying reply data. + * + * Returns the count of bytes which had to be memcopied. */ -static void +static unsigned long rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { - int i, npages, curlen, olen; + unsigned long fixup_copy_count; + int i, npages, curlen; char *destp; struct page **ppages; int page_base; + /* The head iovec is redirected to the RPC reply message + * in the receive buffer, to avoid a memcopy. + */ + rqst->rq_rcv_buf.head[0].iov_base = srcp; + rqst->rq_private_buf.head[0].iov_base = srcp; + + /* The contents of the receive buffer that follow + * head.iov_len bytes are copied into the page list. 
+ */ curlen = rqst->rq_rcv_buf.head[0].iov_len; - if (curlen > copy_len) { /* write chunk header fixup */ + if (curlen > copy_len) curlen = copy_len; - rqst->rq_rcv_buf.head[0].iov_len = curlen; - } - dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", __func__, srcp, copy_len, curlen); - - /* Shift pointer for first receive segment only */ - rqst->rq_rcv_buf.head[0].iov_base = srcp; srcp += curlen; copy_len -= curlen; - olen = copy_len; - i = 0; - rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; page_base = rqst->rq_rcv_buf.page_base; ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); page_base &= ~PAGE_MASK; - + fixup_copy_count = 0; if (copy_len && rqst->rq_rcv_buf.page_len) { - npages = PAGE_ALIGN(page_base + - rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; - for (; i < npages; i++) { + int pagelist_len; + + pagelist_len = rqst->rq_rcv_buf.page_len; + if (pagelist_len > copy_len) + pagelist_len = copy_len; + npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { curlen = PAGE_SIZE - page_base; - if (curlen > copy_len) - curlen = copy_len; + if (curlen > pagelist_len) + curlen = pagelist_len; + dprintk("RPC: %s: page %d" " srcp 0x%p len %d curlen %d\n", __func__, i, srcp, copy_len, curlen); @@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) kunmap_atomic(destp); srcp += curlen; copy_len -= curlen; - if (copy_len == 0) + fixup_copy_count += curlen; + pagelist_len -= curlen; + if (!pagelist_len) break; page_base = 0; } - } - if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { - curlen = copy_len; - if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) - curlen = rqst->rq_rcv_buf.tail[0].iov_len; - if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) - memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); - dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", - __func__, srcp, copy_len, curlen); - rqst->rq_rcv_buf.tail[0].iov_len = curlen; - copy_len -= curlen; ++i; - } else - rqst->rq_rcv_buf.tail[0].iov_len = 0; - - if (pad) { - /* implicit padding on terminal chunk */ - unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; - while (pad--) - p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; + /* Implicit padding for the last segment in a Write + * chunk is inserted inline at the front of the tail + * iovec. The upper layer ignores the content of + * the pad. Simply ensure inline content in the tail + * that follows the Write chunk is properly aligned. + */ + if (pad) + srcp -= pad; } - if (copy_len) - dprintk("RPC: %s: %d bytes in" - " %d extra segments (%d lost)\n", - __func__, olen, i, copy_len); + /* The tail iovec is redirected to the remaining data + * in the receive buffer, to avoid a memcopy. 
+ */ + if (copy_len || pad) { + rqst->rq_rcv_buf.tail[0].iov_base = srcp; + rqst->rq_private_buf.tail[0].iov_base = srcp; + } - /* TBD avoid a warning from call_decode() */ - rqst->rq_private_buf = rqst->rq_rcv_buf; + return fixup_copy_count; } void @@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) (headerp->rm_body.rm_chunks[1] == xdr_zero && headerp->rm_body.rm_chunks[2] != xdr_zero) || (headerp->rm_body.rm_chunks[1] != xdr_zero && - req->rl_nchunks == 0)) + list_empty(&req->rl_registered))) goto badheader; if (headerp->rm_body.rm_chunks[1] != xdr_zero) { /* count any expected write chunks in read reply */ /* start at write chunk array count */ iptr = &headerp->rm_body.rm_chunks[2]; - rdmalen = rpcrdma_count_chunks(rep, - req->rl_nchunks, 1, &iptr); + rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); /* check for validity, and no reply chunk after */ if (rdmalen < 0 || *iptr++ != xdr_zero) goto badheader; @@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) rep->rr_len -= RPCRDMA_HDRLEN_MIN; status = rep->rr_len; } - /* Fix up the rpc results for upper layer */ - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); + + r_xprt->rx_stats.fixup_copy_count += + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, + rdmalen); break; case rdma_nomsg: @@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || headerp->rm_body.rm_chunks[2] != xdr_one || - req->rl_nchunks == 0) + list_empty(&req->rl_registered)) goto badheader; iptr = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); - rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); + rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); if (rdmalen < 0) goto badheader; r_xprt->rx_stats.total_rdma_reply += rdmalen; @@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) badheader: default: - dprintk("%s: invalid rpcrdma reply header (type %d):" - " chunks[012] == %d %d %d" - " expected chunks <= %d\n", - __func__, be32_to_cpu(headerp->rm_type), - headerp->rm_body.rm_chunks[0], - headerp->rm_body.rm_chunks[1], - headerp->rm_body.rm_chunks[2], - req->rl_nchunks); + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, + be32_to_cpu(headerp->rm_type)); status = -EIO; r_xprt->rx_stats.bad_reply_count++; break; @@ -1035,7 +1049,7 @@ out: * control: waking the next RPC waits until this RPC has * relinquished all its Send Queue entries. */ - if (req->rl_nchunks) + if (!list_empty(&req->rl_registered)) r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); spin_lock_bh(&xprt->transport_lock); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 99d2e5b72726..81f0e879f019 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -558,7 +558,6 @@ out_sendbuf: out_fail: rpcrdma_buffer_put(req); - r_xprt->rx_stats.failed_marshal_count++; return NULL; } @@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer) rpcrdma_buffer_put(req); } -/* +/** + * xprt_rdma_send_request - marshal and send an RPC request + * @task: RPC task with an RPC message in rq_snd_buf + * + * Return values: + * 0: The request has been sent + * ENOTCONN: Caller needs to invoke connect logic then call again + * ENOBUFS: Call again later to send the request + * EIO: A permanent error occurred. The request was not sent, + * and don't try it again + * * send_request invokes the meat of RPC RDMA. 
It must do the following: + * * 1. Marshal the RPC request into an RPC RDMA request, which means * putting a header in front of data, and creating IOVs for RDMA * from those in the request. @@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer) * the request (rpcrdma_ep_post). * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). */ - static int xprt_rdma_send_request(struct rpc_task *task) { @@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; + /* On retransmit, remove any previously registered chunks */ + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; @@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task) return 0; failed_marshal: - r_xprt->rx_stats.failed_marshal_count++; dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", __func__, rc); if (rc == -EIO) - return -EIO; + r_xprt->rx_stats.failed_marshal_count++; + if (rc != -ENOTCONN) + return rc; drop_connection: xprt_disconnect_done(xprt); return -ENOTCONN; /* implies disconnect */ @@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) xprt->stat.bad_xids, xprt->stat.req_u, xprt->stat.bklog_u); - seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ", r_xprt->rx_stats.read_chunk_count, r_xprt->rx_stats.write_chunk_count, r_xprt->rx_stats.reply_chunk_count, @@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.failed_marshal_count, r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); + seq_printf(seq, "%lu %lu %lu\n", + r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_orphaned, + r_xprt->rx_stats.mrs_allocated); } static int @@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void) __func__, rc); rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); rc = xprt_unregister_transport(&xprt_rdma_bc); if (rc) @@ -753,20 +769,13 @@ int xprt_rdma_init(void) { int rc; - rc = frwr_alloc_recovery_wq(); - if (rc) - return rc; - rc = rpcrdma_alloc_wq(); - if (rc) { - frwr_destroy_recovery_wq(); + if (rc) return rc; - } rc = xprt_register_transport(&xprt_rdma); if (rc) { rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); return rc; } @@ -774,7 +783,6 @@ int xprt_rdma_init(void) if (rc) { xprt_unregister_transport(&xprt_rdma); rpcrdma_destroy_wq(); - frwr_destroy_recovery_wq(); return rc; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index b044d98a1370..536d0be3f61b 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; - ia->ri_dma_mr = NULL; - ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); @@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_pd = ib_alloc_pd(ia->ri_device); if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); - dprintk("RPC: %s: ib_alloc_pd() failed %i\n", - __func__, rc); + pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); goto out2; } - if (memreg == RPCRDMA_FRMR) { - if (!(ia->ri_device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS) || - (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { - dprintk("RPC: %s: FRMR registration " - "not supported by HCA\n", 
__func__); - memreg = RPCRDMA_MTHCAFMR; - } - } - if (memreg == RPCRDMA_MTHCAFMR) { - if (!ia->ri_device->alloc_fmr) { - dprintk("RPC: %s: MTHCAFMR registration " - "not supported by HCA\n", __func__); - rc = -EINVAL; - goto out3; - } - } - switch (memreg) { case RPCRDMA_FRMR: - ia->ri_ops = &rpcrdma_frwr_memreg_ops; - break; - case RPCRDMA_ALLPHYSICAL: - ia->ri_ops = &rpcrdma_physical_memreg_ops; - break; + if (frwr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_frwr_memreg_ops; + break; + } + /*FALLTHROUGH*/ case RPCRDMA_MTHCAFMR: - ia->ri_ops = &rpcrdma_fmr_memreg_ops; - break; + if (fmr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_fmr_memreg_ops; + break; + } + /*FALLTHROUGH*/ default: - printk(KERN_ERR "RPC: Unsupported memory " - "registration mode: %d\n", memreg); - rc = -ENOMEM; + pr_err("rpcrdma: Unsupported memory registration mode: %d\n", + memreg); + rc = -EINVAL; goto out3; } - dprintk("RPC: %s: memory registration strategy is '%s'\n", - __func__, ia->ri_ops->ro_displayname); return 0; @@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, out2: ib_free_cq(sendcq); out1: - if (ia->ri_dma_mr) - ib_dereg_mr(ia->ri_dma_mr); return rc; } @@ -600,8 +578,6 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - int rc; - dprintk("RPC: %s: entering, connected is %d\n", __func__, ep->rep_connected); @@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_free_cq(ep->rep_attr.recv_cq); ib_free_cq(ep->rep_attr.send_cq); - - if (ia->ri_dma_mr) { - rc = ib_dereg_mr(ia->ri_dma_mr); - dprintk("RPC: %s: ib_dereg_mr returned %i\n", - __func__, rc); - } } /* @@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_drain_qp(ia->ri_id->qp); } +static void +rpcrdma_mr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_recovery_worker.work); + struct rpcrdma_mw *mw; + + spin_lock(&buf->rb_recovery_lock); + while (!list_empty(&buf->rb_stale_mrs)) { + mw = list_first_entry(&buf->rb_stale_mrs, + struct rpcrdma_mw, mw_list); + list_del_init(&mw->mw_list); + spin_unlock(&buf->rb_recovery_lock); + + dprintk("RPC: %s: recovering MR %p\n", __func__, mw); + mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); + + spin_lock(&buf->rb_recovery_lock); + } + spin_unlock(&buf->rb_recovery_lock); +} + +void +rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) +{ + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + spin_lock(&buf->rb_recovery_lock); + list_add(&mw->mw_list, &buf->rb_stale_mrs); + spin_unlock(&buf->rb_recovery_lock); + + schedule_delayed_work(&buf->rb_recovery_worker, 0); +} + +static void +rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int count; + LIST_HEAD(free); + LIST_HEAD(all); + + for (count = 0; count < 32; count++) { + struct rpcrdma_mw *mw; + int rc; + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + break; + + rc = ia->ri_ops->ro_init_mr(ia, mw); + if (rc) { + kfree(mw); + break; + } + + mw->mw_xprt = r_xprt; + + list_add(&mw->mw_list, &free); + list_add(&mw->mw_all, &all); + } + + spin_lock(&buf->rb_mwlock); + list_splice(&free, &buf->rb_mws); + list_splice(&all, &buf->rb_all); + r_xprt->rx_stats.mrs_allocated += count; + spin_unlock(&buf->rb_mwlock); + + dprintk("RPC: %s: created %u MRs\n", __func__, count); +} + +static void 
+rpcrdma_mr_refresh_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_refresh_worker.work); + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + + rpcrdma_create_mrs(r_xprt); +} + struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { @@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) spin_unlock(&buffer->rb_reqslock); req->rl_cqe.done = rpcrdma_wc_send; req->rl_buffer = &r_xprt->rx_buf; + INIT_LIST_HEAD(&req->rl_registered); return req; } @@ -832,17 +887,23 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; int i, rc; buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; - spin_lock_init(&buf->rb_lock); atomic_set(&buf->rb_credits, 1); + spin_lock_init(&buf->rb_mwlock); + spin_lock_init(&buf->rb_lock); + spin_lock_init(&buf->rb_recovery_lock); + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_stale_mrs); + INIT_DELAYED_WORK(&buf->rb_refresh_worker, + rpcrdma_mr_refresh_worker); + INIT_DELAYED_WORK(&buf->rb_recovery_worker, + rpcrdma_mr_recovery_worker); - rc = ia->ri_ops->ro_init(r_xprt); - if (rc) - goto out; + rpcrdma_create_mrs(r_xprt); INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); @@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i < buf->rb_max_requests + 2; i++) { + for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); @@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } +static void +rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mw *mw; + unsigned int count; + + count = 0; + spin_lock(&buf->rb_mwlock); + while (!list_empty(&buf->rb_all)) { + mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&mw->mw_all); + + spin_unlock(&buf->rb_mwlock); + ia->ri_ops->ro_release_mr(mw); + count++; + spin_lock(&buf->rb_mwlock); + } + spin_unlock(&buf->rb_mwlock); + r_xprt->rx_stats.mrs_allocated = 0; + + dprintk("RPC: %s: released %u MRs\n", __func__, count); +} + void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_ia *ia = rdmab_to_ia(buf); + cancel_delayed_work_sync(&buf->rb_recovery_worker); + while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; @@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } spin_unlock(&buf->rb_reqslock); - ia->ri_ops->ro_destroy(buf); + rpcrdma_destroy_mrs(buf); } struct rpcrdma_mw * @@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) spin_unlock(&buf->rb_mwlock); if (!mw) - pr_err("RPC: %s: no MWs available\n", __func__); + goto out_nomws; return mw; + +out_nomws: + dprintk("RPC: %s: no MWs available\n", __func__); + schedule_delayed_work(&buf->rb_refresh_worker, 0); + + /* Allow the reply handler and refresh worker to run */ + cond_resched(); + + return NULL; } void @@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) /* * Get a set of request/reply buffers. - * - * Reply buffer (if available) is attached to send buffer upon return. 
*/ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) @@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) out_reqbuf: spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of request buffers\n", __func__); + pr_warn("rpcrdma: out of request buffers (%p)\n", buffers); return NULL; out_repbuf: + list_add(&req->rl_free, &buffers->rb_send_bufs); spin_unlock(&buffers->rb_lock); - pr_warn("RPC: %s: out of reply buffers\n", __func__); - req->rl_reply = NULL; - return req; + pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers); + return NULL; } /* @@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ -void -rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) -{ - dprintk("RPC: map_one: offset %p iova %llx len %zu\n", - seg->mr_offset, - (unsigned long long)seg->mr_dma, seg->mr_dmalen); -} - /** * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers * @ia: controlling rpcrdma_ia @@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, if (rep) { rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) - goto out; + return rc; req->rl_reply = NULL; } @@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); if (rc) - dprintk("RPC: %s: ib_post_send returned %i\n", __func__, - rc); -out: - return rc; + goto out_postsend_err; + return 0; + +out_postsend_err: + pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); + return -ENOTCONN; } /* @@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, DMA_BIDIRECTIONAL); rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); - if (rc) - dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, - rc); - return rc; + goto out_postrecv; + return 0; + +out_postrecv: + pr_err("rpcrdma: ib_post_recv returned %i\n", rc); + return -ENOTCONN; } /** diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 95cdc66225ee..670fad57153a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -68,7 +68,6 @@ struct rpcrdma_ia { struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; - struct ib_mr *ri_dma_mr; struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; @@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) * o recv buffer (posted to provider) * o ib_sge (also donated to provider) * o status of reply (length, success or not) - * o bookkeeping state to get run by tasklet (list, etc) + * o bookkeeping state to get run by reply handler (list, etc) * - * These are allocated during initialization, per-transport instance; - * however, the tasklet execution list itself is global, as it should - * always be pretty short. + * These are allocated during initialization, per-transport instance. * * N of these are associated with a transport instance, and stored in * struct rpcrdma_buffer. N is the max number of outstanding requests. 
*/ -#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) - -/* data segments + head/tail for Call + head/tail for Reply */ -#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4) - -struct rpcrdma_buffer; - struct rpcrdma_rep { struct ib_cqe rr_cqe; unsigned int rr_len; @@ -221,9 +211,6 @@ enum rpcrdma_frmr_state { }; struct rpcrdma_frmr { - struct scatterlist *fr_sg; - int fr_nents; - enum dma_data_direction fr_dir; struct ib_mr *fr_mr; struct ib_cqe fr_cqe; enum rpcrdma_frmr_state fr_state; @@ -235,18 +222,23 @@ struct rpcrdma_frmr { }; struct rpcrdma_fmr { - struct ib_fmr *fmr; - u64 *physaddrs; + struct ib_fmr *fm_mr; + u64 *fm_physaddrs; }; struct rpcrdma_mw { + struct list_head mw_list; + struct scatterlist *mw_sg; + int mw_nents; + enum dma_data_direction mw_dir; union { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; }; - struct work_struct mw_work; struct rpcrdma_xprt *mw_xprt; - struct list_head mw_list; + u32 mw_handle; + u32 mw_length; + u64 mw_offset; struct list_head mw_all; }; @@ -266,33 +258,30 @@ struct rpcrdma_mw { * of iovs for send operations. The reason is that the iovs passed to * ib_post_{send,recv} must not be modified until the work request * completes. - * - * NOTES: - * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we - * marshal. The number needed varies depending on the iov lists that - * are passed to us, the memory registration mode we are in, and if - * physical addressing is used, the layout. */ +/* Maximum number of page-sized "segments" per chunk list to be + * registered or invalidated. Must handle a Reply chunk: + */ +enum { + RPCRDMA_MAX_IOV_SEGS = 3, + RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, + RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + + RPCRDMA_MAX_IOV_SEGS, +}; + struct rpcrdma_mr_seg { /* chunk descriptors */ - struct rpcrdma_mw *rl_mw; /* registered MR */ - u64 mr_base; /* registration result */ - u32 mr_rkey; /* registration result */ u32 mr_len; /* length of chunk or segment */ - int mr_nsegs; /* number of segments in chunk or 0 */ - enum dma_data_direction mr_dir; /* segment mapping direction */ - dma_addr_t mr_dma; /* segment mapping address */ - size_t mr_dmalen; /* segment mapping length */ struct page *mr_page; /* owning page, if any */ char *mr_offset; /* kva if no page, else offset */ }; #define RPCRDMA_MAX_IOVS (2) +struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_free; unsigned int rl_niovs; - unsigned int rl_nchunks; unsigned int rl_connect_cookie; struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; @@ -300,12 +289,13 @@ struct rpcrdma_req { struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; struct rpcrdma_regbuf *rl_rdmabuf; struct rpcrdma_regbuf *rl_sendbuf; - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; - struct rpcrdma_mr_seg *rl_nextseg; struct ib_cqe rl_cqe; struct list_head rl_all; bool rl_backchannel; + + struct list_head rl_registered; /* registered segments */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; }; static inline struct rpcrdma_req * @@ -341,6 +331,11 @@ struct rpcrdma_buffer { struct list_head rb_allreqs; u32 rb_bc_max_requests; + + spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ + struct list_head rb_stale_mrs; + struct delayed_work rb_recovery_worker; + struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -387,6 +382,9 @@ struct rpcrdma_stats { unsigned long bad_reply_count; unsigned long nomsg_call_count; unsigned long bcall_count; + unsigned long 
mrs_recovered; + unsigned long mrs_orphaned; + unsigned long mrs_allocated; }; /* @@ -395,23 +393,25 @@ struct rpcrdma_stats { struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, - struct rpcrdma_mr_seg *, int, bool); + struct rpcrdma_mr_seg *, int, bool, + struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct rpcrdma_req *); void (*ro_unmap_safe)(struct rpcrdma_xprt *, struct rpcrdma_req *, bool); + void (*ro_recover_mr)(struct rpcrdma_mw *); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); size_t (*ro_maxpages)(struct rpcrdma_xprt *); - int (*ro_init)(struct rpcrdma_xprt *); - void (*ro_destroy)(struct rpcrdma_buffer *); + int (*ro_init_mr)(struct rpcrdma_ia *, + struct rpcrdma_mw *); + void (*ro_release_mr)(struct rpcrdma_mw *); const char *ro_displayname; }; extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; -extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; /* * RPCRDMA transport -- encapsulates the structures above for @@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize; */ int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); void rpcrdma_ia_close(struct rpcrdma_ia *); +bool frwr_is_supported(struct rpcrdma_ia *); +bool fmr_is_supported(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c @@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); +void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); + struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, @@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); -int frwr_alloc_recovery_wq(void); -void frwr_destroy_recovery_wq(void); - int rpcrdma_alloc_wq(void); void rpcrdma_destroy_wq(void); @@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void); * Wrappers for chunk registration, shared by read/write chunk code. */ -void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); - static inline enum dma_data_direction rpcrdma_data_dir(bool writing) { return writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; } -static inline void -rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, - enum dma_data_direction direction) -{ - seg->mr_dir = direction; - seg->mr_dmalen = seg->mr_len; - - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - - if (ib_dma_mapping_error(device, seg->mr_dma)) - rpcrdma_mapping_error(seg); -} - -static inline void -rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7e2b2fa189c3..111767ab124a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, - .extra2 = &xprt_max_resvport_limit + .extra2 = &xprt_max_resvport }, { .procname = "max_resvport", @@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xprt_min_resvport_limit, + .extra1 = &xprt_min_resvport, .extra2 = &xprt_max_resvport_limit }, { @@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; bool zerocopy = true; + bool vm_wait = false; int status; int sent; @@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task) return 0; } + WARN_ON_ONCE(sent == 0 && status == 0); + + if (status == -EAGAIN ) { + /* + * Return EAGAIN if we're sure we're hitting the + * socket send buffer limits. + */ + if (test_bit(SOCK_NOSPACE, &transport->sock->flags)) + break; + /* + * Did we hit a memory allocation failure? 
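+ * (sent == 0 with SOCK_NOSPACE clear): assume the allocator stalled; retry once (vm_wait) before failing with ENOBUFS.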
+ */ + if (sent == 0) { + status = -ENOBUFS; + if (vm_wait) + break; + /* Retry, knowing now that we're below the + * socket send buffer limit + */ + vm_wait = true; + } + continue; + } if (status < 0) break; - if (sent == 0) { - status = -EAGAIN; - break; - } + vm_wait = false; } - if (status == -EAGAIN && sk_stream_is_writeable(transport->inet)) - status = -ENOBUFS; switch (status) { case -ENOTSOCK: @@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_error_report = transport->old_error_report; } +static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); +} + static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) { smp_mb__before_atomic(); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); + xs_sock_reset_state_flags(xprt); smp_mb__after_atomic(); } @@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport) goto out; for (;;) { skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb == NULL) + if (skb != NULL) { + xs_local_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + continue; + } + if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; - xs_local_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); } out: mutex_unlock(&transport->recv_mutex); @@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport) goto out; for (;;) { skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb == NULL) + if (skb != NULL) { + xs_udp_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + continue; + } + if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) break; - xs_udp_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); } out: mutex_unlock(&transport->recv_mutex); @@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk) if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - queue_work(rpciod_workqueue, &transport->recv_worker); + transport->old_data_ready(sk); + /* Any data means we had a useful conversation, so + * then we don't need to delay the next reconnect + */ + if (xprt->reestablish_timeout) + xprt->reestablish_timeout = 0; + if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + queue_work(xprtiod_workqueue, &transport->recv_worker); } read_unlock_bh(&sk->sk_callback_lock); } @@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) for (;;) { lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - release_sock(sk); - if (read <= 0) - break; - total += read; + if (read <= 0) { + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + release_sock(sk); + if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + break; + } else { + release_sock(sk); + total += read; + } rd_desc.count = 65536; } out: @@ -1493,34 +1538,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work) } /** - * xs_tcp_data_ready - "data ready" callback for TCP sockets - * @sk: socket with data to read - * - */ -static void xs_tcp_data_ready(struct sock *sk) -{ - struct sock_xprt *transport; - struct rpc_xprt *xprt; - - dprintk("RPC: xs_tcp_data_ready...\n"); - - read_lock_bh(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) - goto out; - transport = container_of(xprt, struct sock_xprt, 
xprt); - - /* Any data means we had a useful conversation, so - * the we don't need to delay the next reconnect - */ - if (xprt->reestablish_timeout) - xprt->reestablish_timeout = 0; - queue_work(rpciod_workqueue, &transport->recv_worker); - -out: - read_unlock_bh(&sk->sk_callback_lock); -} - -/** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed * @@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) static unsigned short xs_get_random_port(void) { - unsigned short range = xprt_max_resvport - xprt_min_resvport; + unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; unsigned short rand = (unsigned short) prandom_u32() % range; return rand + xprt_min_resvport; } @@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; - sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_data_ready = xs_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; sock_set_flag(sk, SOCK_FASYNC); @@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) /* Start by resetting any existing state */ xs_reset_transport(transport); - queue_delayed_work(rpciod_workqueue, + queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, xprt->reestablish_timeout); xprt->reestablish_timeout <<= 1; @@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - queue_delayed_work(rpciod_workqueue, + queue_delayed_work(xprtiod_workqueue, &transport->connect_worker, 0); } } @@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val, static int param_set_portnr(const char *val, const struct kernel_param *kp) { - return param_set_uint_minmax(val, kp, + if (kp->arg == &xprt_min_resvport) + return param_set_uint_minmax(val, kp, RPC_MIN_RESVPORT, + xprt_max_resvport); + return param_set_uint_minmax(val, kp, + xprt_min_resvport, RPC_MAX_RESVPORT); } diff --git a/net/sysctl_net.c b/net/sysctl_net.c index ed98c1fc3de1..46a71c701e7c 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -46,7 +46,7 @@ static int net_ctl_permissions(struct ctl_table_header *head, kgid_t root_gid = make_kgid(net->user_ns, 0); /* Allow network administrator to have same access as root. */ - if (ns_capable(net->user_ns, CAP_NET_ADMIN) || + if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN) || uid_eq(root_uid, current_euid())) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index be70a57c1ff9..b62caa1c770c 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -794,10 +794,10 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, return 0; attr_msg_full: + read_unlock_bh(&mon->lock); nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); - read_unlock_bh(&mon->lock); return -EMSGSIZE; } diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 14810abedc2e..8831e7c42167 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -26,3 +26,23 @@ config VMWARE_VMCI_VSOCKETS To compile this driver as a module, choose M here: the module will be called vmw_vsock_vmci_transport. If unsure, say N. 
+ +config VIRTIO_VSOCKETS + tristate "virtio transport for Virtual Sockets" + depends on VSOCKETS && VIRTIO + select VIRTIO_VSOCKETS_COMMON + help + This module implements a virtio transport for Virtual Sockets. + + Enable this transport if your Virtual Machine host supports Virtual + Sockets over virtio. + + To compile this driver as a module, choose M here: the module will be + called vmw_vsock_virtio_transport. If unsure, say N. + +config VIRTIO_VSOCKETS_COMMON + tristate + help + This option is selected by any driver which needs to access + the virtio_vsock. The module will be called + vmw_vsock_virtio_transport_common. diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index 2ce52d70f224..bc27c70e0e59 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,7 +1,13 @@ obj-$(CONFIG_VSOCKETS) += vsock.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o +obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o +obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o vsock-y += af_vsock.o vsock_addr.o vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o + +vmw_vsock_virtio_transport-y += virtio_transport.o + +vmw_vsock_virtio_transport_common-y += virtio_transport_common.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index b96ac918e0ba..17dbbe64cd73 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -344,6 +344,16 @@ static bool vsock_in_connected_table(struct vsock_sock *vsk) return ret; } +void vsock_remove_sock(struct vsock_sock *vsk) +{ + if (vsock_in_bound_table(vsk)) + vsock_remove_bound(vsk); + + if (vsock_in_connected_table(vsk)) + vsock_remove_connected(vsk); +} +EXPORT_SYMBOL_GPL(vsock_remove_sock); + void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) { int i; @@ -660,12 +670,6 @@ static void __vsock_release(struct sock *sk) vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ - if (vsock_in_bound_table(vsk)) - vsock_remove_bound(vsk); - - if (vsock_in_connected_table(vsk)) - vsock_remove_connected(vsk); - transport->release(vsk); lock_sock(sk); @@ -1995,6 +1999,15 @@ void vsock_core_exit(void) } EXPORT_SYMBOL_GPL(vsock_core_exit); +const struct vsock_transport *vsock_core_get_transport(void) +{ + /* vsock_register_mutex not taken since only the transport uses this + * function and only while registered. + */ + return transport; +} +EXPORT_SYMBOL_GPL(vsock_core_get_transport); + MODULE_AUTHOR("VMware, Inc."); MODULE_DESCRIPTION("VMware Virtual Socket Family"); MODULE_VERSION("1.0.1.0-k"); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c new file mode 100644 index 000000000000..699dfabdbccd --- /dev/null +++ b/net/vmw_vsock/virtio_transport.c @@ -0,0 +1,624 @@ +/* + * virtio transport for vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He <asias@redhat.com> + * Stefan Hajnoczi <stefanha@redhat.com> + * + * Some of the code is take from Gerd Hoffmann <kraxel@redhat.com>'s + * early virtio-vsock proof-of-concept bits. + * + * This work is licensed under the terms of the GNU GPL, version 2. 
+ */ +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/atomic.h> +#include <linux/virtio.h> +#include <linux/virtio_ids.h> +#include <linux/virtio_config.h> +#include <linux/virtio_vsock.h> +#include <net/sock.h> +#include <linux/mutex.h> +#include <net/af_vsock.h> + +static struct workqueue_struct *virtio_vsock_workqueue; +static struct virtio_vsock *the_virtio_vsock; +static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */ + +struct virtio_vsock { + struct virtio_device *vdev; + struct virtqueue *vqs[VSOCK_VQ_MAX]; + + /* Virtqueue processing is deferred to a workqueue */ + struct work_struct tx_work; + struct work_struct rx_work; + struct work_struct event_work; + + /* The following fields are protected by tx_lock. vqs[VSOCK_VQ_TX] + * must be accessed with tx_lock held. + */ + struct mutex tx_lock; + + struct work_struct send_pkt_work; + spinlock_t send_pkt_list_lock; + struct list_head send_pkt_list; + + atomic_t queued_replies; + + /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX] + * must be accessed with rx_lock held. + */ + struct mutex rx_lock; + int rx_buf_nr; + int rx_buf_max_nr; + + /* The following fields are protected by event_lock. + * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. + */ + struct mutex event_lock; + struct virtio_vsock_event event_list[8]; + + u32 guest_cid; +}; + +static struct virtio_vsock *virtio_vsock_get(void) +{ + return the_virtio_vsock; +} + +static u32 virtio_transport_get_local_cid(void) +{ + struct virtio_vsock *vsock = virtio_vsock_get(); + + return vsock->guest_cid; +} + +static void +virtio_transport_send_pkt_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, send_pkt_work); + struct virtqueue *vq; + bool added = false; + bool restart_rx = false; + + mutex_lock(&vsock->tx_lock); + + vq = vsock->vqs[VSOCK_VQ_TX]; + + /* Avoid unnecessary interrupts while we're processing the ring */ + virtqueue_disable_cb(vq); + + for (;;) { + struct virtio_vsock_pkt *pkt; + struct scatterlist hdr, buf, *sgs[2]; + int ret, in_sg = 0, out_sg = 0; + bool reply; + + spin_lock_bh(&vsock->send_pkt_list_lock); + if (list_empty(&vsock->send_pkt_list)) { + spin_unlock_bh(&vsock->send_pkt_list_lock); + virtqueue_enable_cb(vq); + break; + } + + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del_init(&pkt->list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + reply = pkt->reply; + + sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); + sgs[out_sg++] = &hdr; + if (pkt->buf) { + sg_init_one(&buf, pkt->buf, pkt->len); + sgs[out_sg++] = &buf; + } + + ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL); + if (ret < 0) { + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + if (!virtqueue_enable_cb(vq) && ret == -ENOSPC) + continue; /* retry now that we have more space */ + break; + } + + if (reply) { + struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; + int val; + + val = atomic_dec_return(&vsock->queued_replies); + + /* Do we now have resources to resume rx processing? 
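+ * (rx is throttled once queued_replies reaches the rx ring size; this decrement may have just brought it back under that limit)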
*/ + if (val + 1 == virtqueue_get_vring_size(rx_vq)) + restart_rx = true; + } + + added = true; + } + + if (added) + virtqueue_kick(vq); + + mutex_unlock(&vsock->tx_lock); + + if (restart_rx) + queue_work(virtio_vsock_workqueue, &vsock->rx_work); +} + +static int +virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock *vsock; + int len = pkt->len; + + vsock = virtio_vsock_get(); + if (!vsock) { + virtio_transport_free_pkt(pkt); + return -ENODEV; + } + + if (pkt->reply) + atomic_inc(&vsock->queued_replies); + + spin_lock_bh(&vsock->send_pkt_list_lock); + list_add_tail(&pkt->list, &vsock->send_pkt_list); + spin_unlock_bh(&vsock->send_pkt_list_lock); + + queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); + return len; +} + +static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) +{ + int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; + struct virtio_vsock_pkt *pkt; + struct scatterlist hdr, buf, *sgs[2]; + struct virtqueue *vq; + int ret; + + vq = vsock->vqs[VSOCK_VQ_RX]; + + do { + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + break; + + pkt->buf = kmalloc(buf_len, GFP_KERNEL); + if (!pkt->buf) { + virtio_transport_free_pkt(pkt); + break; + } + + pkt->len = buf_len; + + sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); + sgs[0] = &hdr; + + sg_init_one(&buf, pkt->buf, buf_len); + sgs[1] = &buf; + ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); + if (ret) { + virtio_transport_free_pkt(pkt); + break; + } + vsock->rx_buf_nr++; + } while (vq->num_free); + if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) + vsock->rx_buf_max_nr = vsock->rx_buf_nr; + virtqueue_kick(vq); +} + +static void virtio_transport_tx_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, tx_work); + struct virtqueue *vq; + bool added = false; + + vq = vsock->vqs[VSOCK_VQ_TX]; + mutex_lock(&vsock->tx_lock); + do { + struct virtio_vsock_pkt *pkt; + unsigned int len; + + virtqueue_disable_cb(vq); + while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { + virtio_transport_free_pkt(pkt); + added = true; + } + } while (!virtqueue_enable_cb(vq)); + mutex_unlock(&vsock->tx_lock); + + if (added) + queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); +} + +/* Is there space left for replies to rx packets? */ +static bool virtio_transport_more_replies(struct virtio_vsock *vsock) +{ + struct virtqueue *vq = vsock->vqs[VSOCK_VQ_RX]; + int val; + + smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */ + val = atomic_read(&vsock->queued_replies); + + return val < virtqueue_get_vring_size(vq); +} + +static void virtio_transport_rx_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, rx_work); + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_RX]; + + mutex_lock(&vsock->rx_lock); + + do { + virtqueue_disable_cb(vq); + for (;;) { + struct virtio_vsock_pkt *pkt; + unsigned int len; + + if (!virtio_transport_more_replies(vsock)) { + /* Stop rx until the device processes already + * pending replies. Leave rx virtqueue + * callbacks disabled. 
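+ * The tx path restarts rx_work from virtio_transport_send_pkt_work() once queued_replies drops back below the ring size.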
+ */ + goto out; + } + + pkt = virtqueue_get_buf(vq, &len); + if (!pkt) { + break; + } + + vsock->rx_buf_nr--; + + /* Drop short/long packets */ + if (unlikely(len < sizeof(pkt->hdr) || + len > sizeof(pkt->hdr) + pkt->len)) { + virtio_transport_free_pkt(pkt); + continue; + } + + pkt->len = len - sizeof(pkt->hdr); + virtio_transport_recv_pkt(pkt); + } + } while (!virtqueue_enable_cb(vq)); + +out: + if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2) + virtio_vsock_rx_fill(vsock); + mutex_unlock(&vsock->rx_lock); +} + +/* event_lock must be held */ +static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, + struct virtio_vsock_event *event) +{ + struct scatterlist sg; + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_EVENT]; + + sg_init_one(&sg, event, sizeof(*event)); + + return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL); +} + +/* event_lock must be held */ +static void virtio_vsock_event_fill(struct virtio_vsock *vsock) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(vsock->event_list); i++) { + struct virtio_vsock_event *event = &vsock->event_list[i]; + + virtio_vsock_event_fill_one(vsock, event); + } + + virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]); +} + +static void virtio_vsock_reset_sock(struct sock *sk) +{ + lock_sock(sk); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = ECONNRESET; + sk->sk_error_report(sk); + release_sock(sk); +} + +static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock) +{ + struct virtio_device *vdev = vsock->vdev; + u64 guest_cid; + + vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid), + &guest_cid, sizeof(guest_cid)); + vsock->guest_cid = le64_to_cpu(guest_cid); +} + +/* event_lock must be held */ +static void virtio_vsock_event_handle(struct virtio_vsock *vsock, + struct virtio_vsock_event *event) +{ + switch (le32_to_cpu(event->id)) { + case VIRTIO_VSOCK_EVENT_TRANSPORT_RESET: + virtio_vsock_update_guest_cid(vsock); + vsock_for_each_connected_socket(virtio_vsock_reset_sock); + break; + } +} + +static void virtio_transport_event_work(struct work_struct *work) +{ + struct virtio_vsock *vsock = + container_of(work, struct virtio_vsock, event_work); + struct virtqueue *vq; + + vq = vsock->vqs[VSOCK_VQ_EVENT]; + + mutex_lock(&vsock->event_lock); + + do { + struct virtio_vsock_event *event; + unsigned int len; + + virtqueue_disable_cb(vq); + while ((event = virtqueue_get_buf(vq, &len)) != NULL) { + if (len == sizeof(*event)) + virtio_vsock_event_handle(vsock, event); + + virtio_vsock_event_fill_one(vsock, event); + } + } while (!virtqueue_enable_cb(vq)); + + virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]); + + mutex_unlock(&vsock->event_lock); +} + +static void virtio_vsock_event_done(struct virtqueue *vq) +{ + struct virtio_vsock *vsock = vq->vdev->priv; + + if (!vsock) + return; + queue_work(virtio_vsock_workqueue, &vsock->event_work); +} + +static void virtio_vsock_tx_done(struct virtqueue *vq) +{ + struct virtio_vsock *vsock = vq->vdev->priv; + + if (!vsock) + return; + queue_work(virtio_vsock_workqueue, &vsock->tx_work); +} + +static void virtio_vsock_rx_done(struct virtqueue *vq) +{ + struct virtio_vsock *vsock = vq->vdev->priv; + + if (!vsock) + return; + queue_work(virtio_vsock_workqueue, &vsock->rx_work); +} + +static struct virtio_transport virtio_transport = { + .transport = { + .get_local_cid = virtio_transport_get_local_cid, + + .init = virtio_transport_do_socket_init, + .destruct = virtio_transport_destruct, + .release = virtio_transport_release, + .connect = virtio_transport_connect, + .shutdown = 
virtio_transport_shutdown, + + .dgram_bind = virtio_transport_dgram_bind, + .dgram_dequeue = virtio_transport_dgram_dequeue, + .dgram_enqueue = virtio_transport_dgram_enqueue, + .dgram_allow = virtio_transport_dgram_allow, + + .stream_dequeue = virtio_transport_stream_dequeue, + .stream_enqueue = virtio_transport_stream_enqueue, + .stream_has_data = virtio_transport_stream_has_data, + .stream_has_space = virtio_transport_stream_has_space, + .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, + .stream_is_active = virtio_transport_stream_is_active, + .stream_allow = virtio_transport_stream_allow, + + .notify_poll_in = virtio_transport_notify_poll_in, + .notify_poll_out = virtio_transport_notify_poll_out, + .notify_recv_init = virtio_transport_notify_recv_init, + .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, + .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, + .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, + .notify_send_init = virtio_transport_notify_send_init, + .notify_send_pre_block = virtio_transport_notify_send_pre_block, + .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, + .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, + + .set_buffer_size = virtio_transport_set_buffer_size, + .set_min_buffer_size = virtio_transport_set_min_buffer_size, + .set_max_buffer_size = virtio_transport_set_max_buffer_size, + .get_buffer_size = virtio_transport_get_buffer_size, + .get_min_buffer_size = virtio_transport_get_min_buffer_size, + .get_max_buffer_size = virtio_transport_get_max_buffer_size, + }, + + .send_pkt = virtio_transport_send_pkt, +}; + +static int virtio_vsock_probe(struct virtio_device *vdev) +{ + vq_callback_t *callbacks[] = { + virtio_vsock_rx_done, + virtio_vsock_tx_done, + virtio_vsock_event_done, + }; + static const char * const names[] = { + "rx", + "tx", + "event", + }; + struct virtio_vsock *vsock = NULL; + int ret; + + ret = mutex_lock_interruptible(&the_virtio_vsock_mutex); + if (ret) + return ret; + + /* Only one virtio-vsock device per guest is supported */ + if (the_virtio_vsock) { + ret = -EBUSY; + goto out; + } + + vsock = kzalloc(sizeof(*vsock), GFP_KERNEL); + if (!vsock) { + ret = -ENOMEM; + goto out; + } + + vsock->vdev = vdev; + + ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, + vsock->vqs, callbacks, names); + if (ret < 0) + goto out; + + virtio_vsock_update_guest_cid(vsock); + + ret = vsock_core_init(&virtio_transport.transport); + if (ret < 0) + goto out_vqs; + + vsock->rx_buf_nr = 0; + vsock->rx_buf_max_nr = 0; + atomic_set(&vsock->queued_replies, 0); + + vdev->priv = vsock; + the_virtio_vsock = vsock; + mutex_init(&vsock->tx_lock); + mutex_init(&vsock->rx_lock); + mutex_init(&vsock->event_lock); + spin_lock_init(&vsock->send_pkt_list_lock); + INIT_LIST_HEAD(&vsock->send_pkt_list); + INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); + INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); + INIT_WORK(&vsock->event_work, virtio_transport_event_work); + INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work); + + mutex_lock(&vsock->rx_lock); + virtio_vsock_rx_fill(vsock); + mutex_unlock(&vsock->rx_lock); + + mutex_lock(&vsock->event_lock); + virtio_vsock_event_fill(vsock); + mutex_unlock(&vsock->event_lock); + + mutex_unlock(&the_virtio_vsock_mutex); + return 0; + +out_vqs: + vsock->vdev->config->del_vqs(vsock->vdev); +out: + kfree(vsock); + mutex_unlock(&the_virtio_vsock_mutex); + return ret; +} + +static void 
virtio_vsock_remove(struct virtio_device *vdev) +{ + struct virtio_vsock *vsock = vdev->priv; + struct virtio_vsock_pkt *pkt; + + flush_work(&vsock->rx_work); + flush_work(&vsock->tx_work); + flush_work(&vsock->event_work); + flush_work(&vsock->send_pkt_work); + + vdev->config->reset(vdev); + + mutex_lock(&vsock->rx_lock); + while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) + virtio_transport_free_pkt(pkt); + mutex_unlock(&vsock->rx_lock); + + mutex_lock(&vsock->tx_lock); + while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) + virtio_transport_free_pkt(pkt); + mutex_unlock(&vsock->tx_lock); + + spin_lock_bh(&vsock->send_pkt_list_lock); + while (!list_empty(&vsock->send_pkt_list)) { + pkt = list_first_entry(&vsock->send_pkt_list, + struct virtio_vsock_pkt, list); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + spin_unlock_bh(&vsock->send_pkt_list_lock); + + mutex_lock(&the_virtio_vsock_mutex); + the_virtio_vsock = NULL; + vsock_core_exit(); + mutex_unlock(&the_virtio_vsock_mutex); + + vdev->config->del_vqs(vdev); + + kfree(vsock); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { +}; + +static struct virtio_driver virtio_vsock_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtio_vsock_probe, + .remove = virtio_vsock_remove, +}; + +static int __init virtio_vsock_init(void) +{ + int ret; + + virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0); + if (!virtio_vsock_workqueue) + return -ENOMEM; + ret = register_virtio_driver(&virtio_vsock_driver); + if (ret) + destroy_workqueue(virtio_vsock_workqueue); + return ret; +} + +static void __exit virtio_vsock_exit(void) +{ + unregister_virtio_driver(&virtio_vsock_driver); + destroy_workqueue(virtio_vsock_workqueue); +} + +module_init(virtio_vsock_init); +module_exit(virtio_vsock_exit); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("virtio transport for vsock"); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c new file mode 100644 index 000000000000..a53b3a16b4f1 --- /dev/null +++ b/net/vmw_vsock/virtio_transport_common.c @@ -0,0 +1,992 @@ +/* + * common code for virtio vsock + * + * Copyright (C) 2013-2015 Red Hat, Inc. + * Author: Asias He <asias@redhat.com> + * Stefan Hajnoczi <stefanha@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. 
+ */ +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/list.h> +#include <linux/virtio.h> +#include <linux/virtio_ids.h> +#include <linux/virtio_config.h> +#include <linux/virtio_vsock.h> + +#include <net/sock.h> +#include <net/af_vsock.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/vsock_virtio_transport_common.h> + +/* How long to wait for graceful shutdown of a connection */ +#define VSOCK_CLOSE_TIMEOUT (8 * HZ) + +static const struct virtio_transport *virtio_transport_get_ops(void) +{ + const struct vsock_transport *t = vsock_core_get_transport(); + + return container_of(t, struct virtio_transport, transport); +} + +struct virtio_vsock_pkt * +virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) +{ + struct virtio_vsock_pkt *pkt; + int err; + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) + return NULL; + + pkt->hdr.type = cpu_to_le16(info->type); + pkt->hdr.op = cpu_to_le16(info->op); + pkt->hdr.src_cid = cpu_to_le64(src_cid); + pkt->hdr.dst_cid = cpu_to_le64(dst_cid); + pkt->hdr.src_port = cpu_to_le32(src_port); + pkt->hdr.dst_port = cpu_to_le32(dst_port); + pkt->hdr.flags = cpu_to_le32(info->flags); + pkt->len = len; + pkt->hdr.len = cpu_to_le32(len); + pkt->reply = info->reply; + + if (info->msg && len > 0) { + pkt->buf = kmalloc(len, GFP_KERNEL); + if (!pkt->buf) + goto out_pkt; + err = memcpy_from_msg(pkt->buf, info->msg, len); + if (err) + goto out; + } + + trace_virtio_transport_alloc_pkt(src_cid, src_port, + dst_cid, dst_port, + len, + info->type, + info->op, + info->flags); + + return pkt; + +out: + kfree(pkt->buf); +out_pkt: + kfree(pkt); + return NULL; +} +EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt); + +static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, + struct virtio_vsock_pkt_info *info) +{ + u32 src_cid, src_port, dst_cid, dst_port; + struct virtio_vsock_sock *vvs; + struct virtio_vsock_pkt *pkt; + u32 pkt_len = info->pkt_len; + + src_cid = vm_sockets_get_local_cid(); + src_port = vsk->local_addr.svm_port; + if (!info->remote_cid) { + dst_cid = vsk->remote_addr.svm_cid; + dst_port = vsk->remote_addr.svm_port; + } else { + dst_cid = info->remote_cid; + dst_port = info->remote_port; + } + + vvs = vsk->trans; + + /* we can send less than pkt_len bytes */ + if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE) + pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; + + /* virtio_transport_get_credit might return less than pkt_len credit */ + pkt_len = virtio_transport_get_credit(vvs, pkt_len); + + /* Do not send zero length OP_RW pkt */ + if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) + return pkt_len; + + pkt = virtio_transport_alloc_pkt(info, pkt_len, + src_cid, src_port, + dst_cid, dst_port); + if (!pkt) { + virtio_transport_put_credit(vvs, pkt_len); + return -ENOMEM; + } + + virtio_transport_inc_tx_pkt(vvs, pkt); + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes += pkt->len; +} + +static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, + struct virtio_vsock_pkt *pkt) +{ + vvs->rx_bytes -= pkt->len; + vvs->fwd_cnt += pkt->len; +} + +void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) +{ + spin_lock_bh(&vvs->tx_lock); + pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); + pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); + 
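+ /* the two header fields set above advertise our current receive credit; the receiving side picks them up in virtio_transport_space_update() */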
spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); + +u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + u32 ret; + + spin_lock_bh(&vvs->tx_lock); + ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (ret > credit) + ret = credit; + vvs->tx_cnt += ret; + spin_unlock_bh(&vvs->tx_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_credit); + +void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit) +{ + spin_lock_bh(&vvs->tx_lock); + vvs->tx_cnt -= credit; + spin_unlock_bh(&vvs->tx_lock); +} +EXPORT_SYMBOL_GPL(virtio_transport_put_credit); + +static int virtio_transport_send_credit_update(struct vsock_sock *vsk, + int type, + struct virtio_vsock_hdr *hdr) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, + .type = type, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +static ssize_t +virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + while (total < len && !list_empty(&vvs->rx_queue)) { + pkt = list_first_entry(&vvs->rx_queue, + struct virtio_vsock_pkt, list); + + bytes = len - total; + if (bytes > pkt->len - pkt->off) + bytes = pkt->len - pkt->off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + + err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; + pkt->off += bytes; + if (pkt->off == pkt->len) { + virtio_transport_dec_rx_pkt(vvs, pkt); + list_del(&pkt->list); + virtio_transport_free_pkt(pkt); + } + } + spin_unlock_bh(&vvs->rx_lock); + + /* Send a credit pkt to peer */ + virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM, + NULL); + + return total; + +out: + if (total) + err = total; + return err; +} + +ssize_t +virtio_transport_stream_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + if (flags & MSG_PEEK) + return -EOPNOTSUPP; + + return virtio_transport_stream_do_dequeue(vsk, msg, len); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue); + +int +virtio_transport_dgram_dequeue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); + +s64 virtio_transport_stream_has_data(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->rx_lock); + bytes = vvs->rx_bytes; + spin_unlock_bh(&vvs->rx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data); + +static s64 virtio_transport_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt); + if (bytes < 0) + bytes = 0; + + return bytes; +} + +s64 virtio_transport_stream_has_space(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + s64 bytes; + + spin_lock_bh(&vvs->tx_lock); + bytes = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + + return bytes; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space); + +int virtio_transport_do_socket_init(struct vsock_sock *vsk, + struct vsock_sock *psk) +{ + struct virtio_vsock_sock *vvs; + + vvs = 
kzalloc(sizeof(*vvs), GFP_KERNEL); + if (!vvs) + return -ENOMEM; + + vsk->trans = vvs; + vvs->vsk = vsk; + if (psk) { + struct virtio_vsock_sock *ptrans = psk->trans; + + vvs->buf_size = ptrans->buf_size; + vvs->buf_size_min = ptrans->buf_size_min; + vvs->buf_size_max = ptrans->buf_size_max; + vvs->peer_buf_alloc = ptrans->peer_buf_alloc; + } else { + vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE; + vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE; + vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE; + } + + vvs->buf_alloc = vvs->buf_size; + + spin_lock_init(&vvs->rx_lock); + spin_lock_init(&vvs->tx_lock); + INIT_LIST_HEAD(&vvs->rx_queue); + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init); + +u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size); + +u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_min; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size); + +u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size_max; +} +EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size); + +void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size_min) + vvs->buf_size_min = val; + if (val > vvs->buf_size_max) + vvs->buf_size_max = val; + vvs->buf_size = val; + vvs->buf_alloc = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size); + +void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val > vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_min = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size); + +void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + if (val > VIRTIO_VSOCK_MAX_BUF_SIZE) + val = VIRTIO_VSOCK_MAX_BUF_SIZE; + if (val < vvs->buf_size) + vvs->buf_size = val; + vvs->buf_size_max = val; +} +EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size); + +int +virtio_transport_notify_poll_in(struct vsock_sock *vsk, + size_t target, + bool *data_ready_now) +{ + if (vsock_stream_has_data(vsk)) + *data_ready_now = true; + else + *data_ready_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in); + +int +virtio_transport_notify_poll_out(struct vsock_sock *vsk, + size_t target, + bool *space_avail_now) +{ + s64 free_space; + + free_space = vsock_stream_has_space(vsk); + if (free_space > 0) + *space_avail_now = true; + else if (free_space == 0) + *space_avail_now = false; + + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out); + +int virtio_transport_notify_recv_init(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init); + +int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk, + size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block); + +int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk, + 
size_t target, struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue); + +int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk, + size_t target, ssize_t copied, bool data_read, + struct vsock_transport_recv_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue); + +int virtio_transport_notify_send_init(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init); + +int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block); + +int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk, + struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue); + +int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk, + ssize_t written, struct vsock_transport_send_notify_data *data) +{ + return 0; +} +EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue); + +u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + return vvs->buf_size; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat); + +bool virtio_transport_stream_is_active(struct vsock_sock *vsk) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); + +bool virtio_transport_stream_allow(u32 cid, u32 port) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); + +int virtio_transport_dgram_bind(struct vsock_sock *vsk, + struct sockaddr_vm *addr) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); + +bool virtio_transport_dgram_allow(u32 cid, u32 port) +{ + return false; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); + +int virtio_transport_connect(struct vsock_sock *vsk) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_REQUEST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_connect); + +int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_SHUTDOWN, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .flags = (mode & RCV_SHUTDOWN ? + VIRTIO_VSOCK_SHUTDOWN_RCV : 0) | + (mode & SEND_SHUTDOWN ? 
+ VIRTIO_VSOCK_SHUTDOWN_SEND : 0), + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_shutdown); + +int +virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + struct sockaddr_vm *remote_addr, + struct msghdr *msg, + size_t dgram_len) +{ + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); + +ssize_t +virtio_transport_stream_enqueue(struct vsock_sock *vsk, + struct msghdr *msg, + size_t len) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RW, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .msg = msg, + .pkt_len = len, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} +EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue); + +void virtio_transport_destruct(struct vsock_sock *vsk) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + + kfree(vvs); +} +EXPORT_SYMBOL_GPL(virtio_transport_destruct); + +static int virtio_transport_reset(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .reply = !!pkt, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Normally packets are associated with a socket. There may be no socket if an + * attempt was made to connect to a socket that does not exist. + */ +static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, + .type = le16_to_cpu(pkt->hdr.type), + .reply = true, + }; + + /* Send RST only if the original pkt is not a RST pkt */ + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + return 0; + + pkt = virtio_transport_alloc_pkt(&info, 0, + le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port), + le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + if (!pkt) + return -ENOMEM; + + return virtio_transport_get_ops()->send_pkt(pkt); +} + +static void virtio_transport_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, + sock_flag(sk, SOCK_DONE))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk_sleep(sk), &wait); + } +} + +static void virtio_transport_do_close(struct vsock_sock *vsk, + bool cancel_timeout) +{ + struct sock *sk = sk_vsock(vsk); + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + sk->sk_state_change(sk); + + if (vsk->close_work_scheduled && + (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { + vsk->close_work_scheduled = false; + + vsock_remove_sock(vsk); + + /* Release refcnt obtained when we scheduled the timeout */ + sock_put(sk); + } +} + +static void virtio_transport_close_timeout(struct work_struct *work) +{ + struct vsock_sock *vsk = + container_of(work, struct vsock_sock, close_work.work); + struct sock *sk = sk_vsock(vsk); + + sock_hold(sk); + lock_sock(sk); + + if (!sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); + + virtio_transport_do_close(vsk, false); + } + + vsk->close_work_scheduled = false; + + release_sock(sk); + sock_put(sk); +} + +/* User context, vsk->sk is locked */ +static bool virtio_transport_close(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + + if 
(!(sk->sk_state == SS_CONNECTED || + sk->sk_state == SS_DISCONNECTING)) + return true; + + /* Already received SHUTDOWN from peer, reply with RST */ + if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) { + (void)virtio_transport_reset(vsk, NULL); + return true; + } + + if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) + (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); + + if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) + virtio_transport_wait_close(sk, sk->sk_lingertime); + + if (sock_flag(sk, SOCK_DONE)) { + return true; + } + + sock_hold(sk); + INIT_DELAYED_WORK(&vsk->close_work, + virtio_transport_close_timeout); + vsk->close_work_scheduled = true; + schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT); + return false; +} + +void virtio_transport_release(struct vsock_sock *vsk) +{ + struct sock *sk = &vsk->sk; + bool remove_sock = true; + + lock_sock(sk); + if (sk->sk_type == SOCK_STREAM) + remove_sock = virtio_transport_close(vsk); + release_sock(sk); + + if (remove_sock) + vsock_remove_sock(vsk); +} +EXPORT_SYMBOL_GPL(virtio_transport_release); + +static int +virtio_transport_recv_connecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + int err; + int skerr; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RESPONSE: + sk->sk_state = SS_CONNECTED; + sk->sk_socket->state = SS_CONNECTED; + vsock_insert_connected(vsk); + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_INVALID: + break; + case VIRTIO_VSOCK_OP_RST: + skerr = ECONNRESET; + err = 0; + goto destroy; + default: + skerr = EPROTO; + err = -EINVAL; + goto destroy; + } + return 0; + +destroy: + virtio_transport_reset(vsk, pkt); + sk->sk_state = SS_UNCONNECTED; + sk->sk_err = skerr; + sk->sk_error_report(sk); + return err; +} + +static int +virtio_transport_recv_connected(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + int err = 0; + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_RW: + pkt->len = le32_to_cpu(pkt->hdr.len); + pkt->off = 0; + + spin_lock_bh(&vvs->rx_lock); + virtio_transport_inc_rx_pkt(vvs, pkt); + list_add_tail(&pkt->list, &vvs->rx_queue); + spin_unlock_bh(&vvs->rx_lock); + + sk->sk_data_ready(sk); + return err; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + sk->sk_write_space(sk); + break; + case VIRTIO_VSOCK_OP_SHUTDOWN: + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + vsk->peer_shutdown |= RCV_SHUTDOWN; + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (vsk->peer_shutdown == SHUTDOWN_MASK && + vsock_stream_has_data(vsk) <= 0) + sk->sk_state = SS_DISCONNECTING; + if (le32_to_cpu(pkt->hdr.flags)) + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_RST: + virtio_transport_do_close(vsk, true); + break; + default: + err = -EINVAL; + break; + } + + virtio_transport_free_pkt(pkt); + return err; +} + +static void +virtio_transport_recv_disconnecting(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + + if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) + virtio_transport_do_close(vsk, true); +} + +static int +virtio_transport_send_response(struct vsock_sock *vsk, + struct virtio_vsock_pkt *pkt) +{ + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RESPONSE, + .type = VIRTIO_VSOCK_TYPE_STREAM, + .remote_cid = le32_to_cpu(pkt->hdr.src_cid), + .remote_port = 
le32_to_cpu(pkt->hdr.src_port), + .reply = true, + }; + + return virtio_transport_send_pkt_info(vsk, &info); +} + +/* Handle server socket */ +static int +virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_sock *vchild; + struct sock *child; + + if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { + virtio_transport_reset(vsk, pkt); + return -EINVAL; + } + + if (sk_acceptq_is_full(sk)) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, + sk->sk_type, 0); + if (!child) { + virtio_transport_reset(vsk, pkt); + return -ENOMEM; + } + + sk->sk_ack_backlog++; + + lock_sock_nested(child, SINGLE_DEPTH_NESTING); + + child->sk_state = SS_CONNECTED; + + vchild = vsock_sk(child); + vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + + vsock_insert_connected(vchild); + vsock_enqueue_accept(sk, child); + virtio_transport_send_response(vchild, pkt); + + release_sock(child); + + sk->sk_data_ready(sk); + return 0; +} + +static bool virtio_transport_space_update(struct sock *sk, + struct virtio_vsock_pkt *pkt) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); + vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); + vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +} + +/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex + * lock. 
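+ * The packet is matched to a socket (connected table first, then bound table), credit state is refreshed from the header, and the packet is dispatched on sk_state; non-stream packets and packets for unknown sockets are answered with RST.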
+ */ +void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) +{ + struct sockaddr_vm src, dst; + struct vsock_sock *vsk; + struct sock *sk; + bool space_available; + + vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid), + le32_to_cpu(pkt->hdr.src_port)); + vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid), + le32_to_cpu(pkt->hdr.dst_port)); + + trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, + dst.svm_cid, dst.svm_port, + le32_to_cpu(pkt->hdr.len), + le16_to_cpu(pkt->hdr.type), + le16_to_cpu(pkt->hdr.op), + le32_to_cpu(pkt->hdr.flags), + le32_to_cpu(pkt->hdr.buf_alloc), + le32_to_cpu(pkt->hdr.fwd_cnt)); + + if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + + /* The socket must be in connected or bound table + * otherwise send reset back + */ + sk = vsock_find_connected_socket(&src, &dst); + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { + (void)virtio_transport_reset_no_sock(pkt); + goto free_pkt; + } + } + + vsk = vsock_sk(sk); + + space_available = virtio_transport_space_update(sk, pkt); + + lock_sock(sk); + + /* Update CID in case it has changed after a transport reset event */ + vsk->local_addr.svm_cid = dst.svm_cid; + + if (space_available) + sk->sk_write_space(sk); + + switch (sk->sk_state) { + case VSOCK_SS_LISTEN: + virtio_transport_recv_listen(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTING: + virtio_transport_recv_connecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + case SS_CONNECTED: + virtio_transport_recv_connected(sk, pkt); + break; + case SS_DISCONNECTING: + virtio_transport_recv_disconnecting(sk, pkt); + virtio_transport_free_pkt(pkt); + break; + default: + virtio_transport_free_pkt(pkt); + break; + } + release_sock(sk); + + /* Release refcnt obtained when we fetched this socket out of the + * bound or connected list. + */ + sock_put(sk); + return; + +free_pkt: + virtio_transport_free_pkt(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); + +void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) +{ + kfree(pkt->buf); + kfree(pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Asias He"); +MODULE_DESCRIPTION("common code for virtio vsock"); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 4120b7a538be..4be4fbbc0b50 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1644,6 +1644,8 @@ static void vmci_transport_destruct(struct vsock_sock *vsk) static void vmci_transport_release(struct vsock_sock *vsk) { + vsock_remove_sock(vsk); + if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) { vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle); vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index da49c0b1fd32..b0e11b6dc994 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -715,7 +715,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, ASSERT_RTNL(); - if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) || + if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) || !(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR)) return false; |
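Note on the virtio vsock flow control added above: the transport uses a credit scheme in which a sender may have at most peer_buf_alloc bytes outstanding, where "outstanding" means tx_cnt (bytes we have sent) minus peer_fwd_cnt (bytes the peer reports having consumed). The stand-alone sketch below only illustrates that arithmetic; it uses the same formula as virtio_transport_get_credit() but with an invented struct and made-up values, not anything taken from the patch itself.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical snapshot of one side's credit state; the field names
 * mirror struct virtio_vsock_sock, the values below are invented. */
struct credit_state {
	uint32_t peer_buf_alloc;   /* receive buffer the peer advertised */
	uint32_t tx_cnt;           /* total bytes we have ever sent */
	uint32_t peer_fwd_cnt;     /* bytes the peer says it consumed */
};

/* Same arithmetic as virtio_transport_get_credit(): grant at most
 * 'wanted' bytes, capped by what the peer can still absorb. */
static uint32_t get_credit(struct credit_state *s, uint32_t wanted)
{
	uint32_t space = s->peer_buf_alloc - (s->tx_cnt - s->peer_fwd_cnt);

	if (wanted > space)
		wanted = space;
	s->tx_cnt += wanted;
	return wanted;
}

int main(void)
{
	struct credit_state s = {
		.peer_buf_alloc = 256 * 1024,
		.tx_cnt         = 300 * 1024,
		.peer_fwd_cnt   = 200 * 1024,
	};
	unsigned int want = 200U * 1024;

	/* 100 KiB are still in flight, so only 156 KiB of credit remain. */
	printf("granted %u of %u requested bytes\n",
	       (unsigned)get_credit(&s, want), want);
	return 0;
}

The counters themselves travel in every packet header (virtio_transport_inc_tx_pkt() fills in fwd_cnt/buf_alloc on the sending side, virtio_transport_space_update() reads them on the receiving side), which is how each peer learns that credit has been replenished.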