summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/ceph_common.c2
-rw-r--r--net/ceph/ceph_fs.c30
-rw-r--r--net/ceph/debugfs.c12
-rw-r--r--net/ceph/mon_client.c4
-rw-r--r--net/ceph/msgpool.c1
-rw-r--r--net/ceph/osd_client.c49
-rw-r--r--net/ceph/osdmap.c58
-rw-r--r--net/ceph/string_table.c111
-rw-r--r--net/dccp/ipv6.c12
-rw-r--r--net/ipv4/cipso_ipv4.c88
-rw-r--r--net/ipv4/tcp_input.c3
-rw-r--r--net/ipv4/tcp_output.c3
-rw-r--r--net/ipv6/Makefile1
-rw-r--r--net/ipv6/addrconf.c3
-rw-r--r--net/ipv6/af_inet6.c9
-rw-r--r--net/ipv6/calipso.c1473
-rw-r--r--net/ipv6/exthdrs.c76
-rw-r--r--net/ipv6/exthdrs_core.c2
-rw-r--r--net/ipv6/ipv6_sockglue.c1
-rw-r--r--net/ipv6/sysctl_net_ipv6.c19
-rw-r--r--net/ipv6/tcp_ipv6.c12
-rw-r--r--net/iucv/af_iucv.c5
-rw-r--r--net/netlabel/Kconfig1
-rw-r--r--net/netlabel/Makefile2
-rw-r--r--net/netlabel/netlabel_calipso.c740
-rw-r--r--net/netlabel/netlabel_calipso.h151
-rw-r--r--net/netlabel/netlabel_domainhash.c293
-rw-r--r--net/netlabel/netlabel_domainhash.h17
-rw-r--r--net/netlabel/netlabel_kapi.c394
-rw-r--r--net/netlabel/netlabel_mgmt.c85
-rw-r--r--net/netlabel/netlabel_mgmt.h27
-rw-r--r--net/netlabel/netlabel_unlabeled.c5
-rw-r--r--net/netlabel/netlabel_user.c5
-rw-r--r--net/sctp/output.c3
-rw-r--r--net/sctp/socket.c2
-rw-r--r--net/sctp/ulpqueue.c4
-rw-r--r--net/sunrpc/auth.c8
-rw-r--r--net/sunrpc/auth_generic.c9
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c3
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c2
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c12
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c5
-rw-r--r--net/sunrpc/auth_null.c1
-rw-r--r--net/sunrpc/auth_unix.c1
-rw-r--r--net/sunrpc/cache.c2
-rw-r--r--net/sunrpc/clnt.c2
-rw-r--r--net/sunrpc/rpc_pipe.c8
-rw-r--r--net/sunrpc/sched.c67
-rw-r--r--net/sunrpc/svc.c8
-rw-r--r--net/sunrpc/svc_xprt.c51
-rw-r--r--net/sunrpc/svcsock.c150
-rw-r--r--net/sunrpc/xprt.c14
-rw-r--r--net/sunrpc/xprtmultipath.c8
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c378
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c369
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c122
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c274
-rw-r--r--net/sunrpc/xprtrdma/transport.c40
-rw-r--r--net/sunrpc/xprtrdma/verbs.c242
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h118
-rw-r--r--net/sunrpc/xprtsock.c125
-rw-r--r--net/sysctl_net.c2
-rw-r--r--net/tipc/monitor.c2
-rw-r--r--net/vmw_vsock/Kconfig20
-rw-r--r--net/vmw_vsock/Makefile6
-rw-r--r--net/vmw_vsock/af_vsock.c25
-rw-r--r--net/vmw_vsock/virtio_transport.c624
-rw-r--r--net/vmw_vsock/virtio_transport_common.c992
-rw-r--r--net/vmw_vsock/vmci_transport.c2
-rw-r--r--net/wireless/chan.c2
72 files changed, 6105 insertions, 1296 deletions
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 958d9856912c..84cbed630c4b 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
crypto.o armor.o \
auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o \
- pagevec.o snapshot.o
+ pagevec.o snapshot.o string_table.o
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 55d2bfee16d7..bddfcf6f09c2 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -747,6 +747,8 @@ out:
static void __exit exit_ceph_lib(void)
{
dout("exit_ceph_lib\n");
+ WARN_ON(!ceph_strings_empty());
+
ceph_osdc_cleanup();
ceph_msgr_exit();
ceph_crypto_shutdown();
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 41466ccb972a..7d54e944de5e 100644
--- a/net/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -9,9 +9,9 @@
*/
int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
{
- __u32 su = le32_to_cpu(layout->fl_stripe_unit);
- __u32 sc = le32_to_cpu(layout->fl_stripe_count);
- __u32 os = le32_to_cpu(layout->fl_object_size);
+ __u32 su = layout->stripe_unit;
+ __u32 sc = layout->stripe_count;
+ __u32 os = layout->object_size;
/* stripe unit, object size must be non-zero, 64k increment */
if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
@@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
return 1;
}
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+ fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+ fl->object_size = le32_to_cpu(legacy->fl_object_size);
+ fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
+ if (fl->pool_id == 0)
+ fl->pool_id = -1;
+}
+EXPORT_SYMBOL(ceph_file_layout_from_legacy);
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+ legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+ legacy->fl_object_size = cpu_to_le32(fl->object_size);
+ if (fl->pool_id >= 0)
+ legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
+ else
+ legacy->fl_pg_pool = 0;
+}
+EXPORT_SYMBOL(ceph_file_layout_to_legacy);
int ceph_flags_to_mode(int flags)
{
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index e77b04ca7802..c62b2b029a6e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
seq_printf(s, "]/%d\t[", t->up.primary);
for (i = 0; i < t->acting.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
- seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
- t->target_oid.name_len, t->target_oid.name, t->flags);
+ seq_printf(s, "]/%d\t", t->acting.primary);
+ if (t->target_oloc.pool_ns) {
+ seq_printf(s, "%*pE/%*pE\t0x%x",
+ (int)t->target_oloc.pool_ns->len,
+ t->target_oloc.pool_ns->str,
+ t->target_oid.name_len, t->target_oid.name, t->flags);
+ } else {
+ seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len,
+ t->target_oid.name, t->flags);
+ }
if (t->paused)
seq_puts(s, "\tP");
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 37c38a7fb5c5..c83326c5ba58 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
}
const char *ceph_sub_str[] = {
- [CEPH_SUB_MDSMAP] = "mdsmap",
[CEPH_SUB_MONMAP] = "monmap",
[CEPH_SUB_OSDMAP] = "osdmap",
+ [CEPH_SUB_FSMAP] = "fsmap.user",
+ [CEPH_SUB_MDSMAP] = "mdsmap",
};
/*
@@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
case CEPH_MSG_MON_MAP:
case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP:
+ case CEPH_MSG_FS_MAP_USER:
m = ceph_msg_new(type, front_len, GFP_NOFS, false);
if (!m)
return NULL; /* ENOMEM--return skip == 0 */
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index ddec1c10ac80..aaed59a47b1d 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/vmalloc.h>
+#include <linux/ceph/messenger.h>
#include <linux/ceph/msgpool.h>
static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 89469592076c..b5ec09612ff7 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest,
static void target_destroy(struct ceph_osd_request_target *t)
{
ceph_oid_destroy(&t->base_oid);
+ ceph_oloc_destroy(&t->base_oloc);
ceph_oid_destroy(&t->target_oid);
+ ceph_oloc_destroy(&t->target_oloc);
}
/*
@@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);
+static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc)
+{
+ return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
+}
+
int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
{
struct ceph_osd_client *osdc = req->r_osdc;
@@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
int msg_size;
WARN_ON(ceph_oid_empty(&req->r_base_oid));
+ WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
/* create request message */
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
- msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
msg_size += 4 + req->r_base_oid.name_len; /* oid */
msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
@@ -932,7 +941,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
osd_req_op_init(req, which, opcode, 0);
} else {
- u32 object_size = le32_to_cpu(layout->fl_object_size);
+ u32 object_size = layout->object_size;
u32 object_base = off - objoff;
if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
if (truncate_size <= object_base) {
@@ -948,7 +957,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
}
req->r_flags = flags;
- req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+ req->r_base_oloc.pool = layout->pool_id;
+ req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
req->r_snapid = vino.snap;
@@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
p += sizeof(req->r_replay_version);
/* oloc */
- ceph_encode_8(&p, 4);
- ceph_encode_8(&p, 4);
- ceph_encode_32(&p, 8 + 4 + 4);
+ ceph_start_encoding(&p, 5, 4,
+ ceph_oloc_encoding_size(&req->r_t.target_oloc));
ceph_encode_64(&p, req->r_t.target_oloc.pool);
ceph_encode_32(&p, -1); /* preferred */
ceph_encode_32(&p, 0); /* key len */
+ if (req->r_t.target_oloc.pool_ns)
+ ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str,
+ req->r_t.target_oloc.pool_ns->len);
+ else
+ ceph_encode_32(&p, 0);
/* pgid */
ceph_encode_8(&p, 1);
@@ -2594,9 +2608,22 @@ static int ceph_oloc_decode(void **p, void *end,
}
if (struct_v >= 5) {
+ bool changed = false;
+
len = ceph_decode_32(p);
if (len > 0) {
- pr_warn("ceph_object_locator::nspace is set\n");
+ ceph_decode_need(p, end, len, e_inval);
+ if (!oloc->pool_ns ||
+ ceph_compare_string(oloc->pool_ns, *p, len))
+ changed = true;
+ *p += len;
+ } else {
+ if (oloc->pool_ns)
+ changed = true;
+ }
+ if (changed) {
+ /* redirect changes namespace */
+ pr_warn("ceph_object_locator::nspace is changed\n");
goto e_inval;
}
}
@@ -2806,7 +2833,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
goto out_unlock_session;
}
+ m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
ret = decode_MOSDOpReply(msg, &m);
+ m.redirect.oloc.pool_ns = NULL;
if (ret) {
pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
req->r_tid, ret);
@@ -2835,7 +2864,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
unlink_request(osd, req);
mutex_unlock(&osd->lock);
- ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+ /*
+ * Not ceph_oloc_copy() - changing pool_ns is not
+ * supported.
+ */
+ req->r_t.target_oloc.pool = m.redirect.oloc.pool;
req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
req->r_tid = 0;
__submit_request(req, false);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7e480bf75bcf..d2436880b305 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1510,6 +1510,24 @@ bad:
return ERR_PTR(err);
}
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src)
+{
+ WARN_ON(!ceph_oloc_empty(dest));
+ WARN_ON(dest->pool_ns); /* empty() only covers ->pool */
+
+ dest->pool = src->pool;
+ if (src->pool_ns)
+ dest->pool_ns = ceph_get_string(src->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_copy);
+
+void ceph_oloc_destroy(struct ceph_object_locator *oloc)
+{
+ ceph_put_string(oloc->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_destroy);
+
void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src)
{
@@ -1770,9 +1788,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 *ono,
u64 *oxoff, u64 *oxlen)
{
- u32 osize = le32_to_cpu(layout->fl_object_size);
- u32 su = le32_to_cpu(layout->fl_stripe_unit);
- u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 osize = layout->object_size;
+ u32 su = layout->stripe_unit;
+ u32 sc = layout->stripe_count;
u32 bl, stripeno, stripepos, objsetno;
u32 su_per_object;
u64 t, su_offset;
@@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
if (!pi)
return -ENOENT;
- raw_pgid->pool = oloc->pool;
- raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
- oid->name_len);
-
- dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
- raw_pgid->pool, raw_pgid->seed);
+ if (!oloc->pool_ns) {
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
+ dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+ raw_pgid->pool, raw_pgid->seed);
+ } else {
+ char stack_buf[256];
+ char *buf = stack_buf;
+ int nsl = oloc->pool_ns->len;
+ size_t total = nsl + 1 + oid->name_len;
+
+ if (total > sizeof(stack_buf)) {
+ buf = kmalloc(total, GFP_NOIO);
+ if (!buf)
+ return -ENOMEM;
+ }
+ memcpy(buf, oloc->pool_ns->str, nsl);
+ buf[nsl] = '\037';
+ memcpy(buf + nsl + 1, oid->name, oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
+ if (buf != stack_buf)
+ kfree(buf);
+ dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
+ oid->name, nsl, oloc->pool_ns->str,
+ raw_pgid->pool, raw_pgid->seed);
+ }
return 0;
}
EXPORT_SYMBOL(ceph_object_locator_to_pg);
diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
new file mode 100644
index 000000000000..ca53c8319209
--- /dev/null
+++ b/net/ceph/string_table.c
@@ -0,0 +1,111 @@
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/ceph/string_table.h>
+
+static DEFINE_SPINLOCK(string_tree_lock);
+static struct rb_root string_tree = RB_ROOT;
+
+struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
+{
+ struct ceph_string *cs, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&string_tree_lock);
+ p = &string_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist && !kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ exist = NULL;
+ }
+ spin_unlock(&string_tree_lock);
+ if (exist)
+ return exist;
+
+ cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
+ if (!cs)
+ return NULL;
+
+ kref_init(&cs->kref);
+ cs->len = len;
+ memcpy(cs->str, str, len);
+ cs->str[len] = 0;
+
+retry:
+ exist = NULL;
+ parent = NULL;
+ p = &string_tree.rb_node;
+ spin_lock(&string_tree_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ ret = 0;
+ if (!exist) {
+ rb_link_node(&cs->node, parent, p);
+ rb_insert_color(&cs->node, &string_tree);
+ } else if (!kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ ret = -EAGAIN;
+ }
+ spin_unlock(&string_tree_lock);
+ if (ret == -EAGAIN)
+ goto retry;
+
+ if (exist) {
+ kfree(cs);
+ cs = exist;
+ }
+
+ return cs;
+}
+EXPORT_SYMBOL(ceph_find_or_create_string);
+
+static void ceph_free_string(struct rcu_head *head)
+{
+ struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
+ kfree(cs);
+}
+
+void ceph_release_string(struct kref *ref)
+{
+ struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
+
+ spin_lock(&string_tree_lock);
+ if (!RB_EMPTY_NODE(&cs->node)) {
+ rb_erase(&cs->node, &string_tree);
+ RB_CLEAR_NODE(&cs->node);
+ }
+ spin_unlock(&string_tree_lock);
+
+ call_rcu(&cs->rcu, ceph_free_string);
+}
+EXPORT_SYMBOL(ceph_release_string);
+
+bool ceph_strings_empty(void)
+{
+ return RB_EMPTY_ROOT(&string_tree);
+}
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 3ff137d9471d..3828f94b234c 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -216,14 +216,17 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
skb = dccp_make_response(sk, dst, req);
if (skb != NULL) {
struct dccp_hdr *dh = dccp_hdr(skb);
+ struct ipv6_txoptions *opt;
dh->dccph_checksum = dccp_v6_csum_finish(skb,
&ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);
fl6.daddr = ireq->ir_v6_rmt_addr;
rcu_read_lock();
- err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
- np->tclass);
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
+ err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -236,6 +239,7 @@ done:
static void dccp_v6_reqsk_destructor(struct request_sock *req)
{
dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+ kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts);
}
@@ -494,7 +498,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
* Yes, keeping reference count would be much more clever, but we make
* one more one thing there: reattach optmem to newsk.
*/
- opt = rcu_dereference(np->opt);
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
if (opt) {
opt = ipv6_dup_options(newsk, opt);
RCU_INIT_POINTER(newnp->opt, opt);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 40d6b87713a1..72d6f056d863 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -135,76 +135,6 @@ int cipso_v4_rbm_strictvalid = 1;
*/
/**
- * cipso_v4_bitmap_walk - Walk a bitmap looking for a bit
- * @bitmap: the bitmap
- * @bitmap_len: length in bits
- * @offset: starting offset
- * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
- *
- * Description:
- * Starting at @offset, walk the bitmap from left to right until either the
- * desired bit is found or we reach the end. Return the bit offset, -1 if
- * not found, or -2 if error.
- */
-static int cipso_v4_bitmap_walk(const unsigned char *bitmap,
- u32 bitmap_len,
- u32 offset,
- u8 state)
-{
- u32 bit_spot;
- u32 byte_offset;
- unsigned char bitmask;
- unsigned char byte;
-
- /* gcc always rounds to zero when doing integer division */
- byte_offset = offset / 8;
- byte = bitmap[byte_offset];
- bit_spot = offset;
- bitmask = 0x80 >> (offset % 8);
-
- while (bit_spot < bitmap_len) {
- if ((state && (byte & bitmask) == bitmask) ||
- (state == 0 && (byte & bitmask) == 0))
- return bit_spot;
-
- bit_spot++;
- bitmask >>= 1;
- if (bitmask == 0) {
- byte = bitmap[++byte_offset];
- bitmask = 0x80;
- }
- }
-
- return -1;
-}
-
-/**
- * cipso_v4_bitmap_setbit - Sets a single bit in a bitmap
- * @bitmap: the bitmap
- * @bit: the bit
- * @state: if non-zero, set the bit (1) else clear the bit (0)
- *
- * Description:
- * Set a single bit in the bitmask. Returns zero on success, negative values
- * on error.
- */
-static void cipso_v4_bitmap_setbit(unsigned char *bitmap,
- u32 bit,
- u8 state)
-{
- u32 byte_spot;
- u8 bitmask;
-
- /* gcc always rounds to zero when doing integer division */
- byte_spot = bit / 8;
- bitmask = 0x80 >> (bit % 8);
- if (state)
- bitmap[byte_spot] |= bitmask;
- else
- bitmap[byte_spot] &= ~bitmask;
-}
-
-/**
* cipso_v4_cache_entry_free - Frees a cache entry
* @entry: the entry to free
*
@@ -840,10 +770,10 @@ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def,
cipso_cat_size = doi_def->map.std->cat.cipso_size;
cipso_array = doi_def->map.std->cat.cipso;
for (;;) {
- cat = cipso_v4_bitmap_walk(bitmap,
- bitmap_len_bits,
- cat + 1,
- 1);
+ cat = netlbl_bitmap_walk(bitmap,
+ bitmap_len_bits,
+ cat + 1,
+ 1);
if (cat < 0)
break;
if (cat >= cipso_cat_size ||
@@ -909,7 +839,7 @@ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def,
}
if (net_spot >= net_clen_bits)
return -ENOSPC;
- cipso_v4_bitmap_setbit(net_cat, net_spot, 1);
+ netlbl_bitmap_setbit(net_cat, net_spot, 1);
if (net_spot > net_spot_max)
net_spot_max = net_spot;
@@ -951,10 +881,10 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
}
for (;;) {
- net_spot = cipso_v4_bitmap_walk(net_cat,
- net_clen_bits,
- net_spot + 1,
- 1);
+ net_spot = netlbl_bitmap_walk(net_cat,
+ net_clen_bits,
+ net_spot + 1,
+ 1);
if (net_spot < 0) {
if (net_spot == -2)
return -EFAULT;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f9f9e375d7de..3ebf45b38bc3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6147,6 +6147,9 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
kmemcheck_annotate_bitfield(ireq, flags);
ireq->opt = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ ireq->pktopts = NULL;
+#endif
atomic64_set(&ireq->ir_cookie, 0);
ireq->ireq_state = TCP_NEW_SYN_RECV;
write_pnet(&ireq->ireq_net, sock_net(sk_listener));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b26aa870adc0..bdaef7fd6e47 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -236,7 +236,8 @@ void tcp_select_initial_window(int __space, __u32 mss,
/* Set window scaling on max possible window
* See RFC1323 for an explanation of the limit to 14
*/
- space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+ space = max_t(u32, space, sysctl_tcp_rmem[2]);
+ space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
while (space > 65535 && (*rcv_wscale) < 14) {
space >>= 1;
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 6d8ea099213e..c174ccb340a1 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -22,6 +22,7 @@ ipv6-$(CONFIG_NETFILTER) += netfilter.o
ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
ipv6-$(CONFIG_PROC_FS) += proc.o
ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
+ipv6-$(CONFIG_NETLABEL) += calipso.o
ipv6-objs += $(ipv6-y)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6287a8b9f428..ab3e796596b1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3624,8 +3624,7 @@ restart:
state = ifa->state;
ifa->state = INET6_IFADDR_STATE_DEAD;
- list_del(&ifa->if_list);
- list_add(&ifa->if_list, &del_list);
+ list_move(&ifa->if_list, &del_list);
}
spin_unlock_bh(&ifa->lock);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2076c21107d0..b454055ba625 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -60,6 +60,7 @@
#ifdef CONFIG_IPV6_TUNNEL
#include <net/ip6_tunnel.h>
#endif
+#include <net/calipso.h>
#include <asm/uaccess.h>
#include <linux/mroute6.h>
@@ -983,6 +984,10 @@ static int __init inet6_init(void)
if (err)
goto pingv6_fail;
+ err = calipso_init();
+ if (err)
+ goto calipso_fail;
+
#ifdef CONFIG_SYSCTL
err = ipv6_sysctl_register();
if (err)
@@ -993,8 +998,10 @@ out:
#ifdef CONFIG_SYSCTL
sysctl_fail:
- pingv6_exit();
+ calipso_exit();
#endif
+calipso_fail:
+ pingv6_exit();
pingv6_fail:
ipv6_packet_cleanup();
ipv6_packet_fail:
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
new file mode 100644
index 000000000000..c53b92c617c5
--- /dev/null
+++ b/net/ipv6/calipso.c
@@ -0,0 +1,1473 @@
+/*
+ * CALIPSO - Common Architecture Label IPv6 Security Option
+ *
+ * This is an implementation of the CALIPSO protocol as specified in
+ * RFC 5570.
+ *
+ * Authors: Paul Moore <paul.moore@hp.com>
+ * Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/jhash.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/netlabel.h>
+#include <net/calipso.h>
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+#include <linux/crc-ccitt.h>
+
+/* Maximium size of the calipso option including
+ * the two-byte TLV header.
+ */
+#define CALIPSO_OPT_LEN_MAX (2 + 252)
+
+/* Size of the minimum calipso option including
+ * the two-byte TLV header.
+ */
+#define CALIPSO_HDR_LEN (2 + 8)
+
+/* Maximium size of the calipso option including
+ * the two-byte TLV header and upto 3 bytes of
+ * leading pad and 7 bytes of trailing pad.
+ */
+#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7)
+
+ /* Maximium size of u32 aligned buffer required to hold calipso
+ * option. Max of 3 initial pad bytes starting from buffer + 3.
+ * i.e. the worst case is when the previous tlv finishes on 4n + 3.
+ */
+#define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX)
+
+/* List of available DOI definitions */
+static DEFINE_SPINLOCK(calipso_doi_list_lock);
+static LIST_HEAD(calipso_doi_list);
+
+/* Label mapping cache */
+int calipso_cache_enabled = 1;
+int calipso_cache_bucketsize = 10;
+#define CALIPSO_CACHE_BUCKETBITS 7
+#define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS)
+#define CALIPSO_CACHE_REORDERLIMIT 10
+struct calipso_map_cache_bkt {
+ spinlock_t lock;
+ u32 size;
+ struct list_head list;
+};
+
+struct calipso_map_cache_entry {
+ u32 hash;
+ unsigned char *key;
+ size_t key_len;
+
+ struct netlbl_lsm_cache *lsm_data;
+
+ u32 activity;
+ struct list_head list;
+};
+
+static struct calipso_map_cache_bkt *calipso_cache;
+
+/* Label Mapping Cache Functions
+ */
+
+/**
+ * calipso_cache_entry_free - Frees a cache entry
+ * @entry: the entry to free
+ *
+ * Description:
+ * This function frees the memory associated with a cache entry including the
+ * LSM cache data if there are no longer any users, i.e. reference count == 0.
+ *
+ */
+static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry)
+{
+ if (entry->lsm_data)
+ netlbl_secattr_cache_free(entry->lsm_data);
+ kfree(entry->key);
+ kfree(entry);
+}
+
+/**
+ * calipso_map_cache_hash - Hashing function for the CALIPSO cache
+ * @key: the hash key
+ * @key_len: the length of the key in bytes
+ *
+ * Description:
+ * The CALIPSO tag hashing function. Returns a 32-bit hash value.
+ *
+ */
+static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+/**
+ * calipso_cache_init - Initialize the CALIPSO cache
+ *
+ * Description:
+ * Initializes the CALIPSO label mapping cache, this function should be called
+ * before any of the other functions defined in this file. Returns zero on
+ * success, negative values on error.
+ *
+ */
+static int __init calipso_cache_init(void)
+{
+ u32 iter;
+
+ calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS,
+ sizeof(struct calipso_map_cache_bkt),
+ GFP_KERNEL);
+ if (!calipso_cache)
+ return -ENOMEM;
+
+ for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) {
+ spin_lock_init(&calipso_cache[iter].lock);
+ calipso_cache[iter].size = 0;
+ INIT_LIST_HEAD(&calipso_cache[iter].list);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_cache_invalidate - Invalidates the current CALIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CALIPSO cache. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+static void calipso_cache_invalidate(void)
+{
+ struct calipso_map_cache_entry *entry, *tmp_entry;
+ u32 iter;
+
+ for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) {
+ spin_lock_bh(&calipso_cache[iter].lock);
+ list_for_each_entry_safe(entry,
+ tmp_entry,
+ &calipso_cache[iter].list, list) {
+ list_del(&entry->list);
+ calipso_cache_entry_free(entry);
+ }
+ calipso_cache[iter].size = 0;
+ spin_unlock_bh(&calipso_cache[iter].lock);
+ }
+}
+
+/**
+ * calipso_cache_check - Check the CALIPSO cache for a label mapping
+ * @key: the buffer to check
+ * @key_len: buffer length in bytes
+ * @secattr: the security attribute struct to use
+ *
+ * Description:
+ * This function checks the cache to see if a label mapping already exists for
+ * the given key. If there is a match then the cache is adjusted and the
+ * @secattr struct is populated with the correct LSM security attributes. The
+ * cache is adjusted in the following manner if the entry is not already the
+ * first in the cache bucket:
+ *
+ * 1. The cache entry's activity counter is incremented
+ * 2. The previous (higher ranking) entry's activity counter is decremented
+ * 3. If the difference between the two activity counters is geater than
+ * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped
+ *
+ * Returns zero on success, -ENOENT for a cache miss, and other negative values
+ * on error.
+ *
+ */
+static int calipso_cache_check(const unsigned char *key,
+ u32 key_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ u32 bkt;
+ struct calipso_map_cache_entry *entry;
+ struct calipso_map_cache_entry *prev_entry = NULL;
+ u32 hash;
+
+ if (!calipso_cache_enabled)
+ return -ENOENT;
+
+ hash = calipso_map_cache_hash(key, key_len);
+ bkt = hash & (CALIPSO_CACHE_BUCKETS - 1);
+ spin_lock_bh(&calipso_cache[bkt].lock);
+ list_for_each_entry(entry, &calipso_cache[bkt].list, list) {
+ if (entry->hash == hash &&
+ entry->key_len == key_len &&
+ memcmp(entry->key, key, key_len) == 0) {
+ entry->activity += 1;
+ atomic_inc(&entry->lsm_data->refcount);
+ secattr->cache = entry->lsm_data;
+ secattr->flags |= NETLBL_SECATTR_CACHE;
+ secattr->type = NETLBL_NLTYPE_CALIPSO;
+ if (!prev_entry) {
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+ return 0;
+ }
+
+ if (prev_entry->activity > 0)
+ prev_entry->activity -= 1;
+ if (entry->activity > prev_entry->activity &&
+ entry->activity - prev_entry->activity >
+ CALIPSO_CACHE_REORDERLIMIT) {
+ __list_del(entry->list.prev, entry->list.next);
+ __list_add(&entry->list,
+ prev_entry->list.prev,
+ &prev_entry->list);
+ }
+
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+ return 0;
+ }
+ prev_entry = entry;
+ }
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+
+ return -ENOENT;
+}
+
+/**
+ * calipso_cache_add - Add an entry to the CALIPSO cache
+ * @calipso_ptr: the CALIPSO option
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CALIPSO label mapping cache. Add the new entry to
+ * head of the cache bucket's list, if the cache bucket is out of room remove
+ * the last entry in the list first. It is important to note that there is
+ * currently no checking for duplicate keys. Returns zero on success,
+ * negative values on failure. The key stored starts at calipso_ptr + 2,
+ * i.e. the type and length bytes are not stored, this corresponds to
+ * calipso_ptr[1] bytes of data.
+ *
+ */
+static int calipso_cache_add(const unsigned char *calipso_ptr,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -EPERM;
+ u32 bkt;
+ struct calipso_map_cache_entry *entry = NULL;
+ struct calipso_map_cache_entry *old_entry = NULL;
+ u32 calipso_ptr_len;
+
+ if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0)
+ return 0;
+
+ calipso_ptr_len = calipso_ptr[1];
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ return -ENOMEM;
+ entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC);
+ if (!entry->key) {
+ ret_val = -ENOMEM;
+ goto cache_add_failure;
+ }
+ entry->key_len = calipso_ptr_len;
+ entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len);
+ atomic_inc(&secattr->cache->refcount);
+ entry->lsm_data = secattr->cache;
+
+ bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1);
+ spin_lock_bh(&calipso_cache[bkt].lock);
+ if (calipso_cache[bkt].size < calipso_cache_bucketsize) {
+ list_add(&entry->list, &calipso_cache[bkt].list);
+ calipso_cache[bkt].size += 1;
+ } else {
+ old_entry = list_entry(calipso_cache[bkt].list.prev,
+ struct calipso_map_cache_entry, list);
+ list_del(&old_entry->list);
+ list_add(&entry->list, &calipso_cache[bkt].list);
+ calipso_cache_entry_free(old_entry);
+ }
+ spin_unlock_bh(&calipso_cache[bkt].lock);
+
+ return 0;
+
+cache_add_failure:
+ if (entry)
+ calipso_cache_entry_free(entry);
+ return ret_val;
+}
+
+/* DOI List Functions
+ */
+
+/**
+ * calipso_doi_search - Searches for a DOI definition
+ * @doi: the DOI to search for
+ *
+ * Description:
+ * Search the DOI definition list for a DOI definition with a DOI value that
+ * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
+ * Returns a pointer to the DOI definition on success and NULL on failure.
+ */
+static struct calipso_doi *calipso_doi_search(u32 doi)
+{
+ struct calipso_doi *iter;
+
+ list_for_each_entry_rcu(iter, &calipso_doi_list, list)
+ if (iter->doi == doi && atomic_read(&iter->refcount))
+ return iter;
+ return NULL;
+}
+
+/**
+ * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CALIPSO engine and calls this
+ * function to add it to the list of acceptable domains. The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see calipso.h for details). Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+static int calipso_doi_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -EINVAL;
+ u32 doi;
+ u32 doi_type;
+ struct audit_buffer *audit_buf;
+
+ doi = doi_def->doi;
+ doi_type = doi_def->type;
+
+ if (doi_def->doi == CALIPSO_DOI_UNKNOWN)
+ goto doi_add_return;
+
+ atomic_set(&doi_def->refcount, 1);
+
+ spin_lock(&calipso_doi_list_lock);
+ if (calipso_doi_search(doi_def->doi)) {
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = -EEXIST;
+ goto doi_add_return;
+ }
+ list_add_tail_rcu(&doi_def->list, &calipso_doi_list);
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = 0;
+
+doi_add_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info);
+ if (audit_buf) {
+ const char *type_str;
+
+ switch (doi_type) {
+ case CALIPSO_MAP_PASS:
+ type_str = "pass";
+ break;
+ default:
+ type_str = "(unknown)";
+ }
+ audit_log_format(audit_buf,
+ " calipso_doi=%u calipso_type=%s res=%u",
+ doi, type_str, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * calipso_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+static void calipso_doi_free(struct calipso_doi *doi_def)
+{
+ kfree(doi_def);
+}
+
+/**
+ * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer
+ * @entry: the entry's RCU field
+ *
+ * Description:
+ * This function is designed to be used as a callback to the call_rcu()
+ * function so that the memory allocated to the DOI definition can be released
+ * safely.
+ *
+ */
+static void calipso_doi_free_rcu(struct rcu_head *entry)
+{
+ struct calipso_doi *doi_def;
+
+ doi_def = container_of(entry, struct calipso_doi, rcu);
+ calipso_doi_free(doi_def);
+}
+
+/**
+ * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_secid: the LSM secid to use in the audit message
+ *
+ * Description:
+ * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct calipso_doi *doi_def;
+ struct audit_buffer *audit_buf;
+
+ spin_lock(&calipso_doi_list_lock);
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def) {
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = -ENOENT;
+ goto doi_remove_return;
+ }
+ if (!atomic_dec_and_test(&doi_def->refcount)) {
+ spin_unlock(&calipso_doi_list_lock);
+ ret_val = -EBUSY;
+ goto doi_remove_return;
+ }
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&calipso_doi_list_lock);
+
+ call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+ ret_val = 0;
+
+doi_remove_return:
+ audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info);
+ if (audit_buf) {
+ audit_log_format(audit_buf,
+ " calipso_doi=%u res=%u",
+ doi, ret_val == 0 ? 1 : 0);
+ audit_log_end(audit_buf);
+ }
+
+ return ret_val;
+}
+
+/**
+ * calipso_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller. Otherwise NULL is returned. The caller must ensure that
+ * calipso_doi_putdef() is called when the caller is done.
+ *
+ */
+static struct calipso_doi *calipso_doi_getdef(u32 doi)
+{
+ struct calipso_doi *doi_def;
+
+ rcu_read_lock();
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def)
+ goto doi_getdef_return;
+ if (!atomic_inc_not_zero(&doi_def->refcount))
+ doi_def = NULL;
+
+doi_getdef_return:
+ rcu_read_unlock();
+ return doi_def;
+}
+
+/**
+ * calipso_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from calipso_doi_getdef().
+ *
+ */
+static void calipso_doi_putdef(struct calipso_doi *doi_def)
+{
+ if (!doi_def)
+ return;
+
+ if (!atomic_dec_and_test(&doi_def->refcount))
+ return;
+ spin_lock(&calipso_doi_list_lock);
+ list_del_rcu(&doi_def->list);
+ spin_unlock(&calipso_doi_list_lock);
+
+ call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+}
+
+/**
+ * calipso_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return. Updates the value in @skip_cnt upon
+ * return. Returns zero on success, negative values on failure.
+ *
+ */
+static int calipso_doi_walk(u32 *skip_cnt,
+ int (*callback)(struct calipso_doi *doi_def,
+ void *arg),
+ void *cb_arg)
+{
+ int ret_val = -ENOENT;
+ u32 doi_cnt = 0;
+ struct calipso_doi *iter_doi;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list)
+ if (atomic_read(&iter_doi->refcount) > 0) {
+ if (doi_cnt++ < *skip_cnt)
+ continue;
+ ret_val = callback(iter_doi, cb_arg);
+ if (ret_val < 0) {
+ doi_cnt--;
+ goto doi_walk_return;
+ }
+ }
+
+doi_walk_return:
+ rcu_read_unlock();
+ *skip_cnt = doi_cnt;
+ return ret_val;
+}
+
+/**
+ * calipso_validate - Validate a CALIPSO option
+ * @skb: the packet
+ * @option: the start of the option
+ *
+ * Description:
+ * This routine is called to validate a CALIPSO option.
+ * If the option is valid then %true is returned, otherwise
+ * %false is returned.
+ *
+ * The caller should have already checked that the length of the
+ * option (including the TLV header) is >= 10 and that the catmap
+ * length is consistent with the option length.
+ *
+ * We leave checks on the level and categories to the socket layer.
+ */
+bool calipso_validate(const struct sk_buff *skb, const unsigned char *option)
+{
+ struct calipso_doi *doi_def;
+ bool ret_val;
+ u16 crc, len = option[1] + 2;
+ static const u8 zero[2];
+
+ /* The original CRC runs over the option including the TLV header
+ * with the CRC-16 field (at offset 8) zeroed out. */
+ crc = crc_ccitt(0xffff, option, 8);
+ crc = crc_ccitt(crc, zero, sizeof(zero));
+ if (len > 10)
+ crc = crc_ccitt(crc, option + 10, len - 10);
+ crc = ~crc;
+ if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff))
+ return false;
+
+ rcu_read_lock();
+ doi_def = calipso_doi_search(get_unaligned_be32(option + 2));
+ ret_val = !!doi_def;
+ rcu_read_unlock();
+
+ return ret_val;
+}
+
+/**
+ * calipso_map_cat_hton - Perform a category mapping from host to network
+ * @doi_def: the DOI definition
+ * @secattr: the security attributes
+ * @net_cat: the zero'd out category bitmap in network/CALIPSO format
+ * @net_cat_len: the length of the CALIPSO bitmap in bytes
+ *
+ * Description:
+ * Perform a label mapping to translate a local MLS category bitmap to the
+ * correct CALIPSO bitmap using the given DOI definition. Returns the minimum
+ * size in bytes of the network bitmap on success, negative values otherwise.
+ *
+ */
+static int calipso_map_cat_hton(const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr,
+ unsigned char *net_cat,
+ u32 net_cat_len)
+{
+ int spot = -1;
+ u32 net_spot_max = 0;
+ u32 net_clen_bits = net_cat_len * 8;
+
+ for (;;) {
+ spot = netlbl_catmap_walk(secattr->attr.mls.cat,
+ spot + 1);
+ if (spot < 0)
+ break;
+ if (spot >= net_clen_bits)
+ return -ENOSPC;
+ netlbl_bitmap_setbit(net_cat, spot, 1);
+
+ if (spot > net_spot_max)
+ net_spot_max = spot;
+ }
+
+ return (net_spot_max / 32 + 1) * 4;
+}
+
+/**
+ * calipso_map_cat_ntoh - Perform a category mapping from network to host
+ * @doi_def: the DOI definition
+ * @net_cat: the category bitmap in network/CALIPSO format
+ * @net_cat_len: the length of the CALIPSO bitmap in bytes
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Perform a label mapping to translate a CALIPSO bitmap to the correct local
+ * MLS category bitmap using the given DOI definition. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def,
+ const unsigned char *net_cat,
+ u32 net_cat_len,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ int spot = -1;
+ u32 net_clen_bits = net_cat_len * 8;
+
+ for (;;) {
+ spot = netlbl_bitmap_walk(net_cat,
+ net_clen_bits,
+ spot + 1,
+ 1);
+ if (spot < 0) {
+ if (spot == -2)
+ return -EFAULT;
+ return 0;
+ }
+
+ ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
+ spot,
+ GFP_ATOMIC);
+ if (ret_val != 0)
+ return ret_val;
+ }
+
+ return -EINVAL;
+}
+
+/**
+ * calipso_pad_write - Writes pad bytes in TLV format
+ * @buf: the buffer
+ * @offset: offset from start of buffer to write padding
+ * @count: number of pad bytes to write
+ *
+ * Description:
+ * Write @count bytes of TLV padding into @buffer starting at offset @offset.
+ * @count should be less than 8 - see RFC 4942.
+ *
+ */
+static int calipso_pad_write(unsigned char *buf, unsigned int offset,
+ unsigned int count)
+{
+ if (WARN_ON_ONCE(count >= 8))
+ return -EINVAL;
+
+ switch (count) {
+ case 0:
+ break;
+ case 1:
+ buf[offset] = IPV6_TLV_PAD1;
+ break;
+ default:
+ buf[offset] = IPV6_TLV_PADN;
+ buf[offset + 1] = count - 2;
+ if (count > 2)
+ memset(buf + offset + 2, 0, count - 2);
+ break;
+ }
+ return 0;
+}
+
+/**
+ * calipso_genopt - Generate a CALIPSO option
+ * @buf: the option buffer
+ * @start: offset from which to write
+ * @buf_len: the size of opt_buf
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Generate a CALIPSO option using the DOI definition and security attributes
+ * passed to the function. This also generates upto three bytes of leading
+ * padding that ensures that the option is 4n + 2 aligned. It returns the
+ * number of bytes written (including any initial padding).
+ */
+static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ u32 len, pad;
+ u16 crc;
+ static const unsigned char padding[4] = {2, 1, 0, 3};
+ unsigned char *calipso;
+
+ /* CALIPSO has 4n + 2 alignment */
+ pad = padding[start & 3];
+ if (buf_len <= start + pad + CALIPSO_HDR_LEN)
+ return -ENOSPC;
+
+ if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0)
+ return -EPERM;
+
+ len = CALIPSO_HDR_LEN;
+
+ if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
+ ret_val = calipso_map_cat_hton(doi_def,
+ secattr,
+ buf + start + pad + len,
+ buf_len - start - pad - len);
+ if (ret_val < 0)
+ return ret_val;
+ len += ret_val;
+ }
+
+ calipso_pad_write(buf, start, pad);
+ calipso = buf + start + pad;
+
+ calipso[0] = IPV6_TLV_CALIPSO;
+ calipso[1] = len - 2;
+ *(__be32 *)(calipso + 2) = htonl(doi_def->doi);
+ calipso[6] = (len - CALIPSO_HDR_LEN) / 4;
+ calipso[7] = secattr->attr.mls.lvl,
+ crc = ~crc_ccitt(0xffff, calipso, len);
+ calipso[8] = crc & 0xff;
+ calipso[9] = (crc >> 8) & 0xff;
+ return pad + len;
+}
+
+/* Hop-by-hop hdr helper functions
+ */
+
+/**
+ * calipso_opt_update - Replaces socket's hop options with a new set
+ * @sk: the socket
+ * @hop: new hop options
+ *
+ * Description:
+ * Replaces @sk's hop options with @hop. @hop may be NULL to leave
+ * the socket with no hop options.
+ *
+ */
+static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
+{
+ struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
+
+ txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS,
+ hop, hop ? ipv6_optlen(hop) : 0);
+ txopt_put(old);
+ if (IS_ERR(txopts))
+ return PTR_ERR(txopts);
+
+ txopts = ipv6_update_options(sk, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_tlv_len - Returns the length of the TLV
+ * @opt: the option header
+ * @offset: offset of the TLV within the header
+ *
+ * Description:
+ * Returns the length of the TLV option at offset @offset within
+ * the option header @opt. Checks that the entire TLV fits inside
+ * the option header, returns a negative value if this is not the case.
+ */
+static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset)
+{
+ unsigned char *tlv = (unsigned char *)opt;
+ unsigned int opt_len = ipv6_optlen(opt), tlv_len;
+
+ if (offset < sizeof(*opt) || offset >= opt_len)
+ return -EINVAL;
+ if (tlv[offset] == IPV6_TLV_PAD1)
+ return 1;
+ if (offset + 1 >= opt_len)
+ return -EINVAL;
+ tlv_len = tlv[offset + 1] + 2;
+ if (offset + tlv_len > opt_len)
+ return -EINVAL;
+ return tlv_len;
+}
+
+/**
+ * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header
+ * @hop: the hop options header
+ * @start: on return holds the offset of any leading padding
+ * @end: on return holds the offset of the first non-pad TLV after CALIPSO
+ *
+ * Description:
+ * Finds the space occupied by a CALIPSO option (including any leading and
+ * trailing padding).
+ *
+ * If a CALIPSO option exists set @start and @end to the
+ * offsets within @hop of the start of padding before the first
+ * CALIPSO option and the end of padding after the first CALIPSO
+ * option. In this case the function returns 0.
+ *
+ * In the absence of a CALIPSO option, @start and @end will be
+ * set to the start and end of any trailing padding in the header.
+ * This is useful when appending a new option, as the caller may want
+ * to overwrite some of this padding. In this case the function will
+ * return -ENOENT.
+ */
+static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start,
+ unsigned int *end)
+{
+ int ret_val = -ENOENT, tlv_len;
+ unsigned int opt_len, offset, offset_s = 0, offset_e = 0;
+ unsigned char *opt = (unsigned char *)hop;
+
+ opt_len = ipv6_optlen(hop);
+ offset = sizeof(*hop);
+
+ while (offset < opt_len) {
+ tlv_len = calipso_tlv_len(hop, offset);
+ if (tlv_len < 0)
+ return tlv_len;
+
+ switch (opt[offset]) {
+ case IPV6_TLV_PAD1:
+ case IPV6_TLV_PADN:
+ if (offset_e)
+ offset_e = offset;
+ break;
+ case IPV6_TLV_CALIPSO:
+ ret_val = 0;
+ offset_e = offset;
+ break;
+ default:
+ if (offset_e == 0)
+ offset_s = offset;
+ else
+ goto out;
+ }
+ offset += tlv_len;
+ }
+
+out:
+ if (offset_s)
+ *start = offset_s + calipso_tlv_len(hop, offset_s);
+ else
+ *start = sizeof(*hop);
+ if (offset_e)
+ *end = offset_e + calipso_tlv_len(hop, offset_e);
+ else
+ *end = opt_len;
+
+ return ret_val;
+}
+
+/**
+ * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr
+ * @hop: the original hop options header
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Creates a new hop options header based on @hop with a
+ * CALIPSO option added to it. If @hop already contains a CALIPSO
+ * option this is overwritten, otherwise the new option is appended
+ * after any existing options. If @hop is NULL then the new header
+ * will contain just the CALIPSO option and any needed padding.
+ *
+ */
+static struct ipv6_opt_hdr *
+calipso_opt_insert(struct ipv6_opt_hdr *hop,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ unsigned int start, end, buf_len, pad, hop_len;
+ struct ipv6_opt_hdr *new;
+ int ret_val;
+
+ if (hop) {
+ hop_len = ipv6_optlen(hop);
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val && ret_val != -ENOENT)
+ return ERR_PTR(ret_val);
+ } else {
+ hop_len = 0;
+ start = sizeof(*hop);
+ end = 0;
+ }
+
+ buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD;
+ new = kzalloc(buf_len, GFP_ATOMIC);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (start > sizeof(*hop))
+ memcpy(new, hop, start);
+ ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def,
+ secattr);
+ if (ret_val < 0)
+ return ERR_PTR(ret_val);
+
+ buf_len = start + ret_val;
+ /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */
+ pad = ((buf_len & 4) + (end & 7)) & 7;
+ calipso_pad_write((unsigned char *)new, buf_len, pad);
+ buf_len += pad;
+
+ if (end != hop_len) {
+ memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end);
+ buf_len += hop_len - end;
+ }
+ new->nexthdr = 0;
+ new->hdrlen = buf_len / 8 - 1;
+
+ return new;
+}
+
+/**
+ * calipso_opt_del - Removes the CALIPSO option from an option header
+ * @hop: the original header
+ * @new: the new header
+ *
+ * Description:
+ * Creates a new header based on @hop without any CALIPSO option. If @hop
+ * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains
+ * no other non-padding options, it returns zero with @new set to NULL.
+ * Otherwise it returns zero, creates a new header without the CALIPSO
+ * option (and removing as much padding as possible) and returns with
+ * @new set to that header.
+ *
+ */
+static int calipso_opt_del(struct ipv6_opt_hdr *hop,
+ struct ipv6_opt_hdr **new)
+{
+ int ret_val;
+ unsigned int start, end, delta, pad, hop_len;
+
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val)
+ return ret_val;
+
+ hop_len = ipv6_optlen(hop);
+ if (start == sizeof(*hop) && end == hop_len) {
+ /* There's no other option in the header so return NULL */
+ *new = NULL;
+ return 0;
+ }
+
+ delta = (end - start) & ~7;
+ *new = kzalloc(hop_len - delta, GFP_ATOMIC);
+ if (!*new)
+ return -ENOMEM;
+
+ memcpy(*new, hop, start);
+ (*new)->hdrlen -= delta / 8;
+ pad = (end - start) & 7;
+ calipso_pad_write((unsigned char *)*new, start, pad);
+ if (end != hop_len)
+ memcpy((char *)*new + start + pad, (char *)hop + end,
+ hop_len - end);
+
+ return 0;
+}
+
+/**
+ * calipso_opt_getattr - Get the security attributes from a memory block
+ * @calipso: the CALIPSO option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @calipso and return the security attributes in @secattr.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_opt_getattr(const unsigned char *calipso,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ u32 doi, len = calipso[1], cat_len = calipso[6] * 4;
+ struct calipso_doi *doi_def;
+
+ if (cat_len + 8 > len)
+ return -EINVAL;
+
+ if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0)
+ return 0;
+
+ doi = get_unaligned_be32(calipso + 2);
+ rcu_read_lock();
+ doi_def = calipso_doi_search(doi);
+ if (!doi_def)
+ goto getattr_return;
+
+ secattr->attr.mls.lvl = calipso[7];
+ secattr->flags |= NETLBL_SECATTR_MLS_LVL;
+
+ if (cat_len) {
+ ret_val = calipso_map_cat_ntoh(doi_def,
+ calipso + 10,
+ cat_len,
+ secattr);
+ if (ret_val != 0) {
+ netlbl_catmap_free(secattr->attr.mls.cat);
+ goto getattr_return;
+ }
+
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ }
+
+ secattr->type = NETLBL_NLTYPE_CALIPSO;
+
+getattr_return:
+ rcu_read_unlock();
+ return ret_val;
+}
+
+/* sock functions.
+ */
+
+/**
+ * calipso_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CALIPSO option attached to the sock and if
+ * there is return the CALIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+static int calipso_sock_getattr(struct sock *sk,
+ struct netlbl_lsm_secattr *secattr)
+{
+ struct ipv6_opt_hdr *hop;
+ int opt_len, len, ret_val = -ENOMSG, offset;
+ unsigned char *opt;
+ struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+ if (!txopts || !txopts->hopopt)
+ goto done;
+
+ hop = txopts->hopopt;
+ opt = (unsigned char *)hop;
+ opt_len = ipv6_optlen(hop);
+ offset = sizeof(*hop);
+ while (offset < opt_len) {
+ len = calipso_tlv_len(hop, offset);
+ if (len < 0) {
+ ret_val = len;
+ goto done;
+ }
+ switch (opt[offset]) {
+ case IPV6_TLV_CALIPSO:
+ if (len < CALIPSO_HDR_LEN)
+ ret_val = -EINVAL;
+ else
+ ret_val = calipso_opt_getattr(&opt[offset],
+ secattr);
+ goto done;
+ default:
+ offset += len;
+ break;
+ }
+ }
+done:
+ txopt_put(txopts);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_setattr - Add a CALIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+static int calipso_sock_setattr(struct sock *sk,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct ipv6_opt_hdr *old, *new;
+ struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+ old = NULL;
+ if (txopts)
+ old = txopts->hopopt;
+
+ new = calipso_opt_insert(old, doi_def, secattr);
+ txopt_put(txopts);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ ret_val = calipso_opt_update(sk, new);
+
+ kfree(new);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_delattr - Delete the CALIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a socket, if present.
+ *
+ */
+static void calipso_sock_delattr(struct sock *sk)
+{
+ struct ipv6_opt_hdr *new_hop;
+ struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+
+ if (!txopts || !txopts->hopopt)
+ goto done;
+
+ if (calipso_opt_del(txopts->hopopt, &new_hop))
+ goto done;
+
+ calipso_opt_update(sk, new_hop);
+ kfree(new_hop);
+
+done:
+ txopt_put(txopts);
+}
+
+/* request sock functions.
+ */
+
+/**
+ * calipso_req_setattr - Add a CALIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+static int calipso_req_setattr(struct request_sock *req,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ struct ipv6_txoptions *txopts;
+ struct inet_request_sock *req_inet = inet_rsk(req);
+ struct ipv6_opt_hdr *old, *new;
+ struct sock *sk = sk_to_full_sk(req_to_sk(req));
+
+ if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt)
+ old = req_inet->ipv6_opt->hopopt;
+ else
+ old = NULL;
+
+ new = calipso_opt_insert(old, doi_def, secattr);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
+ new, new ? ipv6_optlen(new) : 0);
+
+ kfree(new);
+
+ if (IS_ERR(txopts))
+ return PTR_ERR(txopts);
+
+ txopts = xchg(&req_inet->ipv6_opt, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+
+ return 0;
+}
+
+/**
+ * calipso_req_delattr - Delete the CALIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a request socket, if present.
+ *
+ */
+static void calipso_req_delattr(struct request_sock *req)
+{
+ struct inet_request_sock *req_inet = inet_rsk(req);
+ struct ipv6_opt_hdr *new;
+ struct ipv6_txoptions *txopts;
+ struct sock *sk = sk_to_full_sk(req_to_sk(req));
+
+ if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt)
+ return;
+
+ if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
+ return; /* Nothing to do */
+
+ txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
+ new, new ? ipv6_optlen(new) : 0);
+
+ if (!IS_ERR(txopts)) {
+ txopts = xchg(&req_inet->ipv6_opt, txopts);
+ if (txopts) {
+ atomic_sub(txopts->tot_len, &sk->sk_omem_alloc);
+ txopt_put(txopts);
+ }
+ }
+ kfree(new);
+}
+
+/* skbuff functions.
+ */
+
+/**
+ * calipso_skbuff_optptr - Find the CALIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer
+ * to the start of the CALIPSO option on success, NULL if one if not found.
+ *
+ */
+static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb);
+ int offset;
+
+ if (ip6_hdr->nexthdr != NEXTHDR_HOP)
+ return NULL;
+
+ offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO);
+ if (offset >= 0)
+ return (unsigned char *)ip6_hdr + offset;
+
+ return NULL;
+}
+
+/**
+ * calipso_skbuff_setattr - Set the CALIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CALIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+static int calipso_skbuff_setattr(struct sk_buff *skb,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val;
+ struct ipv6hdr *ip6_hdr;
+ struct ipv6_opt_hdr *hop;
+ unsigned char buf[CALIPSO_MAX_BUFFER];
+ int len_delta, new_end, pad;
+ unsigned int start, end;
+
+ ip6_hdr = ipv6_hdr(skb);
+ if (ip6_hdr->nexthdr == NEXTHDR_HOP) {
+ hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ ret_val = calipso_opt_find(hop, &start, &end);
+ if (ret_val && ret_val != -ENOENT)
+ return ret_val;
+ } else {
+ start = 0;
+ end = 0;
+ }
+
+ memset(buf, 0, sizeof(buf));
+ ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr);
+ if (ret_val < 0)
+ return ret_val;
+
+ new_end = start + ret_val;
+ /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */
+ pad = ((new_end & 4) + (end & 7)) & 7;
+ len_delta = new_end - (int)end + pad;
+ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ if (ret_val < 0)
+ return ret_val;
+
+ if (len_delta) {
+ if (len_delta > 0)
+ skb_push(skb, len_delta);
+ else
+ skb_pull(skb, -len_delta);
+ memmove((char *)ip6_hdr - len_delta, ip6_hdr,
+ sizeof(*ip6_hdr) + start);
+ skb_reset_network_header(skb);
+ ip6_hdr = ipv6_hdr(skb);
+ }
+
+ hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ if (start == 0) {
+ struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf;
+
+ new_hop->nexthdr = ip6_hdr->nexthdr;
+ new_hop->hdrlen = len_delta / 8 - 1;
+ ip6_hdr->nexthdr = NEXTHDR_HOP;
+ } else {
+ hop->hdrlen += len_delta / 8;
+ }
+ memcpy((char *)hop + start, buf + (start & 3), new_end - start);
+ calipso_pad_write((unsigned char *)hop, new_end, pad);
+
+ return 0;
+}
+
+/**
+ * calipso_skbuff_delattr - Delete any CALIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CALIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int calipso_skbuff_delattr(struct sk_buff *skb)
+{
+ int ret_val;
+ struct ipv6hdr *ip6_hdr;
+ struct ipv6_opt_hdr *old_hop;
+ u32 old_hop_len, start = 0, end = 0, delta, size, pad;
+
+ if (!calipso_skbuff_optptr(skb))
+ return 0;
+
+ /* since we are changing the packet we should make a copy */
+ ret_val = skb_cow(skb, skb_headroom(skb));
+ if (ret_val < 0)
+ return ret_val;
+
+ ip6_hdr = ipv6_hdr(skb);
+ old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
+ old_hop_len = ipv6_optlen(old_hop);
+
+ ret_val = calipso_opt_find(old_hop, &start, &end);
+ if (ret_val)
+ return ret_val;
+
+ if (start == sizeof(*old_hop) && end == old_hop_len) {
+ /* There's no other option in the header so we delete
+ * the whole thing. */
+ delta = old_hop_len;
+ size = sizeof(*ip6_hdr);
+ ip6_hdr->nexthdr = old_hop->nexthdr;
+ } else {
+ delta = (end - start) & ~7;
+ if (delta)
+ old_hop->hdrlen -= delta / 8;
+ pad = (end - start) & 7;
+ size = sizeof(*ip6_hdr) + start + pad;
+ calipso_pad_write((unsigned char *)old_hop, start, pad);
+ }
+
+ if (delta) {
+ skb_pull(skb, delta);
+ memmove((char *)ip6_hdr + delta, ip6_hdr, size);
+ skb_reset_network_header(skb);
+ }
+
+ return 0;
+}
+
+static const struct netlbl_calipso_ops ops = {
+ .doi_add = calipso_doi_add,
+ .doi_free = calipso_doi_free,
+ .doi_remove = calipso_doi_remove,
+ .doi_getdef = calipso_doi_getdef,
+ .doi_putdef = calipso_doi_putdef,
+ .doi_walk = calipso_doi_walk,
+ .sock_getattr = calipso_sock_getattr,
+ .sock_setattr = calipso_sock_setattr,
+ .sock_delattr = calipso_sock_delattr,
+ .req_setattr = calipso_req_setattr,
+ .req_delattr = calipso_req_delattr,
+ .opt_getattr = calipso_opt_getattr,
+ .skbuff_optptr = calipso_skbuff_optptr,
+ .skbuff_setattr = calipso_skbuff_setattr,
+ .skbuff_delattr = calipso_skbuff_delattr,
+ .cache_invalidate = calipso_cache_invalidate,
+ .cache_add = calipso_cache_add
+};
+
+/**
+ * calipso_init - Initialize the CALIPSO module
+ *
+ * Description:
+ * Initialize the CALIPSO module and prepare it for use. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int __init calipso_init(void)
+{
+ int ret_val;
+
+ ret_val = calipso_cache_init();
+ if (!ret_val)
+ netlbl_calipso_ops_register(&ops);
+ return ret_val;
+}
+
+void calipso_exit(void)
+{
+ netlbl_calipso_ops_register(NULL);
+ calipso_cache_invalidate();
+ kfree(calipso_cache);
+}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 8de5dd7aaa05..139ceb68bd37 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -43,6 +43,7 @@
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
+#include <net/calipso.h>
#if IS_ENABLED(CONFIG_IPV6_MIP6)
#include <net/xfrm.h>
#endif
@@ -603,6 +604,28 @@ drop:
return false;
}
+/* CALIPSO RFC 5570 */
+
+static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff)
+{
+ const unsigned char *nh = skb_network_header(skb);
+
+ if (nh[optoff + 1] < 8)
+ goto drop;
+
+ if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1])
+ goto drop;
+
+ if (!calipso_validate(skb, nh + optoff))
+ goto drop;
+
+ return true;
+
+drop:
+ kfree_skb(skb);
+ return false;
+}
+
static const struct tlvtype_proc tlvprochopopt_lst[] = {
{
.type = IPV6_TLV_ROUTERALERT,
@@ -612,6 +635,10 @@ static const struct tlvtype_proc tlvprochopopt_lst[] = {
.type = IPV6_TLV_JUMBO,
.func = ipv6_hop_jumbo,
},
+ {
+ .type = IPV6_TLV_CALIPSO,
+ .func = ipv6_hop_calipso,
+ },
{ -1, }
};
@@ -758,6 +785,27 @@ static int ipv6_renew_option(void *ohdr,
return 0;
}
+/**
+ * ipv6_renew_options - replace a specific ext hdr with a new one.
+ *
+ * @sk: sock from which to allocate memory
+ * @opt: original options
+ * @newtype: option type to replace in @opt
+ * @newopt: new option of type @newtype to replace (user-mem)
+ * @newoptlen: length of @newopt
+ *
+ * Returns a new set of options which is a copy of @opt with the
+ * option type @newtype replaced with @newopt.
+ *
+ * @opt may be NULL, in which case a new set of options is returned
+ * containing just @newopt.
+ *
+ * @newopt may be NULL, in which case the specified option type is
+ * not copied into the new set of options.
+ *
+ * The new set of options is allocated from the socket option memory
+ * buffer of @sk.
+ */
struct ipv6_txoptions *
ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
int newtype,
@@ -830,6 +878,34 @@ out:
return ERR_PTR(err);
}
+/**
+ * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
+ *
+ * @sk: sock from which to allocate memory
+ * @opt: original options
+ * @newtype: option type to replace in @opt
+ * @newopt: new option of type @newtype to replace (kernel-mem)
+ * @newoptlen: length of @newopt
+ *
+ * See ipv6_renew_options(). The difference is that @newopt is
+ * kernel memory, rather than user memory.
+ */
+struct ipv6_txoptions *
+ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
+ int newtype, struct ipv6_opt_hdr *newopt,
+ int newoptlen)
+{
+ struct ipv6_txoptions *ret_val;
+ const mm_segment_t old_fs = get_fs();
+
+ set_fs(KERNEL_DS);
+ ret_val = ipv6_renew_options(sk, opt, newtype,
+ (struct ipv6_opt_hdr __user *)newopt,
+ newoptlen);
+ set_fs(old_fs);
+ return ret_val;
+}
+
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
struct ipv6_txoptions *opt)
{
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 9508a20fbf61..305e2ed730bf 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -112,7 +112,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp,
}
EXPORT_SYMBOL(ipv6_skip_exthdr);
-int ipv6_find_tlv(struct sk_buff *skb, int offset, int type)
+int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type)
{
const unsigned char *nh = skb_network_header(skb);
int packet_len = skb_tail_pointer(skb) - skb_network_header(skb);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a9895e15ee9c..5330262ab673 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -98,7 +98,6 @@ int ip6_ra_control(struct sock *sk, int sel)
return 0;
}
-static
struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
struct ipv6_txoptions *opt)
{
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 45243bbe5253..69c50e737c54 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -15,6 +15,9 @@
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/inet_frag.h>
+#ifdef CONFIG_NETLABEL
+#include <net/calipso.h>
+#endif
static int one = 1;
static int auto_flowlabels_min;
@@ -106,6 +109,22 @@ static struct ctl_table ipv6_rotable[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one
},
+#ifdef CONFIG_NETLABEL
+ {
+ .procname = "calipso_cache_enable",
+ .data = &calipso_cache_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "calipso_cache_bucket_size",
+ .data = &calipso_cache_bucketsize,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NETLABEL */
{ }
};
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 37cf91323319..33df8b8575cc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -443,6 +443,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
@@ -463,8 +464,10 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
rcu_read_lock();
- err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt),
- np->tclass);
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
+ err = ip6_xmit(sk, skb, fl6, opt, np->tclass);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -476,6 +479,7 @@ done:
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
+ kfree(inet_rsk(req)->ipv6_opt);
kfree_skb(inet_rsk(req)->pktopts);
}
@@ -1112,7 +1116,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
but we make one more one thing there: reattach optmem
to newsk.
*/
- opt = rcu_dereference(np->opt);
+ opt = ireq->ipv6_opt;
+ if (!opt)
+ opt = rcu_dereference(np->opt);
if (opt) {
opt = ipv6_dup_options(newsk, opt);
RCU_INIT_POINTER(newnp->opt, opt);
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 37d674e6f8a9..02b45a8e8b35 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -22,6 +22,7 @@
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/poll.h>
+#include <linux/security.h>
#include <net/sock.h>
#include <asm/ebcdic.h>
#include <asm/cpcmd.h>
@@ -530,8 +531,10 @@ static void iucv_sock_close(struct sock *sk)
static void iucv_sock_init(struct sock *sk, struct sock *parent)
{
- if (parent)
+ if (parent) {
sk->sk_type = parent->sk_type;
+ security_sk_clone(parent, sk);
+ }
}
static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern)
diff --git a/net/netlabel/Kconfig b/net/netlabel/Kconfig
index 56958c85f2b4..d9eaa30ffe3f 100644
--- a/net/netlabel/Kconfig
+++ b/net/netlabel/Kconfig
@@ -5,6 +5,7 @@
config NETLABEL
bool "NetLabel subsystem support"
depends on SECURITY
+ select CRC_CCITT if IPV6
default n
---help---
NetLabel provides support for explicit network packet labeling
diff --git a/net/netlabel/Makefile b/net/netlabel/Makefile
index d2732fc952e2..d341ede0dca5 100644
--- a/net/netlabel/Makefile
+++ b/net/netlabel/Makefile
@@ -12,4 +12,4 @@ obj-y += netlabel_mgmt.o
# protocol modules
obj-y += netlabel_unlabeled.o
obj-y += netlabel_cipso_v4.o
-
+obj-$(subst m,y,$(CONFIG_IPV6)) += netlabel_calipso.o
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
new file mode 100644
index 000000000000..2ec93c5e77bb
--- /dev/null
+++ b/net/netlabel/netlabel_calipso.c
@@ -0,0 +1,740 @@
+/*
+ * NetLabel CALIPSO/IPv6 Support
+ *
+ * This file defines the CALIPSO/IPv6 functions for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and CALIPSO.
+ *
+ * Authors: Paul Moore <paul@paul-moore.com>
+ * Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/audit.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/netlabel.h>
+#include <net/calipso.h>
+#include <linux/atomic.h>
+
+#include "netlabel_user.h"
+#include "netlabel_calipso.h"
+#include "netlabel_mgmt.h"
+#include "netlabel_domainhash.h"
+
+/* Argument struct for calipso_doi_walk() */
+struct netlbl_calipso_doiwalk_arg {
+ struct netlink_callback *nl_cb;
+ struct sk_buff *skb;
+ u32 seq;
+};
+
+/* Argument struct for netlbl_domhsh_walk() */
+struct netlbl_domhsh_walk_arg {
+ struct netlbl_audit *audit_info;
+ u32 doi;
+};
+
+/* NetLabel Generic NETLINK CALIPSO family */
+static struct genl_family netlbl_calipso_gnl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = NETLBL_NLTYPE_CALIPSO_NAME,
+ .version = NETLBL_PROTO_VERSION,
+ .maxattr = NLBL_CALIPSO_A_MAX,
+};
+
+/* NetLabel Netlink attribute policy */
+static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = {
+ [NLBL_CALIPSO_A_DOI] = { .type = NLA_U32 },
+ [NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 },
+};
+
+/* NetLabel Command Handlers
+ */
+/**
+ * netlbl_calipso_add_pass - Adds a CALIPSO pass DOI definition
+ * @info: the Generic NETLINK info block
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Create a new CALIPSO_MAP_PASS DOI definition based on the given ADD message
+ * and add it to the CALIPSO engine. Return zero on success and non-zero on
+ * error.
+ *
+ */
+static int netlbl_calipso_add_pass(struct genl_info *info,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val;
+ struct calipso_doi *doi_def = NULL;
+
+ doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL);
+ if (!doi_def)
+ return -ENOMEM;
+ doi_def->type = CALIPSO_MAP_PASS;
+ doi_def->doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+ ret_val = calipso_doi_add(doi_def, audit_info);
+ if (ret_val != 0)
+ calipso_doi_free(doi_def);
+
+ return ret_val;
+}
+
+/**
+ * netlbl_calipso_add - Handle an ADD message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Create a new DOI definition based on the given ADD message and add it to the
+ * CALIPSO engine. Returns zero on success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info)
+
+{
+ int ret_val = -EINVAL;
+ struct netlbl_audit audit_info;
+
+ if (!info->attrs[NLBL_CALIPSO_A_DOI] ||
+ !info->attrs[NLBL_CALIPSO_A_MTYPE])
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+ switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) {
+ case CALIPSO_MAP_PASS:
+ ret_val = netlbl_calipso_add_pass(info, &audit_info);
+ break;
+ }
+ if (ret_val == 0)
+ atomic_inc(&netlabel_mgmt_protocount);
+
+ return ret_val;
+}
+
+/**
+ * netlbl_calipso_list - Handle a LIST message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated LIST message and respond accordingly.
+ * Returns zero on success and negative values on error.
+ *
+ */
+static int netlbl_calipso_list(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val;
+ struct sk_buff *ans_skb = NULL;
+ void *data;
+ u32 doi;
+ struct calipso_doi *doi_def;
+
+ if (!info->attrs[NLBL_CALIPSO_A_DOI]) {
+ ret_val = -EINVAL;
+ goto list_failure;
+ }
+
+ doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+
+ doi_def = calipso_doi_getdef(doi);
+ if (!doi_def) {
+ ret_val = -EINVAL;
+ goto list_failure;
+ }
+
+ ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ans_skb) {
+ ret_val = -ENOMEM;
+ goto list_failure_put;
+ }
+ data = genlmsg_put_reply(ans_skb, info, &netlbl_calipso_gnl_family,
+ 0, NLBL_CALIPSO_C_LIST);
+ if (!data) {
+ ret_val = -ENOMEM;
+ goto list_failure_put;
+ }
+
+ ret_val = nla_put_u32(ans_skb, NLBL_CALIPSO_A_MTYPE, doi_def->type);
+ if (ret_val != 0)
+ goto list_failure_put;
+
+ calipso_doi_putdef(doi_def);
+
+ genlmsg_end(ans_skb, data);
+ return genlmsg_reply(ans_skb, info);
+
+list_failure_put:
+ calipso_doi_putdef(doi_def);
+list_failure:
+ kfree_skb(ans_skb);
+ return ret_val;
+}
+
+/**
+ * netlbl_calipso_listall_cb - calipso_doi_walk() callback for LISTALL
+ * @doi_def: the CALIPSO DOI definition
+ * @arg: the netlbl_calipso_doiwalk_arg structure
+ *
+ * Description:
+ * This function is designed to be used as a callback to the
+ * calipso_doi_walk() function for use in generating a response for a LISTALL
+ * message. Returns the size of the message on success, negative values on
+ * failure.
+ *
+ */
+static int netlbl_calipso_listall_cb(struct calipso_doi *doi_def, void *arg)
+{
+ int ret_val = -ENOMEM;
+ struct netlbl_calipso_doiwalk_arg *cb_arg = arg;
+ void *data;
+
+ data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid,
+ cb_arg->seq, &netlbl_calipso_gnl_family,
+ NLM_F_MULTI, NLBL_CALIPSO_C_LISTALL);
+ if (!data)
+ goto listall_cb_failure;
+
+ ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_DOI, doi_def->doi);
+ if (ret_val != 0)
+ goto listall_cb_failure;
+ ret_val = nla_put_u32(cb_arg->skb,
+ NLBL_CALIPSO_A_MTYPE,
+ doi_def->type);
+ if (ret_val != 0)
+ goto listall_cb_failure;
+
+ genlmsg_end(cb_arg->skb, data);
+ return 0;
+
+listall_cb_failure:
+ genlmsg_cancel(cb_arg->skb, data);
+ return ret_val;
+}
+
+/**
+ * netlbl_calipso_listall - Handle a LISTALL message
+ * @skb: the NETLINK buffer
+ * @cb: the NETLINK callback
+ *
+ * Description:
+ * Process a user generated LISTALL message and respond accordingly. Returns
+ * zero on success and negative values on error.
+ *
+ */
+static int netlbl_calipso_listall(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netlbl_calipso_doiwalk_arg cb_arg;
+ u32 doi_skip = cb->args[0];
+
+ cb_arg.nl_cb = cb;
+ cb_arg.skb = skb;
+ cb_arg.seq = cb->nlh->nlmsg_seq;
+
+ calipso_doi_walk(&doi_skip, netlbl_calipso_listall_cb, &cb_arg);
+
+ cb->args[0] = doi_skip;
+ return skb->len;
+}
+
+/**
+ * netlbl_calipso_remove_cb - netlbl_calipso_remove() callback for REMOVE
+ * @entry: LSM domain mapping entry
+ * @arg: the netlbl_domhsh_walk_arg structure
+ *
+ * Description:
+ * This function is intended for use by netlbl_calipso_remove() as the callback
+ * for the netlbl_domhsh_walk() function; it removes LSM domain map entries
+ * which are associated with the CALIPSO DOI specified in @arg. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_remove_cb(struct netlbl_dom_map *entry, void *arg)
+{
+ struct netlbl_domhsh_walk_arg *cb_arg = arg;
+
+ if (entry->def.type == NETLBL_NLTYPE_CALIPSO &&
+ entry->def.calipso->doi == cb_arg->doi)
+ return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info);
+
+ return 0;
+}
+
+/**
+ * netlbl_calipso_remove - Handle a REMOVE message
+ * @skb: the NETLINK buffer
+ * @info: the Generic NETLINK info block
+ *
+ * Description:
+ * Process a user generated REMOVE message and respond accordingly. Returns
+ * zero on success, negative values on failure.
+ *
+ */
+static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret_val = -EINVAL;
+ struct netlbl_domhsh_walk_arg cb_arg;
+ struct netlbl_audit audit_info;
+ u32 skip_bkt = 0;
+ u32 skip_chain = 0;
+
+ if (!info->attrs[NLBL_CALIPSO_A_DOI])
+ return -EINVAL;
+
+ netlbl_netlink_auditinfo(skb, &audit_info);
+ cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]);
+ cb_arg.audit_info = &audit_info;
+ ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain,
+ netlbl_calipso_remove_cb, &cb_arg);
+ if (ret_val == 0 || ret_val == -ENOENT) {
+ ret_val = calipso_doi_remove(cb_arg.doi, &audit_info);
+ if (ret_val == 0)
+ atomic_dec(&netlabel_mgmt_protocount);
+ }
+
+ return ret_val;
+}
+
+/* NetLabel Generic NETLINK Command Definitions
+ */
+
+static const struct genl_ops netlbl_calipso_ops[] = {
+ {
+ .cmd = NLBL_CALIPSO_C_ADD,
+ .flags = GENL_ADMIN_PERM,
+ .policy = calipso_genl_policy,
+ .doit = netlbl_calipso_add,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CALIPSO_C_REMOVE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = calipso_genl_policy,
+ .doit = netlbl_calipso_remove,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CALIPSO_C_LIST,
+ .flags = 0,
+ .policy = calipso_genl_policy,
+ .doit = netlbl_calipso_list,
+ .dumpit = NULL,
+ },
+ {
+ .cmd = NLBL_CALIPSO_C_LISTALL,
+ .flags = 0,
+ .policy = calipso_genl_policy,
+ .doit = NULL,
+ .dumpit = netlbl_calipso_listall,
+ },
+};
+
+/* NetLabel Generic NETLINK Protocol Functions
+ */
+
+/**
+ * netlbl_calipso_genl_init - Register the CALIPSO NetLabel component
+ *
+ * Description:
+ * Register the CALIPSO packet NetLabel component with the Generic NETLINK
+ * mechanism. Returns zero on success, negative values on failure.
+ *
+ */
+int __init netlbl_calipso_genl_init(void)
+{
+ return genl_register_family_with_ops(&netlbl_calipso_gnl_family,
+ netlbl_calipso_ops);
+}
+
+static const struct netlbl_calipso_ops *calipso_ops;
+
+/**
+ * netlbl_calipso_ops_register - Register the CALIPSO operations
+ *
+ * Description:
+ * Register the CALIPSO packet engine operations.
+ *
+ */
+const struct netlbl_calipso_ops *
+netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops)
+{
+ return xchg(&calipso_ops, ops);
+}
+EXPORT_SYMBOL(netlbl_calipso_ops_register);
+
+static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void)
+{
+ return ACCESS_ONCE(calipso_ops);
+}
+
+/**
+ * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine
+ * @doi_def: the DOI structure
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * The caller defines a new DOI for use by the CALIPSO engine and calls this
+ * function to add it to the list of acceptable domains. The caller must
+ * ensure that the mapping table specified in @doi_def->map meets all of the
+ * requirements of the mapping type (see calipso.h for details). Returns
+ * zero on success and non-zero on failure.
+ *
+ */
+int calipso_doi_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->doi_add(doi_def, audit_info);
+ return ret_val;
+}
+
+/**
+ * calipso_doi_free - Frees a DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * This function frees all of the memory associated with a DOI definition.
+ *
+ */
+void calipso_doi_free(struct calipso_doi *doi_def)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->doi_free(doi_def);
+}
+
+/**
+ * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
+ * @doi: the DOI value
+ * @audit_secid: the LSM secid to use in the audit message
+ *
+ * Description:
+ * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
+ * be called to release their own LSM domain mappings as well as our own
+ * domain list. Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->doi_remove(doi, audit_info);
+ return ret_val;
+}
+
+/**
+ * calipso_doi_getdef - Returns a reference to a valid DOI definition
+ * @doi: the DOI value
+ *
+ * Description:
+ * Searches for a valid DOI definition and if one is found it is returned to
+ * the caller. Otherwise NULL is returned. The caller must ensure that
+ * calipso_doi_putdef() is called when the caller is done.
+ *
+ */
+struct calipso_doi *calipso_doi_getdef(u32 doi)
+{
+ struct calipso_doi *ret_val = NULL;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->doi_getdef(doi);
+ return ret_val;
+}
+
+/**
+ * calipso_doi_putdef - Releases a reference for the given DOI definition
+ * @doi_def: the DOI definition
+ *
+ * Description:
+ * Releases a DOI definition reference obtained from calipso_doi_getdef().
+ *
+ */
+void calipso_doi_putdef(struct calipso_doi *doi_def)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->doi_putdef(doi_def);
+}
+
+/**
+ * calipso_doi_walk - Iterate through the DOI definitions
+ * @skip_cnt: skip past this number of DOI definitions, updated
+ * @callback: callback for each DOI definition
+ * @cb_arg: argument for the callback function
+ *
+ * Description:
+ * Iterate over the DOI definition list, skipping the first @skip_cnt entries.
+ * For each entry call @callback, if @callback returns a negative value stop
+ * 'walking' through the list and return. Updates the value in @skip_cnt upon
+ * return. Returns zero on success, negative values on failure.
+ *
+ */
+int calipso_doi_walk(u32 *skip_cnt,
+ int (*callback)(struct calipso_doi *doi_def, void *arg),
+ void *cb_arg)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->doi_walk(skip_cnt, callback, cb_arg);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_getattr - Get the security attributes from a sock
+ * @sk: the sock
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Query @sk to see if there is a CALIPSO option attached to the sock and if
+ * there is return the CALIPSO security attributes in @secattr. This function
+ * requires that @sk be locked, or privately held, but it does not do any
+ * locking itself. Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->sock_getattr(sk, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_setattr - Add a CALIPSO option to a socket
+ * @sk: the socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. This function requires
+ * exclusive access to @sk, which means it either needs to be in the
+ * process of being created or locked. Returns zero on success and negative
+ * values on failure.
+ *
+ */
+int calipso_sock_setattr(struct sock *sk,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->sock_setattr(sk, doi_def, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_sock_delattr - Delete the CALIPSO option from a socket
+ * @sk: the socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a socket, if present.
+ *
+ */
+void calipso_sock_delattr(struct sock *sk)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->sock_delattr(sk);
+}
+
+/**
+ * calipso_req_setattr - Add a CALIPSO option to a connection request socket
+ * @req: the connection request socket
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the specific security attributes of the socket
+ *
+ * Description:
+ * Set the CALIPSO option on the given socket using the DOI definition and
+ * security attributes passed to the function. Returns zero on success and
+ * negative values on failure.
+ *
+ */
+int calipso_req_setattr(struct request_sock *req,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->req_setattr(req, doi_def, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_req_delattr - Delete the CALIPSO option from a request socket
+ * @reg: the request socket
+ *
+ * Description:
+ * Removes the CALIPSO option from a request socket, if present.
+ *
+ */
+void calipso_req_delattr(struct request_sock *req)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->req_delattr(req);
+}
+
+/**
+ * calipso_optptr - Find the CALIPSO option in the packet
+ * @skb: the packet
+ *
+ * Description:
+ * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer
+ * to the start of the CALIPSO option on success, NULL if one if not found.
+ *
+ */
+unsigned char *calipso_optptr(const struct sk_buff *skb)
+{
+ unsigned char *ret_val = NULL;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->skbuff_optptr(skb);
+ return ret_val;
+}
+
+/**
+ * calipso_getattr - Get the security attributes from a memory block.
+ * @calipso: the CALIPSO option
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Inspect @calipso and return the security attributes in @secattr.
+ * Returns zero on success and negative values on failure.
+ *
+ */
+int calipso_getattr(const unsigned char *calipso,
+ struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->opt_getattr(calipso, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_skbuff_setattr - Set the CALIPSO option on a packet
+ * @skb: the packet
+ * @doi_def: the CALIPSO DOI to use
+ * @secattr: the security attributes
+ *
+ * Description:
+ * Set the CALIPSO option on the given packet based on the security attributes.
+ * Returns a pointer to the IP header on success and NULL on failure.
+ *
+ */
+int calipso_skbuff_setattr(struct sk_buff *skb,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->skbuff_setattr(skb, doi_def, secattr);
+ return ret_val;
+}
+
+/**
+ * calipso_skbuff_delattr - Delete any CALIPSO options from a packet
+ * @skb: the packet
+ *
+ * Description:
+ * Removes any and all CALIPSO options from the given packet. Returns zero on
+ * success, negative values on failure.
+ *
+ */
+int calipso_skbuff_delattr(struct sk_buff *skb)
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->skbuff_delattr(skb);
+ return ret_val;
+}
+
+/**
+ * calipso_cache_invalidate - Invalidates the current CALIPSO cache
+ *
+ * Description:
+ * Invalidates and frees any entries in the CALIPSO cache. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void calipso_cache_invalidate(void)
+{
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ops->cache_invalidate();
+}
+
+/**
+ * calipso_cache_add - Add an entry to the CALIPSO cache
+ * @calipso_ptr: the CALIPSO option
+ * @secattr: the packet's security attributes
+ *
+ * Description:
+ * Add a new entry into the CALIPSO label mapping cache.
+ * Returns zero on success, negative values on failure.
+ *
+ */
+int calipso_cache_add(const unsigned char *calipso_ptr,
+ const struct netlbl_lsm_secattr *secattr)
+
+{
+ int ret_val = -ENOMSG;
+ const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get();
+
+ if (ops)
+ ret_val = ops->cache_add(calipso_ptr, secattr);
+ return ret_val;
+}
diff --git a/net/netlabel/netlabel_calipso.h b/net/netlabel/netlabel_calipso.h
new file mode 100644
index 000000000000..9fd291cd0fc5
--- /dev/null
+++ b/net/netlabel/netlabel_calipso.h
@@ -0,0 +1,151 @@
+/*
+ * NetLabel CALIPSO Support
+ *
+ * This file defines the CALIPSO functions for the NetLabel system. The
+ * NetLabel system manages static and dynamic label mappings for network
+ * protocols such as CIPSO and RIPSO.
+ *
+ * Authors: Paul Moore <paul@paul-moore.com>
+ * Huw Davies <huw@codeweavers.com>
+ *
+ */
+
+/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006
+ * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _NETLABEL_CALIPSO
+#define _NETLABEL_CALIPSO
+
+#include <net/netlabel.h>
+#include <net/calipso.h>
+
+/* The following NetLabel payloads are supported by the CALIPSO subsystem.
+ *
+ * o ADD:
+ * Sent by an application to add a new DOI mapping table.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_DOI
+ * NLBL_CALIPSO_A_MTYPE
+ *
+ * If using CALIPSO_MAP_PASS no additional attributes are required.
+ *
+ * o REMOVE:
+ * Sent by an application to remove a specific DOI mapping table from the
+ * CALIPSO system.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_DOI
+ *
+ * o LIST:
+ * Sent by an application to list the details of a DOI definition. On
+ * success the kernel should send a response using the following format.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_DOI
+ *
+ * The valid response message format depends on the type of the DOI mapping,
+ * the defined formats are shown below.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_MTYPE
+ *
+ * If using CALIPSO_MAP_PASS no additional attributes are required.
+ *
+ * o LISTALL:
+ * This message is sent by an application to list the valid DOIs on the
+ * system. When sent by an application there is no payload and the
+ * NLM_F_DUMP flag should be set. The kernel should respond with a series of
+ * the following messages.
+ *
+ * Required attributes:
+ *
+ * NLBL_CALIPSO_A_DOI
+ * NLBL_CALIPSO_A_MTYPE
+ *
+ */
+
+/* NetLabel CALIPSO commands */
+enum {
+ NLBL_CALIPSO_C_UNSPEC,
+ NLBL_CALIPSO_C_ADD,
+ NLBL_CALIPSO_C_REMOVE,
+ NLBL_CALIPSO_C_LIST,
+ NLBL_CALIPSO_C_LISTALL,
+ __NLBL_CALIPSO_C_MAX,
+};
+
+/* NetLabel CALIPSO attributes */
+enum {
+ NLBL_CALIPSO_A_UNSPEC,
+ NLBL_CALIPSO_A_DOI,
+ /* (NLA_U32)
+ * the DOI value */
+ NLBL_CALIPSO_A_MTYPE,
+ /* (NLA_U32)
+ * the mapping table type (defined in the calipso.h header as
+ * CALIPSO_MAP_*) */
+ __NLBL_CALIPSO_A_MAX,
+};
+
+#define NLBL_CALIPSO_A_MAX (__NLBL_CALIPSO_A_MAX - 1)
+
+/* NetLabel protocol functions */
+#if IS_ENABLED(CONFIG_IPV6)
+int netlbl_calipso_genl_init(void);
+#else
+static inline int netlbl_calipso_genl_init(void)
+{
+ return 0;
+}
+#endif
+
+int calipso_doi_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info);
+void calipso_doi_free(struct calipso_doi *doi_def);
+int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info);
+struct calipso_doi *calipso_doi_getdef(u32 doi);
+void calipso_doi_putdef(struct calipso_doi *doi_def);
+int calipso_doi_walk(u32 *skip_cnt,
+ int (*callback)(struct calipso_doi *doi_def, void *arg),
+ void *cb_arg);
+int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr);
+int calipso_sock_setattr(struct sock *sk,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr);
+void calipso_sock_delattr(struct sock *sk);
+int calipso_req_setattr(struct request_sock *req,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr);
+void calipso_req_delattr(struct request_sock *req);
+unsigned char *calipso_optptr(const struct sk_buff *skb);
+int calipso_getattr(const unsigned char *calipso,
+ struct netlbl_lsm_secattr *secattr);
+int calipso_skbuff_setattr(struct sk_buff *skb,
+ const struct calipso_doi *doi_def,
+ const struct netlbl_lsm_secattr *secattr);
+int calipso_skbuff_delattr(struct sk_buff *skb);
+void calipso_cache_invalidate(void);
+int calipso_cache_add(const unsigned char *calipso_ptr,
+ const struct netlbl_lsm_secattr *secattr);
+
+#endif
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index ada67422234b..41d0e95d171e 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -37,10 +37,12 @@
#include <linux/slab.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
+#include <net/calipso.h>
#include <asm/bug.h>
#include "netlabel_mgmt.h"
#include "netlabel_addrlist.h"
+#include "netlabel_calipso.h"
#include "netlabel_domainhash.h"
#include "netlabel_user.h"
@@ -55,8 +57,9 @@ struct netlbl_domhsh_tbl {
static DEFINE_SPINLOCK(netlbl_domhsh_lock);
#define netlbl_domhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
-static struct netlbl_domhsh_tbl *netlbl_domhsh;
-static struct netlbl_dom_map *netlbl_domhsh_def;
+static struct netlbl_domhsh_tbl __rcu *netlbl_domhsh;
+static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv4;
+static struct netlbl_dom_map __rcu *netlbl_domhsh_def_ipv6;
/*
* Domain Hash Table Helper Functions
@@ -126,18 +129,26 @@ static u32 netlbl_domhsh_hash(const char *key)
return val & (netlbl_domhsh_rcu_deref(netlbl_domhsh)->size - 1);
}
+static bool netlbl_family_match(u16 f1, u16 f2)
+{
+ return (f1 == f2) || (f1 == AF_UNSPEC) || (f2 == AF_UNSPEC);
+}
+
/**
* netlbl_domhsh_search - Search for a domain entry
* @domain: the domain
+ * @family: the address family
*
* Description:
* Searches the domain hash table and returns a pointer to the hash table
- * entry if found, otherwise NULL is returned. The caller is responsible for
+ * entry if found, otherwise NULL is returned. @family may be %AF_UNSPEC
+ * which matches any address family entries. The caller is responsible for
* ensuring that the hash table is protected with either a RCU read lock or the
* hash table lock.
*
*/
-static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
+static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain,
+ u16 family)
{
u32 bkt;
struct list_head *bkt_list;
@@ -147,7 +158,9 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
bkt = netlbl_domhsh_hash(domain);
bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt];
list_for_each_entry_rcu(iter, bkt_list, list)
- if (iter->valid && strcmp(iter->domain, domain) == 0)
+ if (iter->valid &&
+ netlbl_family_match(iter->family, family) &&
+ strcmp(iter->domain, domain) == 0)
return iter;
}
@@ -157,28 +170,37 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain)
/**
* netlbl_domhsh_search_def - Search for a domain entry
* @domain: the domain
- * @def: return default if no match is found
+ * @family: the address family
*
* Description:
* Searches the domain hash table and returns a pointer to the hash table
* entry if an exact match is found, if an exact match is not present in the
* hash table then the default entry is returned if valid otherwise NULL is
- * returned. The caller is responsible ensuring that the hash table is
+ * returned. @family may be %AF_UNSPEC which matches any address family
+ * entries. The caller is responsible ensuring that the hash table is
* protected with either a RCU read lock or the hash table lock.
*
*/
-static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain)
+static struct netlbl_dom_map *netlbl_domhsh_search_def(const char *domain,
+ u16 family)
{
struct netlbl_dom_map *entry;
- entry = netlbl_domhsh_search(domain);
- if (entry == NULL) {
- entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def);
- if (entry != NULL && !entry->valid)
- entry = NULL;
+ entry = netlbl_domhsh_search(domain, family);
+ if (entry != NULL)
+ return entry;
+ if (family == AF_INET || family == AF_UNSPEC) {
+ entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv4);
+ if (entry != NULL && entry->valid)
+ return entry;
+ }
+ if (family == AF_INET6 || family == AF_UNSPEC) {
+ entry = netlbl_domhsh_rcu_deref(netlbl_domhsh_def_ipv6);
+ if (entry != NULL && entry->valid)
+ return entry;
}
- return entry;
+ return NULL;
}
/**
@@ -203,6 +225,7 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
{
struct audit_buffer *audit_buf;
struct cipso_v4_doi *cipsov4 = NULL;
+ struct calipso_doi *calipso = NULL;
u32 type;
audit_buf = netlbl_audit_start_common(AUDIT_MAC_MAP_ADD, audit_info);
@@ -221,12 +244,14 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
struct netlbl_domaddr6_map *map6;
map6 = netlbl_domhsh_addr6_entry(addr6);
type = map6->def.type;
+ calipso = map6->def.calipso;
netlbl_af6list_audit_addr(audit_buf, 0, NULL,
&addr6->addr, &addr6->mask);
#endif /* IPv6 */
} else {
type = entry->def.type;
cipsov4 = entry->def.cipso;
+ calipso = entry->def.calipso;
}
switch (type) {
case NETLBL_NLTYPE_UNLABELED:
@@ -238,6 +263,12 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
" nlbl_protocol=cipsov4 cipso_doi=%u",
cipsov4->doi);
break;
+ case NETLBL_NLTYPE_CALIPSO:
+ BUG_ON(calipso == NULL);
+ audit_log_format(audit_buf,
+ " nlbl_protocol=calipso calipso_doi=%u",
+ calipso->doi);
+ break;
}
audit_log_format(audit_buf, " res=%u", result == 0 ? 1 : 0);
audit_log_end(audit_buf);
@@ -264,13 +295,25 @@ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry)
if (entry == NULL)
return -EINVAL;
+ if (entry->family != AF_INET && entry->family != AF_INET6 &&
+ (entry->family != AF_UNSPEC ||
+ entry->def.type != NETLBL_NLTYPE_UNLABELED))
+ return -EINVAL;
+
switch (entry->def.type) {
case NETLBL_NLTYPE_UNLABELED:
- if (entry->def.cipso != NULL || entry->def.addrsel != NULL)
+ if (entry->def.cipso != NULL || entry->def.calipso != NULL ||
+ entry->def.addrsel != NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_CIPSOV4:
- if (entry->def.cipso == NULL)
+ if (entry->family != AF_INET ||
+ entry->def.cipso == NULL)
+ return -EINVAL;
+ break;
+ case NETLBL_NLTYPE_CALIPSO:
+ if (entry->family != AF_INET6 ||
+ entry->def.calipso == NULL)
return -EINVAL;
break;
case NETLBL_NLTYPE_ADDRSELECT:
@@ -294,6 +337,12 @@ static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry)
map6 = netlbl_domhsh_addr6_entry(iter6);
switch (map6->def.type) {
case NETLBL_NLTYPE_UNLABELED:
+ if (map6->def.calipso != NULL)
+ return -EINVAL;
+ break;
+ case NETLBL_NLTYPE_CALIPSO:
+ if (map6->def.calipso == NULL)
+ return -EINVAL;
break;
default:
return -EINVAL;
@@ -358,15 +407,18 @@ int __init netlbl_domhsh_init(u32 size)
*
* Description:
* Adds a new entry to the domain hash table and handles any updates to the
- * lower level protocol handler (i.e. CIPSO). Returns zero on success,
- * negative on failure.
+ * lower level protocol handler (i.e. CIPSO). @entry->family may be set to
+ * %AF_UNSPEC which will add an entry that matches all address families. This
+ * is only useful for the unlabelled type and will only succeed if there is no
+ * existing entry for any address family with the same domain. Returns zero
+ * on success, negative on failure.
*
*/
int netlbl_domhsh_add(struct netlbl_dom_map *entry,
struct netlbl_audit *audit_info)
{
int ret_val = 0;
- struct netlbl_dom_map *entry_old;
+ struct netlbl_dom_map *entry_old, *entry_b;
struct netlbl_af4list *iter4;
struct netlbl_af4list *tmp4;
#if IS_ENABLED(CONFIG_IPV6)
@@ -385,9 +437,10 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
rcu_read_lock();
spin_lock(&netlbl_domhsh_lock);
if (entry->domain != NULL)
- entry_old = netlbl_domhsh_search(entry->domain);
+ entry_old = netlbl_domhsh_search(entry->domain, entry->family);
else
- entry_old = netlbl_domhsh_search_def(entry->domain);
+ entry_old = netlbl_domhsh_search_def(entry->domain,
+ entry->family);
if (entry_old == NULL) {
entry->valid = 1;
@@ -397,7 +450,41 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
&rcu_dereference(netlbl_domhsh)->tbl[bkt]);
} else {
INIT_LIST_HEAD(&entry->list);
- rcu_assign_pointer(netlbl_domhsh_def, entry);
+ switch (entry->family) {
+ case AF_INET:
+ rcu_assign_pointer(netlbl_domhsh_def_ipv4,
+ entry);
+ break;
+ case AF_INET6:
+ rcu_assign_pointer(netlbl_domhsh_def_ipv6,
+ entry);
+ break;
+ case AF_UNSPEC:
+ if (entry->def.type !=
+ NETLBL_NLTYPE_UNLABELED) {
+ ret_val = -EINVAL;
+ goto add_return;
+ }
+ entry_b = kzalloc(sizeof(*entry_b), GFP_ATOMIC);
+ if (entry_b == NULL) {
+ ret_val = -ENOMEM;
+ goto add_return;
+ }
+ entry_b->family = AF_INET6;
+ entry_b->def.type = NETLBL_NLTYPE_UNLABELED;
+ entry_b->valid = 1;
+ entry->family = AF_INET;
+ rcu_assign_pointer(netlbl_domhsh_def_ipv4,
+ entry);
+ rcu_assign_pointer(netlbl_domhsh_def_ipv6,
+ entry_b);
+ break;
+ default:
+ /* Already checked in
+ * netlbl_domhsh_validate(). */
+ ret_val = -EINVAL;
+ goto add_return;
+ }
}
if (entry->def.type == NETLBL_NLTYPE_ADDRSELECT) {
@@ -513,10 +600,12 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
spin_lock(&netlbl_domhsh_lock);
if (entry->valid) {
entry->valid = 0;
- if (entry != rcu_dereference(netlbl_domhsh_def))
- list_del_rcu(&entry->list);
+ if (entry == rcu_dereference(netlbl_domhsh_def_ipv4))
+ RCU_INIT_POINTER(netlbl_domhsh_def_ipv4, NULL);
+ else if (entry == rcu_dereference(netlbl_domhsh_def_ipv6))
+ RCU_INIT_POINTER(netlbl_domhsh_def_ipv6, NULL);
else
- RCU_INIT_POINTER(netlbl_domhsh_def, NULL);
+ list_del_rcu(&entry->list);
} else
ret_val = -ENOENT;
spin_unlock(&netlbl_domhsh_lock);
@@ -533,6 +622,10 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
if (ret_val == 0) {
struct netlbl_af4list *iter4;
struct netlbl_domaddr4_map *map4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct netlbl_af6list *iter6;
+ struct netlbl_domaddr6_map *map6;
+#endif /* IPv6 */
switch (entry->def.type) {
case NETLBL_NLTYPE_ADDRSELECT:
@@ -541,12 +634,22 @@ int netlbl_domhsh_remove_entry(struct netlbl_dom_map *entry,
map4 = netlbl_domhsh_addr4_entry(iter4);
cipso_v4_doi_putdef(map4->def.cipso);
}
- /* no need to check the IPv6 list since we currently
- * support only unlabeled protocols for IPv6 */
+#if IS_ENABLED(CONFIG_IPV6)
+ netlbl_af6list_foreach_rcu(iter6,
+ &entry->def.addrsel->list6) {
+ map6 = netlbl_domhsh_addr6_entry(iter6);
+ calipso_doi_putdef(map6->def.calipso);
+ }
+#endif /* IPv6 */
break;
case NETLBL_NLTYPE_CIPSOV4:
cipso_v4_doi_putdef(entry->def.cipso);
break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NETLBL_NLTYPE_CALIPSO:
+ calipso_doi_putdef(entry->def.calipso);
+ break;
+#endif /* IPv6 */
}
call_rcu(&entry->rcu, netlbl_domhsh_free_entry);
}
@@ -583,9 +686,9 @@ int netlbl_domhsh_remove_af4(const char *domain,
rcu_read_lock();
if (domain)
- entry_map = netlbl_domhsh_search(domain);
+ entry_map = netlbl_domhsh_search(domain, AF_INET);
else
- entry_map = netlbl_domhsh_search_def(domain);
+ entry_map = netlbl_domhsh_search_def(domain, AF_INET);
if (entry_map == NULL ||
entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
goto remove_af4_failure;
@@ -622,28 +725,114 @@ remove_af4_failure:
return -ENOENT;
}
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * netlbl_domhsh_remove_af6 - Removes an address selector entry
+ * @domain: the domain
+ * @addr: IPv6 address
+ * @mask: IPv6 address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Removes an individual address selector from a domain mapping and potentially
+ * the entire mapping if it is empty. Returns zero on success, negative values
+ * on failure.
+ *
+ */
+int netlbl_domhsh_remove_af6(const char *domain,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+ struct netlbl_dom_map *entry_map;
+ struct netlbl_af6list *entry_addr;
+ struct netlbl_af4list *iter4;
+ struct netlbl_af6list *iter6;
+ struct netlbl_domaddr6_map *entry;
+
+ rcu_read_lock();
+
+ if (domain)
+ entry_map = netlbl_domhsh_search(domain, AF_INET6);
+ else
+ entry_map = netlbl_domhsh_search_def(domain, AF_INET6);
+ if (entry_map == NULL ||
+ entry_map->def.type != NETLBL_NLTYPE_ADDRSELECT)
+ goto remove_af6_failure;
+
+ spin_lock(&netlbl_domhsh_lock);
+ entry_addr = netlbl_af6list_remove(addr, mask,
+ &entry_map->def.addrsel->list6);
+ spin_unlock(&netlbl_domhsh_lock);
+
+ if (entry_addr == NULL)
+ goto remove_af6_failure;
+ netlbl_af4list_foreach_rcu(iter4, &entry_map->def.addrsel->list4)
+ goto remove_af6_single_addr;
+ netlbl_af6list_foreach_rcu(iter6, &entry_map->def.addrsel->list6)
+ goto remove_af6_single_addr;
+ /* the domain mapping is empty so remove it from the mapping table */
+ netlbl_domhsh_remove_entry(entry_map, audit_info);
+
+remove_af6_single_addr:
+ rcu_read_unlock();
+ /* yick, we can't use call_rcu here because we don't have a rcu head
+ * pointer but hopefully this should be a rare case so the pause
+ * shouldn't be a problem */
+ synchronize_rcu();
+ entry = netlbl_domhsh_addr6_entry(entry_addr);
+ calipso_doi_putdef(entry->def.calipso);
+ kfree(entry);
+ return 0;
+
+remove_af6_failure:
+ rcu_read_unlock();
+ return -ENOENT;
+}
+#endif /* IPv6 */
+
/**
* netlbl_domhsh_remove - Removes an entry from the domain hash table
* @domain: the domain to remove
+ * @family: address family
* @audit_info: NetLabel audit information
*
* Description:
* Removes an entry from the domain hash table and handles any updates to the
- * lower level protocol handler (i.e. CIPSO). Returns zero on success,
- * negative on failure.
+ * lower level protocol handler (i.e. CIPSO). @family may be %AF_UNSPEC which
+ * removes all address family entries. Returns zero on success, negative on
+ * failure.
*
*/
-int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
+int netlbl_domhsh_remove(const char *domain, u16 family,
+ struct netlbl_audit *audit_info)
{
- int ret_val;
+ int ret_val = -EINVAL;
struct netlbl_dom_map *entry;
rcu_read_lock();
- if (domain)
- entry = netlbl_domhsh_search(domain);
- else
- entry = netlbl_domhsh_search_def(domain);
- ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
+
+ if (family == AF_INET || family == AF_UNSPEC) {
+ if (domain)
+ entry = netlbl_domhsh_search(domain, AF_INET);
+ else
+ entry = netlbl_domhsh_search_def(domain, AF_INET);
+ ret_val = netlbl_domhsh_remove_entry(entry, audit_info);
+ if (ret_val && ret_val != -ENOENT)
+ goto done;
+ }
+ if (family == AF_INET6 || family == AF_UNSPEC) {
+ int ret_val2;
+
+ if (domain)
+ entry = netlbl_domhsh_search(domain, AF_INET6);
+ else
+ entry = netlbl_domhsh_search_def(domain, AF_INET6);
+ ret_val2 = netlbl_domhsh_remove_entry(entry, audit_info);
+ if (ret_val2 != -ENOENT)
+ ret_val = ret_val2;
+ }
+done:
rcu_read_unlock();
return ret_val;
@@ -651,32 +840,38 @@ int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info)
/**
* netlbl_domhsh_remove_default - Removes the default entry from the table
+ * @family: address family
* @audit_info: NetLabel audit information
*
* Description:
- * Removes/resets the default entry for the domain hash table and handles any
- * updates to the lower level protocol handler (i.e. CIPSO). Returns zero on
- * success, non-zero on failure.
+ * Removes/resets the default entry corresponding to @family from the domain
+ * hash table and handles any updates to the lower level protocol handler
+ * (i.e. CIPSO). @family may be %AF_UNSPEC which removes all address family
+ * entries. Returns zero on success, negative on failure.
*
*/
-int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info)
+int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info)
{
- return netlbl_domhsh_remove(NULL, audit_info);
+ return netlbl_domhsh_remove(NULL, family, audit_info);
}
/**
* netlbl_domhsh_getentry - Get an entry from the domain hash table
* @domain: the domain name to search for
+ * @family: address family
*
* Description:
* Look through the domain hash table searching for an entry to match @domain,
- * return a pointer to a copy of the entry or NULL. The caller is responsible
- * for ensuring that rcu_read_[un]lock() is called.
+ * with address family @family, return a pointer to a copy of the entry or
+ * NULL. The caller is responsible for ensuring that rcu_read_[un]lock() is
+ * called.
*
*/
-struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain)
+struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family)
{
- return netlbl_domhsh_search_def(domain);
+ if (family == AF_UNSPEC)
+ return NULL;
+ return netlbl_domhsh_search_def(domain, family);
}
/**
@@ -696,7 +891,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
struct netlbl_dom_map *dom_iter;
struct netlbl_af4list *addr_iter;
- dom_iter = netlbl_domhsh_search_def(domain);
+ dom_iter = netlbl_domhsh_search_def(domain, AF_INET);
if (dom_iter == NULL)
return NULL;
@@ -726,7 +921,7 @@ struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
struct netlbl_dom_map *dom_iter;
struct netlbl_af6list *addr_iter;
- dom_iter = netlbl_domhsh_search_def(domain);
+ dom_iter = netlbl_domhsh_search_def(domain, AF_INET6);
if (dom_iter == NULL)
return NULL;
diff --git a/net/netlabel/netlabel_domainhash.h b/net/netlabel/netlabel_domainhash.h
index 680caf4dff56..1f9247781927 100644
--- a/net/netlabel/netlabel_domainhash.h
+++ b/net/netlabel/netlabel_domainhash.h
@@ -51,6 +51,7 @@ struct netlbl_dommap_def {
union {
struct netlbl_domaddr_map *addrsel;
struct cipso_v4_doi *cipso;
+ struct calipso_doi *calipso;
};
};
#define netlbl_domhsh_addr4_entry(iter) \
@@ -70,6 +71,7 @@ struct netlbl_domaddr6_map {
struct netlbl_dom_map {
char *domain;
+ u16 family;
struct netlbl_dommap_def def;
u32 valid;
@@ -91,14 +93,23 @@ int netlbl_domhsh_remove_af4(const char *domain,
const struct in_addr *addr,
const struct in_addr *mask,
struct netlbl_audit *audit_info);
-int netlbl_domhsh_remove(const char *domain, struct netlbl_audit *audit_info);
-int netlbl_domhsh_remove_default(struct netlbl_audit *audit_info);
-struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain);
+int netlbl_domhsh_remove_af6(const char *domain,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove(const char *domain, u16 family,
+ struct netlbl_audit *audit_info);
+int netlbl_domhsh_remove_default(u16 family, struct netlbl_audit *audit_info);
+struct netlbl_dom_map *netlbl_domhsh_getentry(const char *domain, u16 family);
struct netlbl_dommap_def *netlbl_domhsh_getentry_af4(const char *domain,
__be32 addr);
#if IS_ENABLED(CONFIG_IPV6)
struct netlbl_dommap_def *netlbl_domhsh_getentry_af6(const char *domain,
const struct in6_addr *addr);
+int netlbl_domhsh_remove_af6(const char *domain,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info);
#endif /* IPv6 */
int netlbl_domhsh_walk(u32 *skip_bkt,
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 1325776daa27..28c56b95fb7f 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -37,12 +37,14 @@
#include <net/ipv6.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
+#include <net/calipso.h>
#include <asm/bug.h>
#include <linux/atomic.h>
#include "netlabel_domainhash.h"
#include "netlabel_unlabeled.h"
#include "netlabel_cipso_v4.h"
+#include "netlabel_calipso.h"
#include "netlabel_user.h"
#include "netlabel_mgmt.h"
#include "netlabel_addrlist.h"
@@ -72,12 +74,17 @@ int netlbl_cfg_map_del(const char *domain,
struct netlbl_audit *audit_info)
{
if (addr == NULL && mask == NULL) {
- return netlbl_domhsh_remove(domain, audit_info);
+ return netlbl_domhsh_remove(domain, family, audit_info);
} else if (addr != NULL && mask != NULL) {
switch (family) {
case AF_INET:
return netlbl_domhsh_remove_af4(domain, addr, mask,
audit_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ return netlbl_domhsh_remove_af6(domain, addr, mask,
+ audit_info);
+#endif /* IPv6 */
default:
return -EPFNOSUPPORT;
}
@@ -119,6 +126,7 @@ int netlbl_cfg_unlbl_map_add(const char *domain,
if (entry->domain == NULL)
goto cfg_unlbl_map_add_failure;
}
+ entry->family = family;
if (addr == NULL && mask == NULL)
entry->def.type = NETLBL_NLTYPE_UNLABELED;
@@ -345,6 +353,7 @@ int netlbl_cfg_cipsov4_map_add(u32 doi,
entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
if (entry == NULL)
goto out_entry;
+ entry->family = AF_INET;
if (domain != NULL) {
entry->domain = kstrdup(domain, GFP_ATOMIC);
if (entry->domain == NULL)
@@ -399,6 +408,139 @@ out_entry:
return ret_val;
}
+/**
+ * netlbl_cfg_calipso_add - Add a new CALIPSO DOI definition
+ * @doi_def: CALIPSO DOI definition
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new CALIPSO DOI definition as defined by @doi_def. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+int netlbl_cfg_calipso_add(struct calipso_doi *doi_def,
+ struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return calipso_doi_add(doi_def, audit_info);
+#else /* IPv6 */
+ return -ENOSYS;
+#endif /* IPv6 */
+}
+
+/**
+ * netlbl_cfg_calipso_del - Remove an existing CALIPSO DOI definition
+ * @doi: CALIPSO DOI
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Remove an existing CALIPSO DOI definition matching @doi. Returns zero on
+ * success and negative values on failure.
+ *
+ */
+void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ calipso_doi_remove(doi, audit_info);
+#endif /* IPv6 */
+}
+
+/**
+ * netlbl_cfg_calipso_map_add - Add a new CALIPSO DOI mapping
+ * @doi: the CALIPSO DOI
+ * @domain: the domain mapping to add
+ * @addr: IP address
+ * @mask: IP address mask
+ * @audit_info: NetLabel audit information
+ *
+ * Description:
+ * Add a new NetLabel/LSM domain mapping for the given CALIPSO DOI to the
+ * NetLabel subsystem. A @domain value of NULL adds a new default domain
+ * mapping. Returns zero on success, negative values on failure.
+ *
+ */
+int netlbl_cfg_calipso_map_add(u32 doi,
+ const char *domain,
+ const struct in6_addr *addr,
+ const struct in6_addr *mask,
+ struct netlbl_audit *audit_info)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ int ret_val = -ENOMEM;
+ struct calipso_doi *doi_def;
+ struct netlbl_dom_map *entry;
+ struct netlbl_domaddr_map *addrmap = NULL;
+ struct netlbl_domaddr6_map *addrinfo = NULL;
+
+ doi_def = calipso_doi_getdef(doi);
+ if (doi_def == NULL)
+ return -ENOENT;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (entry == NULL)
+ goto out_entry;
+ entry->family = AF_INET6;
+ if (domain != NULL) {
+ entry->domain = kstrdup(domain, GFP_ATOMIC);
+ if (entry->domain == NULL)
+ goto out_domain;
+ }
+
+ if (addr == NULL && mask == NULL) {
+ entry->def.calipso = doi_def;
+ entry->def.type = NETLBL_NLTYPE_CALIPSO;
+ } else if (addr != NULL && mask != NULL) {
+ addrmap = kzalloc(sizeof(*addrmap), GFP_ATOMIC);
+ if (addrmap == NULL)
+ goto out_addrmap;
+ INIT_LIST_HEAD(&addrmap->list4);
+ INIT_LIST_HEAD(&addrmap->list6);
+
+ addrinfo = kzalloc(sizeof(*addrinfo), GFP_ATOMIC);
+ if (addrinfo == NULL)
+ goto out_addrinfo;
+ addrinfo->def.calipso = doi_def;
+ addrinfo->def.type = NETLBL_NLTYPE_CALIPSO;
+ addrinfo->list.addr = *addr;
+ addrinfo->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
+ addrinfo->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
+ addrinfo->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
+ addrinfo->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
+ addrinfo->list.mask = *mask;
+ addrinfo->list.valid = 1;
+ ret_val = netlbl_af6list_add(&addrinfo->list, &addrmap->list6);
+ if (ret_val != 0)
+ goto cfg_calipso_map_add_failure;
+
+ entry->def.addrsel = addrmap;
+ entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
+ } else {
+ ret_val = -EINVAL;
+ goto out_addrmap;
+ }
+
+ ret_val = netlbl_domhsh_add(entry, audit_info);
+ if (ret_val != 0)
+ goto cfg_calipso_map_add_failure;
+
+ return 0;
+
+cfg_calipso_map_add_failure:
+ kfree(addrinfo);
+out_addrinfo:
+ kfree(addrmap);
+out_addrmap:
+ kfree(entry->domain);
+out_domain:
+ kfree(entry);
+out_entry:
+ calipso_doi_putdef(doi_def);
+ return ret_val;
+#else /* IPv6 */
+ return -ENOSYS;
+#endif /* IPv6 */
+}
+
/*
* Security Attribute Functions
*/
@@ -519,6 +661,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset)
return -ENOENT;
}
+EXPORT_SYMBOL(netlbl_catmap_walk);
/**
* netlbl_catmap_walkrng - Find the end of a string of set bits
@@ -609,20 +752,19 @@ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap,
off = catmap->startbit;
*offset = off;
}
- iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_NONE, 0);
+ iter = _netlbl_catmap_getnode(&catmap, off, _CM_F_WALK, 0);
if (iter == NULL) {
*offset = (u32)-1;
return 0;
}
if (off < iter->startbit) {
- off = iter->startbit;
- *offset = off;
+ *offset = iter->startbit;
+ off = 0;
} else
off -= iter->startbit;
-
idx = off / NETLBL_CATMAP_MAPSIZE;
- *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_SIZE);
+ *bitmap = iter->bitmap[idx] >> (off % NETLBL_CATMAP_MAPSIZE);
return 0;
}
@@ -655,6 +797,7 @@ int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap,
return 0;
}
+EXPORT_SYMBOL(netlbl_catmap_setbit);
/**
* netlbl_catmap_setrng - Set a range of bits in a LSM secattr catmap
@@ -727,6 +870,76 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
return 0;
}
+/* Bitmap functions
+ */
+
+/**
+ * netlbl_bitmap_walk - Walk a bitmap looking for a bit
+ * @bitmap: the bitmap
+ * @bitmap_len: length in bits
+ * @offset: starting offset
+ * @state: if non-zero, look for a set (1) bit else look for a cleared (0) bit
+ *
+ * Description:
+ * Starting at @offset, walk the bitmap from left to right until either the
+ * desired bit is found or we reach the end. Return the bit offset, -1 if
+ * not found, or -2 if error.
+ */
+int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len,
+ u32 offset, u8 state)
+{
+ u32 bit_spot;
+ u32 byte_offset;
+ unsigned char bitmask;
+ unsigned char byte;
+
+ byte_offset = offset / 8;
+ byte = bitmap[byte_offset];
+ bit_spot = offset;
+ bitmask = 0x80 >> (offset % 8);
+
+ while (bit_spot < bitmap_len) {
+ if ((state && (byte & bitmask) == bitmask) ||
+ (state == 0 && (byte & bitmask) == 0))
+ return bit_spot;
+
+ bit_spot++;
+ bitmask >>= 1;
+ if (bitmask == 0) {
+ byte = bitmap[++byte_offset];
+ bitmask = 0x80;
+ }
+ }
+
+ return -1;
+}
+EXPORT_SYMBOL(netlbl_bitmap_walk);
+
+/**
+ * netlbl_bitmap_setbit - Sets a single bit in a bitmap
+ * @bitmap: the bitmap
+ * @bit: the bit
+ * @state: if non-zero, set the bit (1) else clear the bit (0)
+ *
+ * Description:
+ * Set a single bit in the bitmask. Returns zero on success, negative values
+ * on error.
+ */
+void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state)
+{
+ u32 byte_spot;
+ u8 bitmask;
+
+ /* gcc always rounds to zero when doing integer division */
+ byte_spot = bit / 8;
+ bitmask = 0x80 >> (bit % 8);
+ if (state)
+ bitmap[byte_spot] |= bitmask;
+ else
+ bitmap[byte_spot] &= ~bitmask;
+}
+EXPORT_SYMBOL(netlbl_bitmap_setbit);
+
/*
* LSM Functions
*/
@@ -774,7 +987,7 @@ int netlbl_sock_setattr(struct sock *sk,
struct netlbl_dom_map *dom_entry;
rcu_read_lock();
- dom_entry = netlbl_domhsh_getentry(secattr->domain);
+ dom_entry = netlbl_domhsh_getentry(secattr->domain, family);
if (dom_entry == NULL) {
ret_val = -ENOENT;
goto socket_setattr_return;
@@ -799,9 +1012,21 @@ int netlbl_sock_setattr(struct sock *sk,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- /* since we don't support any IPv6 labeling protocols right
- * now we can optimize everything away until we do */
- ret_val = 0;
+ switch (dom_entry->def.type) {
+ case NETLBL_NLTYPE_ADDRSELECT:
+ ret_val = -EDESTADDRREQ;
+ break;
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = calipso_sock_setattr(sk,
+ dom_entry->def.calipso,
+ secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
break;
#endif /* IPv6 */
default:
@@ -824,7 +1049,16 @@ socket_setattr_return:
*/
void netlbl_sock_delattr(struct sock *sk)
{
- cipso_v4_sock_delattr(sk);
+ switch (sk->sk_family) {
+ case AF_INET:
+ cipso_v4_sock_delattr(sk);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ calipso_sock_delattr(sk);
+ break;
+#endif /* IPv6 */
+ }
}
/**
@@ -850,7 +1084,7 @@ int netlbl_sock_getattr(struct sock *sk,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- ret_val = -ENOMSG;
+ ret_val = calipso_sock_getattr(sk, secattr);
break;
#endif /* IPv6 */
default:
@@ -878,6 +1112,9 @@ int netlbl_conn_setattr(struct sock *sk,
{
int ret_val;
struct sockaddr_in *addr4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 *addr6;
+#endif
struct netlbl_dommap_def *entry;
rcu_read_lock();
@@ -898,7 +1135,7 @@ int netlbl_conn_setattr(struct sock *sk,
case NETLBL_NLTYPE_UNLABELED:
/* just delete the protocols we support for right now
* but we could remove other protocols if needed */
- cipso_v4_sock_delattr(sk);
+ netlbl_sock_delattr(sk);
ret_val = 0;
break;
default:
@@ -907,9 +1144,27 @@ int netlbl_conn_setattr(struct sock *sk,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- /* since we don't support any IPv6 labeling protocols right
- * now we can optimize everything away until we do */
- ret_val = 0;
+ addr6 = (struct sockaddr_in6 *)addr;
+ entry = netlbl_domhsh_getentry_af6(secattr->domain,
+ &addr6->sin6_addr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto conn_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = calipso_sock_setattr(sk,
+ entry->calipso, secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ /* just delete the protocols we support for right now
+ * but we could remove other protocols if needed */
+ netlbl_sock_delattr(sk);
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
break;
#endif /* IPv6 */
default:
@@ -936,12 +1191,13 @@ int netlbl_req_setattr(struct request_sock *req,
{
int ret_val;
struct netlbl_dommap_def *entry;
+ struct inet_request_sock *ireq = inet_rsk(req);
rcu_read_lock();
switch (req->rsk_ops->family) {
case AF_INET:
entry = netlbl_domhsh_getentry_af4(secattr->domain,
- inet_rsk(req)->ir_rmt_addr);
+ ireq->ir_rmt_addr);
if (entry == NULL) {
ret_val = -ENOENT;
goto req_setattr_return;
@@ -952,9 +1208,7 @@ int netlbl_req_setattr(struct request_sock *req,
entry->cipso, secattr);
break;
case NETLBL_NLTYPE_UNLABELED:
- /* just delete the protocols we support for right now
- * but we could remove other protocols if needed */
- cipso_v4_req_delattr(req);
+ netlbl_req_delattr(req);
ret_val = 0;
break;
default:
@@ -963,9 +1217,24 @@ int netlbl_req_setattr(struct request_sock *req,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- /* since we don't support any IPv6 labeling protocols right
- * now we can optimize everything away until we do */
- ret_val = 0;
+ entry = netlbl_domhsh_getentry_af6(secattr->domain,
+ &ireq->ir_v6_rmt_addr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto req_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = calipso_req_setattr(req,
+ entry->calipso, secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ netlbl_req_delattr(req);
+ ret_val = 0;
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
break;
#endif /* IPv6 */
default:
@@ -987,7 +1256,16 @@ req_setattr_return:
*/
void netlbl_req_delattr(struct request_sock *req)
{
- cipso_v4_req_delattr(req);
+ switch (req->rsk_ops->family) {
+ case AF_INET:
+ cipso_v4_req_delattr(req);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ calipso_req_delattr(req);
+ break;
+#endif /* IPv6 */
+ }
}
/**
@@ -1007,13 +1285,17 @@ int netlbl_skbuff_setattr(struct sk_buff *skb,
{
int ret_val;
struct iphdr *hdr4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct ipv6hdr *hdr6;
+#endif
struct netlbl_dommap_def *entry;
rcu_read_lock();
switch (family) {
case AF_INET:
hdr4 = ip_hdr(skb);
- entry = netlbl_domhsh_getentry_af4(secattr->domain,hdr4->daddr);
+ entry = netlbl_domhsh_getentry_af4(secattr->domain,
+ hdr4->daddr);
if (entry == NULL) {
ret_val = -ENOENT;
goto skbuff_setattr_return;
@@ -1034,9 +1316,26 @@ int netlbl_skbuff_setattr(struct sk_buff *skb,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- /* since we don't support any IPv6 labeling protocols right
- * now we can optimize everything away until we do */
- ret_val = 0;
+ hdr6 = ipv6_hdr(skb);
+ entry = netlbl_domhsh_getentry_af6(secattr->domain,
+ &hdr6->daddr);
+ if (entry == NULL) {
+ ret_val = -ENOENT;
+ goto skbuff_setattr_return;
+ }
+ switch (entry->type) {
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = calipso_skbuff_setattr(skb, entry->calipso,
+ secattr);
+ break;
+ case NETLBL_NLTYPE_UNLABELED:
+ /* just delete the protocols we support for right now
+ * but we could remove other protocols if needed */
+ ret_val = calipso_skbuff_delattr(skb);
+ break;
+ default:
+ ret_val = -ENOENT;
+ }
break;
#endif /* IPv6 */
default:
@@ -1075,6 +1374,9 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
+ ptr = calipso_optptr(skb);
+ if (ptr && calipso_getattr(ptr, secattr) == 0)
+ return 0;
break;
#endif /* IPv6 */
}
@@ -1085,6 +1387,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
/**
* netlbl_skbuff_err - Handle a LSM error on a sk_buff
* @skb: the packet
+ * @family: the family
* @error: the error code
* @gateway: true if host is acting as a gateway, false otherwise
*
@@ -1094,10 +1397,14 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
* according to the packet's labeling protocol.
*
*/
-void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway)
+void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway)
{
- if (cipso_v4_optptr(skb))
- cipso_v4_error(skb, error, gateway);
+ switch (family) {
+ case AF_INET:
+ if (cipso_v4_optptr(skb))
+ cipso_v4_error(skb, error, gateway);
+ break;
+ }
}
/**
@@ -1112,11 +1419,15 @@ void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway)
void netlbl_cache_invalidate(void)
{
cipso_v4_cache_invalidate();
+#if IS_ENABLED(CONFIG_IPV6)
+ calipso_cache_invalidate();
+#endif /* IPv6 */
}
/**
* netlbl_cache_add - Add an entry to a NetLabel protocol cache
* @skb: the packet
+ * @family: the family
* @secattr: the packet's security attributes
*
* Description:
@@ -1125,7 +1436,7 @@ void netlbl_cache_invalidate(void)
* values on error.
*
*/
-int netlbl_cache_add(const struct sk_buff *skb,
+int netlbl_cache_add(const struct sk_buff *skb, u16 family,
const struct netlbl_lsm_secattr *secattr)
{
unsigned char *ptr;
@@ -1133,10 +1444,20 @@ int netlbl_cache_add(const struct sk_buff *skb,
if ((secattr->flags & NETLBL_SECATTR_CACHE) == 0)
return -ENOMSG;
- ptr = cipso_v4_optptr(skb);
- if (ptr)
- return cipso_v4_cache_add(ptr, secattr);
-
+ switch (family) {
+ case AF_INET:
+ ptr = cipso_v4_optptr(skb);
+ if (ptr)
+ return cipso_v4_cache_add(ptr, secattr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ ptr = calipso_optptr(skb);
+ if (ptr)
+ return calipso_cache_add(ptr, secattr);
+ break;
+#endif /* IPv6 */
+ }
return -ENOMSG;
}
@@ -1161,6 +1482,7 @@ struct audit_buffer *netlbl_audit_start(int type,
{
return netlbl_audit_start_common(type, audit_info);
}
+EXPORT_SYMBOL(netlbl_audit_start);
/*
* Setup Functions
diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c
index 13f777f20995..f85d0e07af2d 100644
--- a/net/netlabel/netlabel_mgmt.c
+++ b/net/netlabel/netlabel_mgmt.c
@@ -41,8 +41,10 @@
#include <net/ipv6.h>
#include <net/netlabel.h>
#include <net/cipso_ipv4.h>
+#include <net/calipso.h>
#include <linux/atomic.h>
+#include "netlabel_calipso.h"
#include "netlabel_domainhash.h"
#include "netlabel_user.h"
#include "netlabel_mgmt.h"
@@ -72,6 +74,8 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = {
[NLBL_MGMT_A_PROTOCOL] = { .type = NLA_U32 },
[NLBL_MGMT_A_VERSION] = { .type = NLA_U32 },
[NLBL_MGMT_A_CV4DOI] = { .type = NLA_U32 },
+ [NLBL_MGMT_A_FAMILY] = { .type = NLA_U16 },
+ [NLBL_MGMT_A_CLPDOI] = { .type = NLA_U32 },
};
/*
@@ -95,6 +99,9 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
int ret_val = -EINVAL;
struct netlbl_domaddr_map *addrmap = NULL;
struct cipso_v4_doi *cipsov4 = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct calipso_doi *calipso = NULL;
+#endif
u32 tmp_val;
struct netlbl_dom_map *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
@@ -119,6 +126,11 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
switch (entry->def.type) {
case NETLBL_NLTYPE_UNLABELED:
+ if (info->attrs[NLBL_MGMT_A_FAMILY])
+ entry->family =
+ nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]);
+ else
+ entry->family = AF_UNSPEC;
break;
case NETLBL_NLTYPE_CIPSOV4:
if (!info->attrs[NLBL_MGMT_A_CV4DOI])
@@ -128,12 +140,30 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
cipsov4 = cipso_v4_doi_getdef(tmp_val);
if (cipsov4 == NULL)
goto add_free_domain;
+ entry->family = AF_INET;
entry->def.cipso = cipsov4;
break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NETLBL_NLTYPE_CALIPSO:
+ if (!info->attrs[NLBL_MGMT_A_CLPDOI])
+ goto add_free_domain;
+
+ tmp_val = nla_get_u32(info->attrs[NLBL_MGMT_A_CLPDOI]);
+ calipso = calipso_doi_getdef(tmp_val);
+ if (calipso == NULL)
+ goto add_free_domain;
+ entry->family = AF_INET6;
+ entry->def.calipso = calipso;
+ break;
+#endif /* IPv6 */
default:
goto add_free_domain;
}
+ if ((entry->family == AF_INET && info->attrs[NLBL_MGMT_A_IPV6ADDR]) ||
+ (entry->family == AF_INET6 && info->attrs[NLBL_MGMT_A_IPV4ADDR]))
+ goto add_doi_put_def;
+
if (info->attrs[NLBL_MGMT_A_IPV4ADDR]) {
struct in_addr *addr;
struct in_addr *mask;
@@ -178,6 +208,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
goto add_free_addrmap;
}
+ entry->family = AF_INET;
entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
entry->def.addrsel = addrmap;
#if IS_ENABLED(CONFIG_IPV6)
@@ -220,6 +251,8 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
map->list.mask = *mask;
map->list.valid = 1;
map->def.type = entry->def.type;
+ if (calipso)
+ map->def.calipso = calipso;
ret_val = netlbl_af6list_add(&map->list, &addrmap->list6);
if (ret_val != 0) {
@@ -227,6 +260,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info,
goto add_free_addrmap;
}
+ entry->family = AF_INET6;
entry->def.type = NETLBL_NLTYPE_ADDRSELECT;
entry->def.addrsel = addrmap;
#endif /* IPv6 */
@@ -242,6 +276,9 @@ add_free_addrmap:
kfree(addrmap);
add_doi_put_def:
cipso_v4_doi_putdef(cipsov4);
+#if IS_ENABLED(CONFIG_IPV6)
+ calipso_doi_putdef(calipso);
+#endif
add_free_domain:
kfree(entry->domain);
add_free_entry:
@@ -278,6 +315,10 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb,
return ret_val;
}
+ ret_val = nla_put_u16(skb, NLBL_MGMT_A_FAMILY, entry->family);
+ if (ret_val != 0)
+ return ret_val;
+
switch (entry->def.type) {
case NETLBL_NLTYPE_ADDRSELECT:
nla_a = nla_nest_start(skb, NLBL_MGMT_A_SELECTORLIST);
@@ -340,6 +381,15 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb,
if (ret_val != 0)
return ret_val;
+ switch (map6->def.type) {
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI,
+ map6->def.calipso->doi);
+ if (ret_val != 0)
+ return ret_val;
+ break;
+ }
+
nla_nest_end(skb, nla_b);
}
#endif /* IPv6 */
@@ -347,15 +397,25 @@ static int netlbl_mgmt_listentry(struct sk_buff *skb,
nla_nest_end(skb, nla_a);
break;
case NETLBL_NLTYPE_UNLABELED:
- ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type);
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+ entry->def.type);
break;
case NETLBL_NLTYPE_CIPSOV4:
- ret_val = nla_put_u32(skb,NLBL_MGMT_A_PROTOCOL,entry->def.type);
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+ entry->def.type);
if (ret_val != 0)
return ret_val;
ret_val = nla_put_u32(skb, NLBL_MGMT_A_CV4DOI,
entry->def.cipso->doi);
break;
+ case NETLBL_NLTYPE_CALIPSO:
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_PROTOCOL,
+ entry->def.type);
+ if (ret_val != 0)
+ return ret_val;
+ ret_val = nla_put_u32(skb, NLBL_MGMT_A_CLPDOI,
+ entry->def.calipso->doi);
+ break;
}
return ret_val;
@@ -418,7 +478,7 @@ static int netlbl_mgmt_remove(struct sk_buff *skb, struct genl_info *info)
netlbl_netlink_auditinfo(skb, &audit_info);
domain = nla_data(info->attrs[NLBL_MGMT_A_DOMAIN]);
- return netlbl_domhsh_remove(domain, &audit_info);
+ return netlbl_domhsh_remove(domain, AF_UNSPEC, &audit_info);
}
/**
@@ -536,7 +596,7 @@ static int netlbl_mgmt_removedef(struct sk_buff *skb, struct genl_info *info)
netlbl_netlink_auditinfo(skb, &audit_info);
- return netlbl_domhsh_remove_default(&audit_info);
+ return netlbl_domhsh_remove_default(AF_UNSPEC, &audit_info);
}
/**
@@ -556,6 +616,12 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *ans_skb = NULL;
void *data;
struct netlbl_dom_map *entry;
+ u16 family;
+
+ if (info->attrs[NLBL_MGMT_A_FAMILY])
+ family = nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]);
+ else
+ family = AF_INET;
ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (ans_skb == NULL)
@@ -566,7 +632,7 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info)
goto listdef_failure;
rcu_read_lock();
- entry = netlbl_domhsh_getentry(NULL);
+ entry = netlbl_domhsh_getentry(NULL, family);
if (entry == NULL) {
ret_val = -ENOENT;
goto listdef_failure_lock;
@@ -651,6 +717,15 @@ static int netlbl_mgmt_protocols(struct sk_buff *skb,
goto protocols_return;
protos_sent++;
}
+#if IS_ENABLED(CONFIG_IPV6)
+ if (protos_sent == 2) {
+ if (netlbl_mgmt_protocols_cb(skb,
+ cb,
+ NETLBL_NLTYPE_CALIPSO) < 0)
+ goto protocols_return;
+ protos_sent++;
+ }
+#endif
protocols_return:
cb->args[0] = protos_sent;
diff --git a/net/netlabel/netlabel_mgmt.h b/net/netlabel/netlabel_mgmt.h
index 8b6e1ab62b48..ea01e42bca78 100644
--- a/net/netlabel/netlabel_mgmt.h
+++ b/net/netlabel/netlabel_mgmt.h
@@ -58,7 +58,10 @@
*
* NLBL_MGMT_A_CV4DOI
*
- * If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ * If using NETLBL_NLTYPE_UNLABELED no other attributes are required,
+ * however the following attribute may optionally be sent:
+ *
+ * NLBL_MGMT_A_FAMILY
*
* o REMOVE:
* Sent by an application to remove a domain mapping from the NetLabel
@@ -77,6 +80,7 @@
* Required attributes:
*
* NLBL_MGMT_A_DOMAIN
+ * NLBL_MGMT_A_FAMILY
*
* If the IP address selectors are not used the following attribute is
* required:
@@ -108,7 +112,10 @@
*
* NLBL_MGMT_A_CV4DOI
*
- * If using NETLBL_NLTYPE_UNLABELED no other attributes are required.
+ * If using NETLBL_NLTYPE_UNLABELED no other attributes are required,
+ * however the following attribute may optionally be sent:
+ *
+ * NLBL_MGMT_A_FAMILY
*
* o REMOVEDEF:
* Sent by an application to remove the default domain mapping from the
@@ -117,13 +124,17 @@
* o LISTDEF:
* This message can be sent either from an application or by the kernel in
* response to an application generated LISTDEF message. When sent by an
- * application there is no payload. On success the kernel should send a
- * response using the following format.
+ * application there may be an optional payload.
*
- * If the IP address selectors are not used the following attribute is
+ * NLBL_MGMT_A_FAMILY
+ *
+ * On success the kernel should send a response using the following format:
+ *
+ * If the IP address selectors are not used the following attributes are
* required:
*
* NLBL_MGMT_A_PROTOCOL
+ * NLBL_MGMT_A_FAMILY
*
* If the IP address selectors are used then the following attritbute is
* required:
@@ -209,6 +220,12 @@ enum {
/* (NLA_NESTED)
* the selector list, there must be at least one
* NLBL_MGMT_A_ADDRSELECTOR attribute */
+ NLBL_MGMT_A_FAMILY,
+ /* (NLA_U16)
+ * The address family */
+ NLBL_MGMT_A_CLPDOI,
+ /* (NLA_U32)
+ * the CALIPSO DOI value */
__NLBL_MGMT_A_MAX,
};
#define NLBL_MGMT_A_MAX (__NLBL_MGMT_A_MAX - 1)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 9eaa9a1e8629..4528cff9138b 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -116,8 +116,8 @@ struct netlbl_unlhsh_walk_arg {
static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
#define netlbl_unlhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
-static struct netlbl_unlhsh_tbl *netlbl_unlhsh;
-static struct netlbl_unlhsh_iface *netlbl_unlhsh_def;
+static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh;
+static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def;
/* Accept unlabeled packets flag */
static u8 netlabel_unlabel_acceptflg;
@@ -1537,6 +1537,7 @@ int __init netlbl_unlabel_defconf(void)
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (entry == NULL)
return -ENOMEM;
+ entry->family = AF_UNSPEC;
entry->def.type = NETLBL_NLTYPE_UNLABELED;
ret_val = netlbl_domhsh_add_default(entry, &audit_info);
if (ret_val != 0)
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index adf8b7900da2..58495f44c62a 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -44,6 +44,7 @@
#include "netlabel_mgmt.h"
#include "netlabel_unlabeled.h"
#include "netlabel_cipso_v4.h"
+#include "netlabel_calipso.h"
#include "netlabel_user.h"
/*
@@ -71,6 +72,10 @@ int __init netlbl_netlink_init(void)
if (ret_val != 0)
return ret_val;
+ ret_val = netlbl_calipso_genl_init();
+ if (ret_val != 0)
+ return ret_val;
+
return netlbl_unlabel_genl_init();
}
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 7425f6c23888..1f1682b9a6a8 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -610,7 +610,8 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
/* We will generate more packets, so re-queue
* auth chunk.
*/
- list_add(&chunk->list, &packet->chunk_list);
+ list_add(&packet->auth->list,
+ &packet->chunk_list);
} else {
sctp_chunk_free(packet->auth);
packet->auth = NULL;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 8812e1bf6c1c..9fc417a8b476 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2079,7 +2079,7 @@ static int sctp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
lock_sock(sk);
if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED) &&
- !sctp_sstate(sk, CLOSING)) {
+ !sctp_sstate(sk, CLOSING) && !sctp_sstate(sk, CLOSED)) {
err = -ENOTCONN;
goto out;
}
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index ec166d2bd2d9..877e55066f89 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -204,7 +204,9 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
/* If the socket is just going to throw this away, do not
* even try to deliver it.
*/
- if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
+ if (sk->sk_shutdown & RCV_SHUTDOWN &&
+ (sk->sk_shutdown & SEND_SHUTDOWN ||
+ !sctp_ulpevent_is_notification(event)))
goto out_free;
if (!sctp_ulpevent_is_notification(event)) {
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 040ff627c18a..a7e42f9a405c 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -51,9 +51,7 @@ static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp)
ret = kstrtoul(val, 0, &num);
if (ret == -EINVAL)
goto out_inval;
- nbits = fls(num);
- if (num > (1U << nbits))
- nbits++;
+ nbits = fls(num - 1);
if (nbits > MAX_HASHTABLE_BITS || nbits < 2)
goto out_inval;
*(unsigned int *)kp->arg = nbits;
@@ -359,8 +357,10 @@ rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred)
EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify);
bool
-rpcauth_cred_key_to_expire(struct rpc_cred *cred)
+rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred)
{
+ if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
+ return false;
if (!cred->cr_ops->crkey_to_expire)
return false;
return cred->cr_ops->crkey_to_expire(cred);
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 54dd3fdead54..168219535a34 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -224,7 +224,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
/* Fast track for non crkey_timeout (no key) underlying credentials */
- if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags))
+ if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT)
return 0;
/* Fast track for the normal case */
@@ -236,12 +236,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
if (IS_ERR(tcred))
return -EACCES;
- if (!tcred->cr_ops->crkey_timeout) {
- set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags);
- ret = 0;
- goto out_put;
- }
-
/* Test for the almost error case */
ret = tcred->cr_ops->crkey_timeout(tcred);
if (ret != 0) {
@@ -257,7 +251,6 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags);
}
-out_put:
put_rpccred(tcred);
return ret;
}
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index e64ae93d5b4f..23c8e7c39656 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1015,8 +1015,11 @@ gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_flags = 0;
auth->au_ops = &authgss_ops;
auth->au_flavor = flavor;
+ if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor))
+ auth->au_flags |= RPCAUTH_AUTH_DATATOUCH;
atomic_set(&auth->au_count, 1);
kref_init(&gss_auth->kref);
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 65427492b1c9..60595835317a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -745,12 +745,14 @@ static struct pf_desc gss_kerberos_pfs[] = {
.qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_INTEGRITY,
.name = "krb5i",
+ .datatouch = true,
},
[2] = {
.pseudoflavor = RPC_AUTH_GSS_KRB5P,
.qop = GSS_C_QOP_DEFAULT,
.service = RPC_GSS_SVC_PRIVACY,
.name = "krb5p",
+ .datatouch = true,
},
};
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 7063d856a598..5fec3abbe19b 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -361,6 +361,18 @@ gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor)
}
EXPORT_SYMBOL(gss_pseudoflavor_to_service);
+bool
+gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor)
+{
+ int i;
+
+ for (i = 0; i < gm->gm_pf_num; i++) {
+ if (gm->gm_pfs[i].pseudoflavor == pseudoflavor)
+ return gm->gm_pfs[i].datatouch;
+ }
+ return false;
+}
+
char *
gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service)
{
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index e085f5ae1548..1d281816f2bf 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1230,8 +1230,9 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
if (status)
goto out;
- dprintk("RPC: svcauth_gss: gss major status = %d\n",
- ud.major_status);
+ dprintk("RPC: svcauth_gss: gss major status = %d "
+ "minor status = %d\n",
+ ud.major_status, ud.minor_status);
switch (ud.major_status) {
case GSS_S_CONTINUE_NEEDED:
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 8d9eb4d5ddd8..4d17376b2acb 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -115,6 +115,7 @@ static
struct rpc_auth null_auth = {
.au_cslack = NUL_CALLSLACK,
.au_rslack = NUL_REPLYSLACK,
+ .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
.au_ops = &authnull_ops,
.au_flavor = RPC_AUTH_NULL,
.au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 9f65452b7cbc..a99278c984e8 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -228,6 +228,7 @@ static
struct rpc_auth unix_auth = {
.au_cslack = UNX_CALLSLACK,
.au_rslack = NUL_REPLYSLACK,
+ .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT,
.au_ops = &authunix_ops,
.au_flavor = RPC_AUTH_UNIX,
.au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 553bf95f7003..4d8e11f94a35 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -362,7 +362,7 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
cache_purge(cd);
spin_lock(&cache_list_lock);
write_lock(&cd->hash_lock);
- if (cd->entries || atomic_read(&cd->inuse)) {
+ if (cd->entries) {
write_unlock(&cd->hash_lock);
spin_unlock(&cache_list_lock);
goto out;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2808d550d273..cb49898a5a58 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2577,7 +2577,7 @@ static void rpc_cb_add_xprt_release(void *calldata)
kfree(data);
}
-const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
+static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
.rpc_call_done = rpc_cb_add_xprt_done,
.rpc_release = rpc_cb_add_xprt_release,
};
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index fc48eca21fd2..84f98cbe31c3 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1386,7 +1386,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
struct dentry *root, *gssd_dentry;
- struct net *net = data;
+ struct net *net = get_net(sb->s_fs_info);
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int err;
@@ -1419,7 +1419,6 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
sb);
if (err)
goto err_depopulate;
- sb->s_fs_info = get_net(net);
mutex_unlock(&sn->pipefs_sb_lock);
return 0;
@@ -1448,7 +1447,8 @@ static struct dentry *
rpc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_ns(fs_type, flags, current->nsproxy->net_ns, rpc_fill_super);
+ struct net *net = current->nsproxy->net_ns;
+ return mount_ns(fs_type, flags, data, net, net->user_ns, rpc_fill_super);
}
static void rpc_kill_sb(struct super_block *sb)
@@ -1468,9 +1468,9 @@ static void rpc_kill_sb(struct super_block *sb)
RPC_PIPEFS_UMOUNT,
sb);
mutex_unlock(&sn->pipefs_sb_lock);
- put_net(net);
out:
kill_litter_super(sb);
+ put_net(net);
}
static struct file_system_type rpc_pipe_fs_type = {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index fcfd48d263f6..9ae588511aaf 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -54,7 +54,8 @@ static struct rpc_wait_queue delay_queue;
/*
* rpciod-related stuff
*/
-struct workqueue_struct *rpciod_workqueue;
+struct workqueue_struct *rpciod_workqueue __read_mostly;
+struct workqueue_struct *xprtiod_workqueue __read_mostly;
/*
* Disable the timer for a given RPC task. Should be called with
@@ -329,7 +330,8 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
* lockless RPC_IS_QUEUED() test) before we've had a chance to test
* the RPC_TASK_RUNNING flag.
*/
-static void rpc_make_runnable(struct rpc_task *task)
+static void rpc_make_runnable(struct workqueue_struct *wq,
+ struct rpc_task *task)
{
bool need_wakeup = !rpc_test_and_set_running(task);
@@ -338,7 +340,7 @@ static void rpc_make_runnable(struct rpc_task *task)
return;
if (RPC_IS_ASYNC(task)) {
INIT_WORK(&task->u.tk_work, rpc_async_schedule);
- queue_work(rpciod_workqueue, &task->u.tk_work);
+ queue_work(wq, &task->u.tk_work);
} else
wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
}
@@ -407,13 +409,16 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
/**
- * __rpc_do_wake_up_task - wake up a single rpc_task
+ * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task
+ * @wq: workqueue on which to run task
* @queue: wait queue
* @task: task to be woken up
*
* Caller must hold queue->lock, and have cleared the task queued flag.
*/
-static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue,
+ struct rpc_task *task)
{
dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n",
task->tk_pid, jiffies);
@@ -428,7 +433,7 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
__rpc_remove_wait_queue(queue, task);
- rpc_make_runnable(task);
+ rpc_make_runnable(wq, task);
dprintk("RPC: __rpc_wake_up_task done\n");
}
@@ -436,16 +441,25 @@ static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task
/*
* Wake up a queued task while the queue lock is being held
*/
-static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue, struct rpc_task *task)
{
if (RPC_IS_QUEUED(task)) {
smp_rmb();
if (task->tk_waitqueue == queue)
- __rpc_do_wake_up_task(queue, task);
+ __rpc_do_wake_up_task_on_wq(wq, queue, task);
}
}
/*
+ * Wake up a queued task while the queue lock is being held
+ */
+static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task)
+{
+ rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
+}
+
+/*
* Wake up a task on a specific queue
*/
void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task)
@@ -518,7 +532,8 @@ static struct rpc_task *__rpc_find_next_queued(struct rpc_wait_queue *queue)
/*
* Wake up the first task on the wait queue.
*/
-struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
+ struct rpc_wait_queue *queue,
bool (*func)(struct rpc_task *, void *), void *data)
{
struct rpc_task *task = NULL;
@@ -529,7 +544,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
task = __rpc_find_next_queued(queue);
if (task != NULL) {
if (func(task, data))
- rpc_wake_up_task_queue_locked(queue, task);
+ rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
else
task = NULL;
}
@@ -537,6 +552,15 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
return task;
}
+
+/*
+ * Wake up the first task on the wait queue.
+ */
+struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue,
+ bool (*func)(struct rpc_task *, void *), void *data)
+{
+ return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data);
+}
EXPORT_SYMBOL_GPL(rpc_wake_up_first);
static bool rpc_wake_up_next_func(struct rpc_task *task, void *data)
@@ -814,7 +838,7 @@ void rpc_execute(struct rpc_task *task)
bool is_async = RPC_IS_ASYNC(task);
rpc_set_active(task);
- rpc_make_runnable(task);
+ rpc_make_runnable(rpciod_workqueue, task);
if (!is_async)
__rpc_execute(task);
}
@@ -1071,10 +1095,22 @@ static int rpciod_start(void)
* Create the rpciod thread and wait for it to start.
*/
dprintk("RPC: creating workqueue rpciod\n");
- /* Note: highpri because network receive is latency sensitive */
- wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
+ if (!wq)
+ goto out_failed;
rpciod_workqueue = wq;
- return rpciod_workqueue != NULL;
+ /* Note: highpri because network receive is latency sensitive */
+ wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ if (!wq)
+ goto free_rpciod;
+ xprtiod_workqueue = wq;
+ return 1;
+free_rpciod:
+ wq = rpciod_workqueue;
+ rpciod_workqueue = NULL;
+ destroy_workqueue(wq);
+out_failed:
+ return 0;
}
static void rpciod_stop(void)
@@ -1088,6 +1124,9 @@ static void rpciod_stop(void)
wq = rpciod_workqueue;
rpciod_workqueue = NULL;
destroy_workqueue(wq);
+ wq = xprtiod_workqueue;
+ xprtiod_workqueue = NULL;
+ destroy_workqueue(wq);
}
void
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index cc9852897395..c5b0cb4f4056 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1188,11 +1188,17 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
*statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
/* Encode reply */
- if (test_bit(RQ_DROPME, &rqstp->rq_flags)) {
+ if (*statp == rpc_drop_reply ||
+ test_bit(RQ_DROPME, &rqstp->rq_flags)) {
if (procp->pc_release)
procp->pc_release(rqstp, NULL, rqstp->rq_resp);
goto dropit;
}
+ if (*statp == rpc_autherr_badcred) {
+ if (procp->pc_release)
+ procp->pc_release(rqstp, NULL, rqstp->rq_resp);
+ goto err_bad_auth;
+ }
if (*statp == rpc_success &&
(xdr = procp->pc_encode) &&
!xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) {
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4f01f63102ee..c3f652395a80 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -21,6 +21,10 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+static unsigned int svc_rpc_per_connection_limit __read_mostly;
+module_param(svc_rpc_per_connection_limit, uint, 0644);
+
+
static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
static int svc_deferred_recv(struct svc_rqst *rqstp);
static struct cache_deferred_req *svc_defer(struct cache_req *req);
@@ -329,12 +333,45 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
}
EXPORT_SYMBOL_GPL(svc_print_addr);
+static bool svc_xprt_slots_in_range(struct svc_xprt *xprt)
+{
+ unsigned int limit = svc_rpc_per_connection_limit;
+ int nrqsts = atomic_read(&xprt->xpt_nr_rqsts);
+
+ return limit == 0 || (nrqsts >= 0 && nrqsts < limit);
+}
+
+static bool svc_xprt_reserve_slot(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+ if (!test_bit(RQ_DATA, &rqstp->rq_flags)) {
+ if (!svc_xprt_slots_in_range(xprt))
+ return false;
+ atomic_inc(&xprt->xpt_nr_rqsts);
+ set_bit(RQ_DATA, &rqstp->rq_flags);
+ }
+ return true;
+}
+
+static void svc_xprt_release_slot(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ if (test_and_clear_bit(RQ_DATA, &rqstp->rq_flags)) {
+ atomic_dec(&xprt->xpt_nr_rqsts);
+ svc_xprt_enqueue(xprt);
+ }
+}
+
static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
{
if (xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_CLOSE)))
return true;
- if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED)))
- return xprt->xpt_ops->xpo_has_wspace(xprt);
+ if (xprt->xpt_flags & ((1<<XPT_DATA)|(1<<XPT_DEFERRED))) {
+ if (xprt->xpt_ops->xpo_has_wspace(xprt) &&
+ svc_xprt_slots_in_range(xprt))
+ return true;
+ trace_svc_xprt_no_write_space(xprt);
+ return false;
+ }
return false;
}
@@ -480,8 +517,6 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
rqstp->rq_reserved = space;
- if (xprt->xpt_ops->xpo_adjust_wspace)
- xprt->xpt_ops->xpo_adjust_wspace(xprt);
svc_xprt_enqueue(xprt);
}
}
@@ -512,8 +547,8 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
rqstp->rq_res.head[0].iov_len = 0;
svc_reserve(rqstp, 0);
+ svc_xprt_release_slot(rqstp);
rqstp->rq_xprt = NULL;
-
svc_xprt_put(xprt);
}
@@ -781,7 +816,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
svc_add_new_temp_xprt(serv, newxpt);
else
module_put(xprt->xpt_class->xcl_owner);
- } else {
+ } else if (svc_xprt_reserve_slot(rqstp, xprt)) {
/* XPT_DATA|XPT_DEFERRED case: */
dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
rqstp, rqstp->rq_pool->sp_id, xprt,
@@ -871,6 +906,7 @@ EXPORT_SYMBOL_GPL(svc_recv);
*/
void svc_drop(struct svc_rqst *rqstp)
{
+ trace_svc_drop(rqstp);
dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
svc_xprt_release(rqstp);
}
@@ -1148,6 +1184,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
spin_unlock(&xprt->xpt_lock);
dprintk("revisit canceled\n");
svc_xprt_put(xprt);
+ trace_svc_drop_deferred(dr);
kfree(dr);
return;
}
@@ -1205,6 +1242,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req)
set_bit(RQ_DROPME, &rqstp->rq_flags);
dr->handle.revisit = svc_revisit;
+ trace_svc_defer(rqstp);
return &dr->handle;
}
@@ -1245,6 +1283,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
struct svc_deferred_req,
handle.recent);
list_del_init(&dr->handle.recent);
+ trace_svc_revisit_deferred(dr);
} else
clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
spin_unlock(&xprt->xpt_lock);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index dadfec66dbd8..57625f64efd5 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -60,7 +60,6 @@
static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
int flags);
-static void svc_udp_data_ready(struct sock *);
static int svc_udp_recvfrom(struct svc_rqst *);
static int svc_udp_sendto(struct svc_rqst *);
static void svc_sock_detach(struct svc_xprt *);
@@ -398,48 +397,21 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp)
return svc_port_is_privileged(svc_addr(rqstp));
}
-static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
-{
- if (!wq)
- return false;
- /*
- * There should normally be a memory * barrier here--see
- * wq_has_sleeper().
- *
- * It appears that isn't currently necessary, though, basically
- * because callers all appear to have sufficient memory barriers
- * between the time the relevant change is made and the
- * time they call these callbacks.
- *
- * The nfsd code itself doesn't actually explicitly wait on
- * these waitqueues, but it may wait on them for example in
- * sendpage() or sendmsg() calls. (And those may be the only
- * places, since it it uses nonblocking reads.)
- *
- * Maybe we should add the memory barriers anyway, but these are
- * hot paths so we'd need to be convinced there's no sigificant
- * penalty.
- */
- return waitqueue_active(wq);
-}
-
/*
* INET callback when data has been received on the socket.
*/
-static void svc_udp_data_ready(struct sock *sk)
+static void svc_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), busy=%d\n",
svsk, sk,
test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
+ svsk->sk_odata(sk);
+ if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags))
+ svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
}
/*
@@ -448,56 +420,22 @@ static void svc_udp_data_ready(struct sock *sk)
static void svc_write_space(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
- wait_queue_head_t *wq = sk_sleep(sk);
if (svsk) {
dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+ svsk->sk_owspace(sk);
svc_xprt_enqueue(&svsk->sk_xprt);
}
-
- if (sunrpc_waitqueue_active(wq)) {
- dprintk("RPC svc_write_space: someone sleeping on %p\n",
- svsk);
- wake_up_interruptible(wq);
- }
}
static int svc_tcp_has_wspace(struct svc_xprt *xprt)
{
- struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
- struct svc_serv *serv = svsk->sk_xprt.xpt_server;
- int required;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
return 1;
- required = atomic_read(&xprt->xpt_reserved) + serv->sv_max_mesg;
- if (sk_stream_wspace(svsk->sk_sk) >= required ||
- (sk_stream_min_wspace(svsk->sk_sk) == 0 &&
- atomic_read(&xprt->xpt_reserved) == 0))
- return 1;
- set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
- return 0;
-}
-
-static void svc_tcp_write_space(struct sock *sk)
-{
- struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
- struct socket *sock = sk->sk_socket;
-
- if (!sk_stream_is_writeable(sk) || !sock)
- return;
- if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt))
- clear_bit(SOCK_NOSPACE, &sock->flags);
- svc_write_space(sk);
-}
-
-static void svc_tcp_adjust_wspace(struct svc_xprt *xprt)
-{
- struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
-
- if (svc_tcp_has_wspace(xprt))
- clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return !test_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
}
/*
@@ -746,7 +684,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
&svsk->sk_xprt, serv);
clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
- svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
+ svsk->sk_sk->sk_data_ready = svc_data_ready;
svsk->sk_sk->sk_write_space = svc_write_space;
/* initialise setting must have enough space to
@@ -786,11 +724,12 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
static void svc_tcp_listen_data_ready(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq;
dprintk("svc: socket %p TCP (listen) state change %d\n",
sk, sk->sk_state);
+ if (svsk)
+ svsk->sk_odata(sk);
/*
* This callback may called twice when a new connection
* is established as a child socket inherits everything
@@ -808,10 +747,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
} else
printk("svc: socket %p: no user data\n", sk);
}
-
- wq = sk_sleep(sk);
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible_all(wq);
}
/*
@@ -820,7 +755,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
static void svc_tcp_state_change(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
sk, sk->sk_state, sk->sk_user_data);
@@ -828,26 +762,12 @@ static void svc_tcp_state_change(struct sock *sk)
if (!svsk)
printk("svc: socket %p: no user data\n", sk);
else {
- set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
- }
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible_all(wq);
-}
-
-static void svc_tcp_data_ready(struct sock *sk)
-{
- struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
- wait_queue_head_t *wq = sk_sleep(sk);
-
- dprintk("svc: socket %p TCP data ready (svsk %p)\n",
- sk, sk->sk_user_data);
- if (svsk) {
- set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- svc_xprt_enqueue(&svsk->sk_xprt);
+ svsk->sk_ostate(sk);
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
+ }
}
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
}
/*
@@ -901,6 +821,11 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
dprintk("%s: connect from %s\n", serv->sv_name,
__svc_print_addr(sin, buf, sizeof(buf)));
+ /* Reset the inherited callbacks before calling svc_setup_socket */
+ newsock->sk->sk_state_change = svsk->sk_ostate;
+ newsock->sk->sk_data_ready = svsk->sk_odata;
+ newsock->sk->sk_write_space = svsk->sk_owspace;
+
/* make sure that a write doesn't block forever when
* low on memory
*/
@@ -1317,7 +1242,6 @@ static struct svc_xprt_ops svc_tcp_ops = {
.xpo_has_wspace = svc_tcp_has_wspace,
.xpo_accept = svc_tcp_accept,
.xpo_secure_port = svc_sock_secure_port,
- .xpo_adjust_wspace = svc_tcp_adjust_wspace,
};
static struct svc_xprt_class svc_tcp_class = {
@@ -1357,8 +1281,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
} else {
dprintk("setting up TCP socket for reading\n");
sk->sk_state_change = svc_tcp_state_change;
- sk->sk_data_ready = svc_tcp_data_ready;
- sk->sk_write_space = svc_tcp_write_space;
+ sk->sk_data_ready = svc_data_ready;
+ sk->sk_write_space = svc_write_space;
svsk->sk_reclen = 0;
svsk->sk_tcplen = 0;
@@ -1368,8 +1292,13 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF;
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- if (sk->sk_state != TCP_ESTABLISHED)
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_ESTABLISHED:
+ break;
+ default:
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ }
}
}
@@ -1428,17 +1357,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
/* Initialize the socket */
if (sock->type == SOCK_DGRAM)
svc_udp_init(svsk, serv);
- else {
- /* initialise setting must have enough space to
- * receive and respond to one request.
- */
- svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg,
- 4 * serv->sv_max_mesg);
+ else
svc_tcp_init(svsk, serv);
- }
- dprintk("svc: svc_setup_socket created %p (inet %p)\n",
- svsk, svsk->sk_sk);
+ dprintk("svc: svc_setup_socket created %p (inet %p), "
+ "listen %d close %d\n",
+ svsk, svsk->sk_sk,
+ test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags),
+ test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
return svsk;
}
@@ -1606,18 +1532,16 @@ static void svc_sock_detach(struct svc_xprt *xprt)
{
struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sock *sk = svsk->sk_sk;
- wait_queue_head_t *wq;
dprintk("svc: svc_sock_detach(%p)\n", svsk);
/* put back the old socket callbacks */
+ lock_sock(sk);
sk->sk_state_change = svsk->sk_ostate;
sk->sk_data_ready = svsk->sk_odata;
sk->sk_write_space = svsk->sk_owspace;
-
- wq = sk_sleep(sk);
- if (sunrpc_waitqueue_active(wq))
- wake_up_interruptible(wq);
+ sk->sk_user_data = NULL;
+ release_sock(sk);
}
/*
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 216a1385718a..8313960cac52 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -220,7 +220,7 @@ static void xprt_clear_locked(struct rpc_xprt *xprt)
clear_bit(XPRT_LOCKED, &xprt->state);
smp_mb__after_atomic();
} else
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
}
/*
@@ -295,7 +295,8 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
return;
- if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt))
+ if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
+ __xprt_lock_write_func, xprt))
return;
xprt_clear_locked(xprt);
}
@@ -324,7 +325,8 @@ static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt)
return;
if (RPCXPRT_CONGESTED(xprt))
goto out_unlock;
- if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt))
+ if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending,
+ __xprt_lock_write_cong_func, xprt))
return;
out_unlock:
xprt_clear_locked(xprt);
@@ -645,7 +647,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock);
}
@@ -672,7 +674,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN);
out:
spin_unlock_bh(&xprt->transport_lock);
@@ -689,7 +691,7 @@ xprt_init_autodisconnect(unsigned long data)
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
goto out_abort;
spin_unlock(&xprt->transport_lock);
- queue_work(rpciod_workqueue, &xprt->task_cleanup);
+ queue_work(xprtiod_workqueue, &xprt->task_cleanup);
return;
out_abort:
spin_unlock(&xprt->transport_lock);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index e7fd76975d86..66c9d63f4797 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -271,14 +271,12 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi,
xprt_switch_find_xprt_t find_next)
{
struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
- struct list_head *head;
if (xps == NULL)
return NULL;
- head = &xps->xps_xprt_list;
- if (xps->xps_nxprts < 2)
- return xprt_switch_find_first_entry(head);
- return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next);
+ return xprt_switch_set_next_cursor(&xps->xps_xprt_list,
+ &xpi->xpi_cursor,
+ find_next);
}
static
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index dc9f3b513a05..ef19fa42c50f 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,7 +1,7 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o \
- fmr_ops.o frwr_ops.o physical_ops.o \
+ fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 6326ebe8b595..21cb3b150b37 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -19,13 +19,6 @@
* verb (fmr_op_unmap).
*/
-/* Transport recovery
- *
- * After a transport reconnect, fmr_op_map re-uses the MR already
- * allocated for the RPC, but generates a fresh rkey then maps the
- * MR again. This process is synchronous.
- */
-
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -35,62 +28,132 @@
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)
-static struct workqueue_struct *fmr_recovery_wq;
-
-#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND)
+/* Access mode of externally registered pages */
+enum {
+ RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ,
+};
-int
-fmr_alloc_recovery_wq(void)
+bool
+fmr_is_supported(struct rpcrdma_ia *ia)
{
- fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0);
- return !fmr_recovery_wq ? -ENOMEM : 0;
+ if (!ia->ri_device->alloc_fmr) {
+ pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
+ ia->ri_device->name);
+ return false;
+ }
+ return true;
}
-void
-fmr_destroy_recovery_wq(void)
+static int
+fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
{
- struct workqueue_struct *wq;
+ static struct ib_fmr_attr fmr_attr = {
+ .max_pages = RPCRDMA_MAX_FMR_SGES,
+ .max_maps = 1,
+ .page_shift = PAGE_SHIFT
+ };
- if (!fmr_recovery_wq)
- return;
+ mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
+ sizeof(u64), GFP_KERNEL);
+ if (!mw->fmr.fm_physaddrs)
+ goto out_free;
- wq = fmr_recovery_wq;
- fmr_recovery_wq = NULL;
- destroy_workqueue(wq);
+ mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
+ sizeof(*mw->mw_sg), GFP_KERNEL);
+ if (!mw->mw_sg)
+ goto out_free;
+
+ sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);
+
+ mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
+ &fmr_attr);
+ if (IS_ERR(mw->fmr.fm_mr))
+ goto out_fmr_err;
+
+ return 0;
+
+out_fmr_err:
+ dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
+ PTR_ERR(mw->fmr.fm_mr));
+
+out_free:
+ kfree(mw->mw_sg);
+ kfree(mw->fmr.fm_physaddrs);
+ return -ENOMEM;
}
static int
__fmr_unmap(struct rpcrdma_mw *mw)
{
LIST_HEAD(l);
+ int rc;
- list_add(&mw->fmr.fmr->list, &l);
- return ib_unmap_fmr(&l);
+ list_add(&mw->fmr.fm_mr->list, &l);
+ rc = ib_unmap_fmr(&l);
+ list_del_init(&mw->fmr.fm_mr->list);
+ return rc;
}
-/* Deferred reset of a single FMR. Generate a fresh rkey by
- * replacing the MR. There's no recovery if this fails.
- */
static void
-__fmr_recovery_worker(struct work_struct *work)
+fmr_op_release_mr(struct rpcrdma_mw *r)
{
- struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw,
- mw_work);
- struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ LIST_HEAD(unmap_list);
+ int rc;
- __fmr_unmap(mw);
- rpcrdma_put_mw(r_xprt, mw);
- return;
+ /* Ensure MW is not on any rl_registered list */
+ if (!list_empty(&r->mw_list))
+ list_del(&r->mw_list);
+
+ kfree(r->fmr.fm_physaddrs);
+ kfree(r->mw_sg);
+
+ /* In case this one was left mapped, try to unmap it
+ * to prevent dealloc_fmr from failing with EBUSY
+ */
+ rc = __fmr_unmap(r);
+ if (rc)
+ pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
+ r, rc);
+
+ rc = ib_dealloc_fmr(r->fmr.fm_mr);
+ if (rc)
+ pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
+ r, rc);
+
+ kfree(r);
}
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
+/* Reset of a single FMR.
*/
static void
-__fmr_queue_recovery(struct rpcrdma_mw *mw)
+fmr_op_recover_mr(struct rpcrdma_mw *mw)
{
- INIT_WORK(&mw->mw_work, __fmr_recovery_worker);
- queue_work(fmr_recovery_wq, &mw->mw_work);
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ int rc;
+
+ /* ORDER: invalidate first */
+ rc = __fmr_unmap(mw);
+
+ /* ORDER: then DMA unmap */
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (rc)
+ goto out_release;
+
+ rpcrdma_put_mw(r_xprt, mw);
+ r_xprt->rx_stats.mrs_recovered++;
+ return;
+
+out_release:
+ pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
+ r_xprt->rx_stats.mrs_orphaned++;
+
+ spin_lock(&r_xprt->rx_buf.rb_mwlock);
+ list_del(&mw->mw_all);
+ spin_unlock(&r_xprt->rx_buf.rb_mwlock);
+
+ fmr_op_release_mr(mw);
}
static int
@@ -112,86 +175,21 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
-static int
-fmr_op_init(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
- struct ib_fmr_attr fmr_attr = {
- .max_pages = RPCRDMA_MAX_FMR_SGES,
- .max_maps = 1,
- .page_shift = PAGE_SHIFT
- };
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- struct rpcrdma_mw *r;
- int i, rc;
-
- spin_lock_init(&buf->rb_mwlock);
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
-
- i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
- i += 2; /* head + tail */
- i *= buf->rb_max_requests; /* one set for each RPC slot */
- dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
-
- rc = -ENOMEM;
- while (i--) {
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- goto out;
-
- r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
- sizeof(u64), GFP_KERNEL);
- if (!r->fmr.physaddrs)
- goto out_free;
-
- r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
- if (IS_ERR(r->fmr.fmr))
- goto out_fmr_err;
-
- r->mw_xprt = r_xprt;
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
- return 0;
-
-out_fmr_err:
- rc = PTR_ERR(r->fmr.fmr);
- dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
- kfree(r->fmr.physaddrs);
-out_free:
- kfree(r);
-out:
- return rc;
-}
-
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static int
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
+ int nsegs, bool writing, struct rpcrdma_mw **out)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_device;
- enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
int len, pageoff, i, rc;
struct rpcrdma_mw *mw;
+ u64 *dma_pages;
- mw = seg1->rl_mw;
- seg1->rl_mw = NULL;
- if (!mw) {
- mw = rpcrdma_get_mw(r_xprt);
- if (!mw)
- return -ENOMEM;
- } else {
- /* this is a retransmit; generate a fresh rkey */
- rc = __fmr_unmap(mw);
- if (rc)
- return rc;
- }
+ mw = rpcrdma_get_mw(r_xprt);
+ if (!mw)
+ return -ENOBUFS;
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -200,8 +198,14 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > RPCRDMA_MAX_FMR_SGES)
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
- rpcrdma_map_one(device, seg, direction);
- mw->fmr.physaddrs[i] = seg->mr_dma;
+ if (seg->mr_page)
+ sg_set_page(&mw->mw_sg[i],
+ seg->mr_page,
+ seg->mr_len,
+ offset_in_page(seg->mr_offset));
+ else
+ sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
+ seg->mr_len);
len += seg->mr_len;
++seg;
++i;
@@ -210,49 +214,54 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
-
- rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs,
- i, seg1->mr_dma);
+ mw->mw_nents = i;
+ mw->mw_dir = rpcrdma_data_dir(writing);
+ if (i == 0)
+ goto out_dmamap_err;
+
+ if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir))
+ goto out_dmamap_err;
+
+ for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
+ dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
+ rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
+ dma_pages[0]);
if (rc)
goto out_maperr;
- seg1->rl_mw = mw;
- seg1->mr_rkey = mw->fmr.fmr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- return i;
+ mw->mw_handle = mw->fmr.fm_mr->rkey;
+ mw->mw_length = len;
+ mw->mw_offset = dma_pages[0] + pageoff;
-out_maperr:
- dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
- __func__, len, (unsigned long long)seg1->mr_dma,
- pageoff, i, rc);
- while (i--)
- rpcrdma_unmap_one(device, --seg);
- return rc;
-}
+ *out = mw;
+ return mw->mw_nents;
-static void
-__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- int nsegs = seg->mr_nsegs;
+out_dmamap_err:
+ pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+ mw->mw_sg, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
- while (nsegs--)
- rpcrdma_unmap_one(device, seg++);
+out_maperr:
+ pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+ len, (unsigned long long)dma_pages[0],
+ pageoff, mw->mw_nents, rc);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
}
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
*/
static void
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
- struct rpcrdma_mr_seg *seg;
- unsigned int i, nchunks;
- struct rpcrdma_mw *mw;
+ struct rpcrdma_mw *mw, *tmp;
LIST_HEAD(unmap_list);
int rc;
@@ -261,90 +270,54 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* ORDER: Invalidate all of the req's MRs first
*
* ib_unmap_fmr() is slow, so use a single call instead
- * of one call per mapped MR.
+ * of one call per mapped FMR.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
-
- list_add(&mw->fmr.fmr->list, &unmap_list);
-
- i += seg->mr_nsegs;
- }
+ list_for_each_entry(mw, &req->rl_registered, mw_list)
+ list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
rc = ib_unmap_fmr(&unmap_list);
if (rc)
- pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+ goto out_reset;
/* ORDER: Now DMA unmap all of the req's MRs, and return
* them to the free MW list.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->mw_list);
+ list_del_init(&mw->fmr.fm_mr->list);
+ ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ rpcrdma_put_mw(r_xprt, mw);
+ }
- __fmr_dma_unmap(r_xprt, seg);
- rpcrdma_put_mw(r_xprt, seg->rl_mw);
+ return;
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
+out_reset:
+ pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
- req->rl_nchunks = 0;
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->fmr.fm_mr->list);
+ fmr_op_recover_mr(mw);
+ }
}
/* Use a slow, safe mechanism to invalidate all memory regions
* that were registered for "req".
- *
- * In the asynchronous case, DMA unmapping occurs first here
- * because the rpcrdma_mr_seg is released immediately after this
- * call. It's contents won't be available in __fmr_dma_unmap later.
- * FIXME.
*/
static void
fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync)
{
- struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- unsigned int i;
-
- for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
-
- if (sync) {
- /* ORDER */
- __fmr_unmap(mw);
- __fmr_dma_unmap(r_xprt, seg);
- rpcrdma_put_mw(r_xprt, mw);
- } else {
- __fmr_dma_unmap(r_xprt, seg);
- __fmr_queue_recovery(mw);
- }
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
-}
-
-static void
-fmr_op_destroy(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
- int rc;
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- kfree(r->fmr.physaddrs);
- rc = ib_dealloc_fmr(r->fmr.fmr);
- if (rc)
- dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
- __func__, rc);
+ while (!list_empty(&req->rl_registered)) {
+ mw = list_first_entry(&req->rl_registered,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
- kfree(r);
+ if (sync)
+ fmr_op_recover_mr(mw);
+ else
+ rpcrdma_defer_mr_recovery(mw);
}
}
@@ -352,9 +325,10 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_unmap_sync = fmr_op_unmap_sync,
.ro_unmap_safe = fmr_op_unmap_safe,
+ .ro_recover_mr = fmr_op_recover_mr,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
- .ro_init = fmr_op_init,
- .ro_destroy = fmr_op_destroy,
+ .ro_init_mr = fmr_op_init_mr,
+ .ro_release_mr = fmr_op_release_mr,
.ro_displayname = "fmr",
};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index c0947544babe..892b5e1d9b09 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -73,29 +73,71 @@
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
-static struct workqueue_struct *frwr_recovery_wq;
-
-#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
+bool
+frwr_is_supported(struct rpcrdma_ia *ia)
+{
+ struct ib_device_attr *attrs = &ia->ri_device->attrs;
+
+ if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+ goto out_not_supported;
+ if (attrs->max_fast_reg_page_list_len == 0)
+ goto out_not_supported;
+ return true;
+
+out_not_supported:
+ pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n",
+ ia->ri_device->name);
+ return false;
+}
-int
-frwr_alloc_recovery_wq(void)
+static int
+frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
{
- frwr_recovery_wq = alloc_workqueue("frwr_recovery",
- FRWR_RECOVERY_WQ_FLAGS, 0);
- return !frwr_recovery_wq ? -ENOMEM : 0;
+ unsigned int depth = ia->ri_max_frmr_depth;
+ struct rpcrdma_frmr *f = &r->frmr;
+ int rc;
+
+ f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth);
+ if (IS_ERR(f->fr_mr))
+ goto out_mr_err;
+
+ r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL);
+ if (!r->mw_sg)
+ goto out_list_err;
+
+ sg_init_table(r->mw_sg, depth);
+ init_completion(&f->fr_linv_done);
+ return 0;
+
+out_mr_err:
+ rc = PTR_ERR(f->fr_mr);
+ dprintk("RPC: %s: ib_alloc_mr status %i\n",
+ __func__, rc);
+ return rc;
+
+out_list_err:
+ rc = -ENOMEM;
+ dprintk("RPC: %s: sg allocation failure\n",
+ __func__);
+ ib_dereg_mr(f->fr_mr);
+ return rc;
}
-void
-frwr_destroy_recovery_wq(void)
+static void
+frwr_op_release_mr(struct rpcrdma_mw *r)
{
- struct workqueue_struct *wq;
+ int rc;
- if (!frwr_recovery_wq)
- return;
+ /* Ensure MW is not on any rl_registered list */
+ if (!list_empty(&r->mw_list))
+ list_del(&r->mw_list);
- wq = frwr_recovery_wq;
- frwr_recovery_wq = NULL;
- destroy_workqueue(wq);
+ rc = ib_dereg_mr(r->frmr.fr_mr);
+ if (rc)
+ pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n",
+ r, rc);
+ kfree(r->mw_sg);
+ kfree(r);
}
static int
@@ -124,93 +166,37 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
return 0;
}
-static void
-__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_frmr *f = &mw->frmr;
- int rc;
-
- rc = __frwr_reset_mr(ia, mw);
- ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir);
- if (rc)
- return;
-
- rpcrdma_put_mw(r_xprt, mw);
-}
-
-/* Deferred reset of a single FRMR. Generate a fresh rkey by
- * replacing the MR.
+/* Reset of a single FRMR. Generate a fresh rkey by replacing the MR.
*
* There's no recovery if this fails. The FRMR is abandoned, but
* remains in rb_all. It will be cleaned up when the transport is
* destroyed.
*/
static void
-__frwr_recovery_worker(struct work_struct *work)
-{
- struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- mw_work);
-
- __frwr_reset_and_unmap(r->mw_xprt, r);
- return;
-}
-
-/* A broken MR was discovered in a context that can't sleep.
- * Defer recovery to the recovery worker.
- */
-static void
-__frwr_queue_recovery(struct rpcrdma_mw *r)
-{
- INIT_WORK(&r->mw_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->mw_work);
-}
-
-static int
-__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
- unsigned int depth)
+frwr_op_recover_mr(struct rpcrdma_mw *mw)
{
- struct rpcrdma_frmr *f = &r->frmr;
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int rc;
- f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(f->fr_mr))
- goto out_mr_err;
-
- f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL);
- if (!f->fr_sg)
- goto out_list_err;
-
- sg_init_table(f->fr_sg, depth);
-
- init_completion(&f->fr_linv_done);
-
- return 0;
+ rc = __frwr_reset_mr(ia, mw);
+ ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (rc)
+ goto out_release;
-out_mr_err:
- rc = PTR_ERR(f->fr_mr);
- dprintk("RPC: %s: ib_alloc_mr status %i\n",
- __func__, rc);
- return rc;
+ rpcrdma_put_mw(r_xprt, mw);
+ r_xprt->rx_stats.mrs_recovered++;
+ return;
-out_list_err:
- rc = -ENOMEM;
- dprintk("RPC: %s: sg allocation failure\n",
- __func__);
- ib_dereg_mr(f->fr_mr);
- return rc;
-}
+out_release:
+ pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw);
+ r_xprt->rx_stats.mrs_orphaned++;
-static void
-__frwr_release(struct rpcrdma_mw *r)
-{
- int rc;
+ spin_lock(&r_xprt->rx_buf.rb_mwlock);
+ list_del(&mw->mw_all);
+ spin_unlock(&r_xprt->rx_buf.rb_mwlock);
- rc = ib_dereg_mr(r->frmr.fr_mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr status %i\n",
- __func__, rc);
- kfree(r->frmr.fr_sg);
+ frwr_op_release_mr(mw);
}
static int
@@ -346,57 +332,14 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
complete_all(&frmr->fr_linv_done);
}
-static int
-frwr_op_init(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
- struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- int i;
-
- spin_lock_init(&buf->rb_mwlock);
- INIT_LIST_HEAD(&buf->rb_mws);
- INIT_LIST_HEAD(&buf->rb_all);
-
- i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
- i += 2; /* head + tail */
- i *= buf->rb_max_requests; /* one set for each RPC slot */
- dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
-
- while (i--) {
- struct rpcrdma_mw *r;
- int rc;
-
- r = kzalloc(sizeof(*r), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- rc = __frwr_init(r, pd, device, depth);
- if (rc) {
- kfree(r);
- return rc;
- }
-
- r->mw_xprt = r_xprt;
- list_add(&r->mw_list, &buf->rb_mws);
- list_add(&r->mw_all, &buf->rb_all);
- }
-
- return 0;
-}
-
-/* Post a FAST_REG Work Request to register a memory region
+/* Post a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static int
frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
+ int nsegs, bool writing, struct rpcrdma_mw **out)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct ib_device *device = ia->ri_device;
- enum dma_data_direction direction = rpcrdma_data_dir(writing);
- struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
@@ -405,14 +348,13 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int rc, i, n, dma_nents;
u8 key;
- mw = seg1->rl_mw;
- seg1->rl_mw = NULL;
+ mw = NULL;
do {
if (mw)
- __frwr_queue_recovery(mw);
+ rpcrdma_defer_mr_recovery(mw);
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
- return -ENOMEM;
+ return -ENOBUFS;
} while (mw->frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->frmr;
frmr->fr_state = FRMR_IS_VALID;
@@ -421,15 +363,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth;
-
for (i = 0; i < nsegs;) {
if (seg->mr_page)
- sg_set_page(&frmr->fr_sg[i],
+ sg_set_page(&mw->mw_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
- sg_set_buf(&frmr->fr_sg[i], seg->mr_offset,
+ sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
seg->mr_len);
++seg;
@@ -440,26 +381,22 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- frmr->fr_nents = i;
- frmr->fr_dir = direction;
-
- dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction);
- if (!dma_nents) {
- pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
- __func__, frmr->fr_sg, frmr->fr_nents);
- return -ENOMEM;
- }
+ mw->mw_nents = i;
+ mw->mw_dir = rpcrdma_data_dir(writing);
+ if (i == 0)
+ goto out_dmamap_err;
- n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE);
- if (unlikely(n != frmr->fr_nents)) {
- pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
- __func__, frmr->fr_mr, n, frmr->fr_nents);
- rc = n < 0 ? n : -EINVAL;
- goto out_senderr;
- }
+ dma_nents = ib_dma_map_sg(ia->ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
+ if (!dma_nents)
+ goto out_dmamap_err;
+
+ n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE);
+ if (unlikely(n != mw->mw_nents))
+ goto out_mapmr_err;
dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
- __func__, mw, frmr->fr_nents, mr->length);
+ __func__, mw, mw->mw_nents, mr->length);
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
@@ -481,24 +418,34 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;
- seg1->rl_mw = mw;
- seg1->mr_rkey = mr->rkey;
- seg1->mr_base = mr->iova;
- seg1->mr_nsegs = frmr->fr_nents;
- seg1->mr_len = mr->length;
+ mw->mw_handle = mr->rkey;
+ mw->mw_length = mr->length;
+ mw->mw_offset = mr->iova;
+
+ *out = mw;
+ return mw->mw_nents;
- return frmr->fr_nents;
+out_dmamap_err:
+ pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
+ mw->mw_sg, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
+
+out_mapmr_err:
+ pr_err("rpcrdma: failed to map mr %p (%u/%u)\n",
+ frmr->fr_mr, n, mw->mw_nents);
+ rpcrdma_defer_mr_recovery(mw);
+ return -EIO;
out_senderr:
- dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- __frwr_queue_recovery(mw);
- return rc;
+ pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
+ rpcrdma_defer_mr_recovery(mw);
+ return -ENOTCONN;
}
static struct ib_send_wr *
-__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+__frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
{
- struct rpcrdma_mw *mw = seg->rl_mw;
struct rpcrdma_frmr *f = &mw->frmr;
struct ib_send_wr *invalidate_wr;
@@ -518,16 +465,16 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
+ *
+ * Caller ensures that req->rl_registered is not empty.
*/
static void
frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mr_seg *seg;
- unsigned int i, nchunks;
+ struct rpcrdma_mw *mw, *tmp;
struct rpcrdma_frmr *f;
- struct rpcrdma_mw *mw;
int rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -537,22 +484,18 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* Chain the LOCAL_INV Work Requests and post them with
* a single ib_post_send() call.
*/
+ f = NULL;
invalidate_wrs = pos = prev = NULL;
- seg = NULL;
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
-
- pos = __frwr_prepare_linv_wr(seg);
+ list_for_each_entry(mw, &req->rl_registered, mw_list) {
+ pos = __frwr_prepare_linv_wr(mw);
if (!invalidate_wrs)
invalidate_wrs = pos;
else
prev->next = pos;
prev = pos;
-
- i += seg->mr_nsegs;
+ f = &mw->frmr;
}
- f = &seg->rl_mw->frmr;
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
@@ -577,39 +520,27 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* them to the free MW list.
*/
unmap:
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
- seg->rl_mw = NULL;
-
- ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents,
- f->fr_dir);
+ list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ list_del_init(&mw->mw_list);
+ ib_dma_unmap_sg(ia->ri_device,
+ mw->mw_sg, mw->mw_nents, mw->mw_dir);
rpcrdma_put_mw(r_xprt, mw);
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
}
-
- req->rl_nchunks = 0;
return;
reset_mrs:
- pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+ pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc);
+ rdma_disconnect(ia->ri_id);
/* Find and reset the MRs in the LOCAL_INV WRs that did not
* get posted. This is synchronous, and slow.
*/
- for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
+ list_for_each_entry(mw, &req->rl_registered, mw_list) {
f = &mw->frmr;
-
if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) {
__frwr_reset_mr(ia, mw);
bad_wr = bad_wr->next;
}
-
- i += seg->mr_nsegs;
}
goto unmap;
}
@@ -621,38 +552,17 @@ static void
frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
bool sync)
{
- struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- unsigned int i;
- for (i = 0; req->rl_nchunks; req->rl_nchunks--) {
- seg = &req->rl_segments[i];
- mw = seg->rl_mw;
+ while (!list_empty(&req->rl_registered)) {
+ mw = list_first_entry(&req->rl_registered,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
if (sync)
- __frwr_reset_and_unmap(r_xprt, mw);
+ frwr_op_recover_mr(mw);
else
- __frwr_queue_recovery(mw);
-
- i += seg->mr_nsegs;
- seg->mr_nsegs = 0;
- seg->rl_mw = NULL;
- }
-}
-
-static void
-frwr_op_destroy(struct rpcrdma_buffer *buf)
-{
- struct rpcrdma_mw *r;
-
- /* Ensure stale MWs for "buf" are no longer in flight */
- flush_workqueue(frwr_recovery_wq);
-
- while (!list_empty(&buf->rb_all)) {
- r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
- list_del(&r->mw_all);
- __frwr_release(r);
- kfree(r);
+ rpcrdma_defer_mr_recovery(mw);
}
}
@@ -660,9 +570,10 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
.ro_unmap_sync = frwr_op_unmap_sync,
.ro_unmap_safe = frwr_op_unmap_safe,
+ .ro_recover_mr = frwr_op_recover_mr,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
- .ro_init = frwr_op_init,
- .ro_destroy = frwr_op_destroy,
+ .ro_init_mr = frwr_op_init_mr,
+ .ro_release_mr = frwr_op_release_mr,
.ro_displayname = "frwr",
};
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
deleted file mode 100644
index 3750596cc432..000000000000
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2015 Oracle. All rights reserved.
- * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
- */
-
-/* No-op chunk preparation. All client memory is pre-registered.
- * Sometimes referred to as ALLPHYSICAL mode.
- *
- * Physical registration is simple because all client memory is
- * pre-registered and never deregistered. This mode is good for
- * adapter bring up, but is considered not safe: the server is
- * trusted not to abuse its access to client memory not involved
- * in RDMA I/O.
- */
-
-#include "xprt_rdma.h"
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_TRANS
-#endif
-
-static int
-physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
- struct rpcrdma_create_data_internal *cdata)
-{
- struct ib_mr *mr;
-
- /* Obtain an rkey to use for RPC data payloads.
- */
- mr = ib_get_dma_mr(ia->ri_pd,
- IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ);
- if (IS_ERR(mr)) {
- pr_err("%s: ib_get_dma_mr for failed with %lX\n",
- __func__, PTR_ERR(mr));
- return -ENOMEM;
- }
- ia->ri_dma_mr = mr;
-
- rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int,
- RPCRDMA_MAX_DATA_SEGS,
- RPCRDMA_MAX_HDR_SEGS));
- return 0;
-}
-
-/* PHYSICAL memory registration conveys one page per chunk segment.
- */
-static size_t
-physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
-{
- return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- RPCRDMA_MAX_HDR_SEGS);
-}
-
-static int
-physical_op_init(struct rpcrdma_xprt *r_xprt)
-{
- return 0;
-}
-
-/* The client's physical memory is already exposed for
- * remote access via RDMA READ or RDMA WRITE.
- */
-static int
-physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
- int nsegs, bool writing)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
- seg->mr_rkey = ia->ri_dma_mr->rkey;
- seg->mr_base = seg->mr_dma;
- return 1;
-}
-
-/* DMA unmap all memory regions that were mapped for "req".
- */
-static void
-physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
-{
- struct ib_device *device = r_xprt->rx_ia.ri_device;
- unsigned int i;
-
- for (i = 0; req->rl_nchunks; --req->rl_nchunks)
- rpcrdma_unmap_one(device, &req->rl_segments[i++]);
-}
-
-/* Use a slow, safe mechanism to invalidate all memory regions
- * that were registered for "req".
- *
- * For physical memory registration, there is no good way to
- * fence a single MR that has been advertised to the server. The
- * client has already handed the server an R_key that cannot be
- * invalidated and is shared by all MRs on this connection.
- * Tearing down the PD might be the only safe choice, but it's
- * not clear that a freshly acquired DMA R_key would be different
- * than the one used by the PD that was just destroyed.
- * FIXME.
- */
-static void
-physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- bool sync)
-{
- physical_op_unmap_sync(r_xprt, req);
-}
-
-static void
-physical_op_destroy(struct rpcrdma_buffer *buf)
-{
-}
-
-const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
- .ro_map = physical_op_map,
- .ro_unmap_sync = physical_op_unmap_sync,
- .ro_unmap_safe = physical_op_unmap_safe,
- .ro_open = physical_op_open,
- .ro_maxpages = physical_op_maxpages,
- .ro_init = physical_op_init,
- .ro_destroy = physical_op_destroy,
- .ro_displayname = "physical",
-};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 35a81096e83d..a47f170b20ef 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -196,8 +196,7 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
* MR when they can.
*/
static int
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
- int n, int nsegs)
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
{
size_t page_offset;
u32 remaining;
@@ -206,7 +205,7 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
base = vec->iov_base;
page_offset = offset_in_page(base);
remaining = vec->iov_len;
- while (remaining && n < nsegs) {
+ while (remaining && n < RPCRDMA_MAX_SEGS) {
seg[n].mr_page = NULL;
seg[n].mr_offset = base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
@@ -230,34 +229,34 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
- enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
+ enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
{
- int len, n = 0, p;
- int page_base;
+ int len, n, p, page_base;
struct page **ppages;
+ n = 0;
if (pos == 0) {
- n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs);
- if (n == nsegs)
- return -EIO;
+ n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
+ if (n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
}
len = xdrbuf->page_len;
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = xdrbuf->page_base & ~PAGE_MASK;
p = 0;
- while (len && n < nsegs) {
+ while (len && n < RPCRDMA_MAX_SEGS) {
if (!ppages[p]) {
/* alloc the pagelist for receiving buffer */
ppages[p] = alloc_page(GFP_ATOMIC);
if (!ppages[p])
- return -ENOMEM;
+ return -EAGAIN;
}
seg[n].mr_page = ppages[p];
seg[n].mr_offset = (void *)(unsigned long) page_base;
seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
if (seg[n].mr_len > PAGE_SIZE)
- return -EIO;
+ goto out_overflow;
len -= seg[n].mr_len;
++n;
++p;
@@ -265,8 +264,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
}
/* Message overflows the seg array */
- if (len && n == nsegs)
- return -EIO;
+ if (len && n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
/* When encoding the read list, the tail is always sent inline */
if (type == rpcrdma_readch)
@@ -277,20 +276,24 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* xdr pad bytes, saving the server an RDMA operation. */
if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
return n;
- n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs);
- if (n == nsegs)
- return -EIO;
+ n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
+ if (n == RPCRDMA_MAX_SEGS)
+ goto out_overflow;
}
return n;
+
+out_overflow:
+ pr_err("rpcrdma: segment array overflow\n");
+ return -EIO;
}
static inline __be32 *
-xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg)
+xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
{
- *iptr++ = cpu_to_be32(seg->mr_rkey);
- *iptr++ = cpu_to_be32(seg->mr_len);
- return xdr_encode_hyper(iptr, seg->mr_base);
+ *iptr++ = cpu_to_be32(mw->mw_handle);
+ *iptr++ = cpu_to_be32(mw->mw_length);
+ return xdr_encode_hyper(iptr, mw->mw_offset);
}
/* XDR-encode the Read list. Supports encoding a list of read
@@ -310,7 +313,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, struct rpc_rqst *rqst,
__be32 *iptr, enum rpcrdma_chunktype rtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
unsigned int pos;
int n, nsegs;
@@ -322,15 +326,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch)
pos = 0;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ seg = req->rl_segments;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ false, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
*iptr++ = xdr_one; /* item present */
@@ -338,20 +344,17 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
* have the same "position".
*/
*iptr++ = cpu_to_be32(pos);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: read segment pos %u "
- "%d@0x%016llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__, pos,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.read_chunk_count++;
- req->rl_nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Finish Read list */
*iptr++ = xdr_zero; /* Next item not present */
@@ -375,7 +378,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpc_rqst *rqst, __be32 *iptr,
enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
int n, nsegs, nchunks;
__be32 *segcount;
@@ -384,10 +388,10 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
return iptr;
}
+ seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
- wtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -396,26 +400,25 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: write segment "
- "%d@0x016%llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.write_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks);
@@ -442,7 +445,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, struct rpc_rqst *rqst,
__be32 *iptr, enum rpcrdma_chunktype wtype)
{
- struct rpcrdma_mr_seg *seg = req->rl_nextseg;
+ struct rpcrdma_mr_seg *seg;
+ struct rpcrdma_mw *mw;
int n, nsegs, nchunks;
__be32 *segcount;
@@ -451,8 +455,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
return iptr;
}
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
- RPCRDMA_MAX_SEGS - req->rl_nchunks);
+ seg = req->rl_segments;
+ nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -461,26 +465,25 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true);
- if (n <= 0)
+ n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (n < 0)
return ERR_PTR(n);
+ list_add(&mw->mw_list, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, seg);
+ iptr = xdr_encode_rdma_segment(iptr, mw);
- dprintk("RPC: %5u %s: reply segment "
- "%d@0x%016llx:0x%08x (%s)\n",
+ dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
- seg->mr_len, (unsigned long long)seg->mr_base,
- seg->mr_rkey, n < nsegs ? "more" : "last");
+ mw->mw_length, (unsigned long long)mw->mw_offset,
+ mw->mw_handle, n < nsegs ? "more" : "last");
r_xprt->rx_stats.reply_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
- req->rl_nchunks++;
nchunks++;
seg += n;
nsegs -= n;
} while (nsegs);
- req->rl_nextseg = seg;
/* Update count of segments in the Reply chunk */
*segcount = cpu_to_be32(nchunks);
@@ -567,6 +570,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+ bool ddp_allowed;
ssize_t hdrlen;
size_t rpclen;
__be32 *iptr;
@@ -583,6 +587,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
headerp->rm_type = rdma_msg;
+ /* When the ULP employs a GSS flavor that guarantees integrity
+ * or privacy, direct data placement of individual data items
+ * is not allowed.
+ */
+ ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags &
+ RPCAUTH_AUTH_DATATOUCH);
+
/*
* Chunks needed for results?
*
@@ -594,7 +605,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
*/
if (rpcrdma_results_inline(r_xprt, rqst))
wtype = rpcrdma_noch;
- else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
else
wtype = rpcrdma_replych;
@@ -617,7 +628,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
rtype = rpcrdma_noch;
rpcrdma_inline_pullup(rqst);
rpclen = rqst->rq_svec[0].iov_len;
- } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+ } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
rpclen = rqst->rq_svec[0].iov_len;
rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
@@ -650,8 +661,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* send a Call message with a Position Zero Read chunk and a
* regular Read chunk at the same time.
*/
- req->rl_nchunks = 0;
- req->rl_nextseg = req->rl_segments;
iptr = headerp->rm_body.rm_chunks;
iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
if (IS_ERR(iptr))
@@ -690,10 +699,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
out_overflow:
pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
- /* Terminate this RPC. Chunks registered above will be
- * released by xprt_release -> xprt_rmda_free .
- */
- return -EIO;
+ iptr = ERR_PTR(-EIO);
out_unmap:
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -705,15 +711,13 @@ out_unmap:
* RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
*/
static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
+rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
{
unsigned int i, total_len;
struct rpcrdma_write_chunk *cur_wchunk;
char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
i = be32_to_cpu(**iptrp);
- if (i > max)
- return -1;
cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
total_len = 0;
while (i--) {
@@ -744,45 +748,66 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
return total_len;
}
-/*
- * Scatter inline received data back into provided iov's.
+/**
+ * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
+ * @rqst: controlling RPC request
+ * @srcp: points to RPC message payload in receive buffer
+ * @copy_len: remaining length of receive buffer content
+ * @pad: Write chunk pad bytes needed (zero for pure inline)
+ *
+ * The upper layer has set the maximum number of bytes it can
+ * receive in each component of rq_rcv_buf. These values are set in
+ * the head.iov_len, page_len, tail.iov_len, and buflen fields.
+ *
+ * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
+ * many cases this function simply updates iov_base pointers in
+ * rq_rcv_buf to point directly to the received reply data, to
+ * avoid copying reply data.
+ *
+ * Returns the count of bytes which had to be memcopied.
*/
-static void
+static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
- int i, npages, curlen, olen;
+ unsigned long fixup_copy_count;
+ int i, npages, curlen;
char *destp;
struct page **ppages;
int page_base;
+ /* The head iovec is redirected to the RPC reply message
+ * in the receive buffer, to avoid a memcopy.
+ */
+ rqst->rq_rcv_buf.head[0].iov_base = srcp;
+ rqst->rq_private_buf.head[0].iov_base = srcp;
+
+ /* The contents of the receive buffer that follow
+ * head.iov_len bytes are copied into the page list.
+ */
curlen = rqst->rq_rcv_buf.head[0].iov_len;
- if (curlen > copy_len) { /* write chunk header fixup */
+ if (curlen > copy_len)
curlen = copy_len;
- rqst->rq_rcv_buf.head[0].iov_len = curlen;
- }
-
dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
__func__, srcp, copy_len, curlen);
-
- /* Shift pointer for first receive segment only */
- rqst->rq_rcv_buf.head[0].iov_base = srcp;
srcp += curlen;
copy_len -= curlen;
- olen = copy_len;
- i = 0;
- rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
page_base = rqst->rq_rcv_buf.page_base;
ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
page_base &= ~PAGE_MASK;
-
+ fixup_copy_count = 0;
if (copy_len && rqst->rq_rcv_buf.page_len) {
- npages = PAGE_ALIGN(page_base +
- rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
- for (; i < npages; i++) {
+ int pagelist_len;
+
+ pagelist_len = rqst->rq_rcv_buf.page_len;
+ if (pagelist_len > copy_len)
+ pagelist_len = copy_len;
+ npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
+ for (i = 0; i < npages; i++) {
curlen = PAGE_SIZE - page_base;
- if (curlen > copy_len)
- curlen = copy_len;
+ if (curlen > pagelist_len)
+ curlen = pagelist_len;
+
dprintk("RPC: %s: page %d"
" srcp 0x%p len %d curlen %d\n",
__func__, i, srcp, copy_len, curlen);
@@ -792,39 +817,32 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
kunmap_atomic(destp);
srcp += curlen;
copy_len -= curlen;
- if (copy_len == 0)
+ fixup_copy_count += curlen;
+ pagelist_len -= curlen;
+ if (!pagelist_len)
break;
page_base = 0;
}
- }
- if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
- curlen = copy_len;
- if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
- curlen = rqst->rq_rcv_buf.tail[0].iov_len;
- if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
- memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
- dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
- __func__, srcp, copy_len, curlen);
- rqst->rq_rcv_buf.tail[0].iov_len = curlen;
- copy_len -= curlen; ++i;
- } else
- rqst->rq_rcv_buf.tail[0].iov_len = 0;
-
- if (pad) {
- /* implicit padding on terminal chunk */
- unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
- while (pad--)
- p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+ /* Implicit padding for the last segment in a Write
+ * chunk is inserted inline at the front of the tail
+ * iovec. The upper layer ignores the content of
+ * the pad. Simply ensure inline content in the tail
+ * that follows the Write chunk is properly aligned.
+ */
+ if (pad)
+ srcp -= pad;
}
- if (copy_len)
- dprintk("RPC: %s: %d bytes in"
- " %d extra segments (%d lost)\n",
- __func__, olen, i, copy_len);
+ /* The tail iovec is redirected to the remaining data
+ * in the receive buffer, to avoid a memcopy.
+ */
+ if (copy_len || pad) {
+ rqst->rq_rcv_buf.tail[0].iov_base = srcp;
+ rqst->rq_private_buf.tail[0].iov_base = srcp;
+ }
- /* TBD avoid a warning from call_decode() */
- rqst->rq_private_buf = rqst->rq_rcv_buf;
+ return fixup_copy_count;
}
void
@@ -960,14 +978,13 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
(headerp->rm_body.rm_chunks[1] == xdr_zero &&
headerp->rm_body.rm_chunks[2] != xdr_zero) ||
(headerp->rm_body.rm_chunks[1] != xdr_zero &&
- req->rl_nchunks == 0))
+ list_empty(&req->rl_registered)))
goto badheader;
if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
/* count any expected write chunks in read reply */
/* start at write chunk array count */
iptr = &headerp->rm_body.rm_chunks[2];
- rdmalen = rpcrdma_count_chunks(rep,
- req->rl_nchunks, 1, &iptr);
+ rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
/* check for validity, and no reply chunk after */
if (rdmalen < 0 || *iptr++ != xdr_zero)
goto badheader;
@@ -988,8 +1005,10 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
rep->rr_len -= RPCRDMA_HDRLEN_MIN;
status = rep->rr_len;
}
- /* Fix up the rpc results for upper layer */
- rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
+
+ r_xprt->rx_stats.fixup_copy_count +=
+ rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
+ rdmalen);
break;
case rdma_nomsg:
@@ -997,11 +1016,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
headerp->rm_body.rm_chunks[1] != xdr_zero ||
headerp->rm_body.rm_chunks[2] != xdr_one ||
- req->rl_nchunks == 0)
+ list_empty(&req->rl_registered))
goto badheader;
iptr = (__be32 *)((unsigned char *)headerp +
RPCRDMA_HDRLEN_MIN);
- rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
+ rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
if (rdmalen < 0)
goto badheader;
r_xprt->rx_stats.total_rdma_reply += rdmalen;
@@ -1014,14 +1033,9 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
badheader:
default:
- dprintk("%s: invalid rpcrdma reply header (type %d):"
- " chunks[012] == %d %d %d"
- " expected chunks <= %d\n",
- __func__, be32_to_cpu(headerp->rm_type),
- headerp->rm_body.rm_chunks[0],
- headerp->rm_body.rm_chunks[1],
- headerp->rm_body.rm_chunks[2],
- req->rl_nchunks);
+ dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
+ rqst->rq_task->tk_pid, __func__,
+ be32_to_cpu(headerp->rm_type));
status = -EIO;
r_xprt->rx_stats.bad_reply_count++;
break;
@@ -1035,7 +1049,7 @@ out:
* control: waking the next RPC waits until this RPC has
* relinquished all its Send Queue entries.
*/
- if (req->rl_nchunks)
+ if (!list_empty(&req->rl_registered))
r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
spin_lock_bh(&xprt->transport_lock);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 99d2e5b72726..81f0e879f019 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -558,7 +558,6 @@ out_sendbuf:
out_fail:
rpcrdma_buffer_put(req);
- r_xprt->rx_stats.failed_marshal_count++;
return NULL;
}
@@ -590,8 +589,19 @@ xprt_rdma_free(void *buffer)
rpcrdma_buffer_put(req);
}
-/*
+/**
+ * xprt_rdma_send_request - marshal and send an RPC request
+ * @task: RPC task with an RPC message in rq_snd_buf
+ *
+ * Return values:
+ * 0: The request has been sent
+ * ENOTCONN: Caller needs to invoke connect logic then call again
+ * ENOBUFS: Call again later to send the request
+ * EIO: A permanent error occurred. The request was not sent,
+ * and don't try it again
+ *
* send_request invokes the meat of RPC RDMA. It must do the following:
+ *
* 1. Marshal the RPC request into an RPC RDMA request, which means
* putting a header in front of data, and creating IOVs for RDMA
* from those in the request.
@@ -600,7 +610,6 @@ xprt_rdma_free(void *buffer)
* the request (rpcrdma_ep_post).
* 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
*/
-
static int
xprt_rdma_send_request(struct rpc_task *task)
{
@@ -610,6 +619,9 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0;
+ /* On retransmit, remove any previously registered chunks */
+ r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+
rc = rpcrdma_marshal_req(rqst);
if (rc < 0)
goto failed_marshal;
@@ -630,11 +642,12 @@ xprt_rdma_send_request(struct rpc_task *task)
return 0;
failed_marshal:
- r_xprt->rx_stats.failed_marshal_count++;
dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
__func__, rc);
if (rc == -EIO)
- return -EIO;
+ r_xprt->rx_stats.failed_marshal_count++;
+ if (rc != -ENOTCONN)
+ return rc;
drop_connection:
xprt_disconnect_done(xprt);
return -ENOTCONN; /* implies disconnect */
@@ -660,7 +673,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
xprt->stat.bad_xids,
xprt->stat.req_u,
xprt->stat.bklog_u);
- seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
r_xprt->rx_stats.read_chunk_count,
r_xprt->rx_stats.write_chunk_count,
r_xprt->rx_stats.reply_chunk_count,
@@ -672,6 +685,10 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.failed_marshal_count,
r_xprt->rx_stats.bad_reply_count,
r_xprt->rx_stats.nomsg_call_count);
+ seq_printf(seq, "%lu %lu %lu\n",
+ r_xprt->rx_stats.mrs_recovered,
+ r_xprt->rx_stats.mrs_orphaned,
+ r_xprt->rx_stats.mrs_allocated);
}
static int
@@ -741,7 +758,6 @@ void xprt_rdma_cleanup(void)
__func__, rc);
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
rc = xprt_unregister_transport(&xprt_rdma_bc);
if (rc)
@@ -753,20 +769,13 @@ int xprt_rdma_init(void)
{
int rc;
- rc = frwr_alloc_recovery_wq();
- if (rc)
- return rc;
-
rc = rpcrdma_alloc_wq();
- if (rc) {
- frwr_destroy_recovery_wq();
+ if (rc)
return rc;
- }
rc = xprt_register_transport(&xprt_rdma);
if (rc) {
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
return rc;
}
@@ -774,7 +783,6 @@ int xprt_rdma_init(void)
if (rc) {
xprt_unregister_transport(&xprt_rdma);
rpcrdma_destroy_wq();
- frwr_destroy_recovery_wq();
return rc;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b044d98a1370..536d0be3f61b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -379,8 +379,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
struct rpcrdma_ia *ia = &xprt->rx_ia;
int rc;
- ia->ri_dma_mr = NULL;
-
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
@@ -391,47 +389,29 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_pd = ib_alloc_pd(ia->ri_device);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
- dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
- __func__, rc);
+ pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
goto out2;
}
- if (memreg == RPCRDMA_FRMR) {
- if (!(ia->ri_device->attrs.device_cap_flags &
- IB_DEVICE_MEM_MGT_EXTENSIONS) ||
- (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
- dprintk("RPC: %s: FRMR registration "
- "not supported by HCA\n", __func__);
- memreg = RPCRDMA_MTHCAFMR;
- }
- }
- if (memreg == RPCRDMA_MTHCAFMR) {
- if (!ia->ri_device->alloc_fmr) {
- dprintk("RPC: %s: MTHCAFMR registration "
- "not supported by HCA\n", __func__);
- rc = -EINVAL;
- goto out3;
- }
- }
-
switch (memreg) {
case RPCRDMA_FRMR:
- ia->ri_ops = &rpcrdma_frwr_memreg_ops;
- break;
- case RPCRDMA_ALLPHYSICAL:
- ia->ri_ops = &rpcrdma_physical_memreg_ops;
- break;
+ if (frwr_is_supported(ia)) {
+ ia->ri_ops = &rpcrdma_frwr_memreg_ops;
+ break;
+ }
+ /*FALLTHROUGH*/
case RPCRDMA_MTHCAFMR:
- ia->ri_ops = &rpcrdma_fmr_memreg_ops;
- break;
+ if (fmr_is_supported(ia)) {
+ ia->ri_ops = &rpcrdma_fmr_memreg_ops;
+ break;
+ }
+ /*FALLTHROUGH*/
default:
- printk(KERN_ERR "RPC: Unsupported memory "
- "registration mode: %d\n", memreg);
- rc = -ENOMEM;
+ pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
+ memreg);
+ rc = -EINVAL;
goto out3;
}
- dprintk("RPC: %s: memory registration strategy is '%s'\n",
- __func__, ia->ri_ops->ro_displayname);
return 0;
@@ -585,8 +565,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
out2:
ib_free_cq(sendcq);
out1:
- if (ia->ri_dma_mr)
- ib_dereg_mr(ia->ri_dma_mr);
return rc;
}
@@ -600,8 +578,6 @@ out1:
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
- int rc;
-
dprintk("RPC: %s: entering, connected is %d\n",
__func__, ep->rep_connected);
@@ -615,12 +591,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_free_cq(ep->rep_attr.recv_cq);
ib_free_cq(ep->rep_attr.send_cq);
-
- if (ia->ri_dma_mr) {
- rc = ib_dereg_mr(ia->ri_dma_mr);
- dprintk("RPC: %s: ib_dereg_mr returned %i\n",
- __func__, rc);
- }
}
/*
@@ -777,6 +747,90 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_drain_qp(ia->ri_id->qp);
}
+static void
+rpcrdma_mr_recovery_worker(struct work_struct *work)
+{
+ struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+ rb_recovery_worker.work);
+ struct rpcrdma_mw *mw;
+
+ spin_lock(&buf->rb_recovery_lock);
+ while (!list_empty(&buf->rb_stale_mrs)) {
+ mw = list_first_entry(&buf->rb_stale_mrs,
+ struct rpcrdma_mw, mw_list);
+ list_del_init(&mw->mw_list);
+ spin_unlock(&buf->rb_recovery_lock);
+
+ dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
+ mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw);
+
+ spin_lock(&buf->rb_recovery_lock);
+ }
+ spin_unlock(&buf->rb_recovery_lock);
+}
+
+void
+rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
+{
+ struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+
+ spin_lock(&buf->rb_recovery_lock);
+ list_add(&mw->mw_list, &buf->rb_stale_mrs);
+ spin_unlock(&buf->rb_recovery_lock);
+
+ schedule_delayed_work(&buf->rb_recovery_worker, 0);
+}
+
+static void
+rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ unsigned int count;
+ LIST_HEAD(free);
+ LIST_HEAD(all);
+
+ for (count = 0; count < 32; count++) {
+ struct rpcrdma_mw *mw;
+ int rc;
+
+ mw = kzalloc(sizeof(*mw), GFP_KERNEL);
+ if (!mw)
+ break;
+
+ rc = ia->ri_ops->ro_init_mr(ia, mw);
+ if (rc) {
+ kfree(mw);
+ break;
+ }
+
+ mw->mw_xprt = r_xprt;
+
+ list_add(&mw->mw_list, &free);
+ list_add(&mw->mw_all, &all);
+ }
+
+ spin_lock(&buf->rb_mwlock);
+ list_splice(&free, &buf->rb_mws);
+ list_splice(&all, &buf->rb_all);
+ r_xprt->rx_stats.mrs_allocated += count;
+ spin_unlock(&buf->rb_mwlock);
+
+ dprintk("RPC: %s: created %u MRs\n", __func__, count);
+}
+
+static void
+rpcrdma_mr_refresh_worker(struct work_struct *work)
+{
+ struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+ rb_refresh_worker.work);
+ struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+ rx_buf);
+
+ rpcrdma_create_mrs(r_xprt);
+}
+
struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
@@ -793,6 +847,7 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
spin_unlock(&buffer->rb_reqslock);
req->rl_cqe.done = rpcrdma_wc_send;
req->rl_buffer = &r_xprt->rx_buf;
+ INIT_LIST_HEAD(&req->rl_registered);
return req;
}
@@ -832,17 +887,23 @@ int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
int i, rc;
buf->rb_max_requests = r_xprt->rx_data.max_requests;
buf->rb_bc_srv_max_requests = 0;
- spin_lock_init(&buf->rb_lock);
atomic_set(&buf->rb_credits, 1);
+ spin_lock_init(&buf->rb_mwlock);
+ spin_lock_init(&buf->rb_lock);
+ spin_lock_init(&buf->rb_recovery_lock);
+ INIT_LIST_HEAD(&buf->rb_mws);
+ INIT_LIST_HEAD(&buf->rb_all);
+ INIT_LIST_HEAD(&buf->rb_stale_mrs);
+ INIT_DELAYED_WORK(&buf->rb_refresh_worker,
+ rpcrdma_mr_refresh_worker);
+ INIT_DELAYED_WORK(&buf->rb_recovery_worker,
+ rpcrdma_mr_recovery_worker);
- rc = ia->ri_ops->ro_init(r_xprt);
- if (rc)
- goto out;
+ rpcrdma_create_mrs(r_xprt);
INIT_LIST_HEAD(&buf->rb_send_bufs);
INIT_LIST_HEAD(&buf->rb_allreqs);
@@ -862,7 +923,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
}
INIT_LIST_HEAD(&buf->rb_recv_bufs);
- for (i = 0; i < buf->rb_max_requests + 2; i++) {
+ for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt);
@@ -918,11 +979,39 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
kfree(req);
}
+static void
+rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+ rx_buf);
+ struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ struct rpcrdma_mw *mw;
+ unsigned int count;
+
+ count = 0;
+ spin_lock(&buf->rb_mwlock);
+ while (!list_empty(&buf->rb_all)) {
+ mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+ list_del(&mw->mw_all);
+
+ spin_unlock(&buf->rb_mwlock);
+ ia->ri_ops->ro_release_mr(mw);
+ count++;
+ spin_lock(&buf->rb_mwlock);
+ }
+ spin_unlock(&buf->rb_mwlock);
+ r_xprt->rx_stats.mrs_allocated = 0;
+
+ dprintk("RPC: %s: released %u MRs\n", __func__, count);
+}
+
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
+ cancel_delayed_work_sync(&buf->rb_recovery_worker);
+
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
@@ -944,7 +1033,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
}
spin_unlock(&buf->rb_reqslock);
- ia->ri_ops->ro_destroy(buf);
+ rpcrdma_destroy_mrs(buf);
}
struct rpcrdma_mw *
@@ -962,8 +1051,17 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
spin_unlock(&buf->rb_mwlock);
if (!mw)
- pr_err("RPC: %s: no MWs available\n", __func__);
+ goto out_nomws;
return mw;
+
+out_nomws:
+ dprintk("RPC: %s: no MWs available\n", __func__);
+ schedule_delayed_work(&buf->rb_refresh_worker, 0);
+
+ /* Allow the reply handler and refresh worker to run */
+ cond_resched();
+
+ return NULL;
}
void
@@ -978,8 +1076,6 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
/*
* Get a set of request/reply buffers.
- *
- * Reply buffer (if available) is attached to send buffer upon return.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -998,13 +1094,13 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
out_reqbuf:
spin_unlock(&buffers->rb_lock);
- pr_warn("RPC: %s: out of request buffers\n", __func__);
+ pr_warn("rpcrdma: out of request buffers (%p)\n", buffers);
return NULL;
out_repbuf:
+ list_add(&req->rl_free, &buffers->rb_send_bufs);
spin_unlock(&buffers->rb_lock);
- pr_warn("RPC: %s: out of reply buffers\n", __func__);
- req->rl_reply = NULL;
- return req;
+ pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers);
+ return NULL;
}
/*
@@ -1060,14 +1156,6 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
* Wrappers for internal-use kmalloc memory registration, used by buffer code.
*/
-void
-rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
-{
- dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
- seg->mr_offset,
- (unsigned long long)seg->mr_dma, seg->mr_dmalen);
-}
-
/**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia
@@ -1150,7 +1238,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep);
if (rc)
- goto out;
+ return rc;
req->rl_reply = NULL;
}
@@ -1175,10 +1263,12 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
if (rc)
- dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
- rc);
-out:
- return rc;
+ goto out_postsend_err;
+ return 0;
+
+out_postsend_err:
+ pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc);
+ return -ENOTCONN;
}
/*
@@ -1203,11 +1293,13 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
DMA_BIDIRECTIONAL);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
-
if (rc)
- dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
- rc);
- return rc;
+ goto out_postrecv;
+ return 0;
+
+out_postrecv:
+ pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
+ return -ENOTCONN;
}
/**
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 95cdc66225ee..670fad57153a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -68,7 +68,6 @@ struct rpcrdma_ia {
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
- struct ib_mr *ri_dma_mr;
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
@@ -172,23 +171,14 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
* o recv buffer (posted to provider)
* o ib_sge (also donated to provider)
* o status of reply (length, success or not)
- * o bookkeeping state to get run by tasklet (list, etc)
+ * o bookkeeping state to get run by reply handler (list, etc)
*
- * These are allocated during initialization, per-transport instance;
- * however, the tasklet execution list itself is global, as it should
- * always be pretty short.
+ * These are allocated during initialization, per-transport instance.
*
* N of these are associated with a transport instance, and stored in
* struct rpcrdma_buffer. N is the max number of outstanding requests.
*/
-#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
-
-/* data segments + head/tail for Call + head/tail for Reply */
-#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4)
-
-struct rpcrdma_buffer;
-
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
unsigned int rr_len;
@@ -221,9 +211,6 @@ enum rpcrdma_frmr_state {
};
struct rpcrdma_frmr {
- struct scatterlist *fr_sg;
- int fr_nents;
- enum dma_data_direction fr_dir;
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;
@@ -235,18 +222,23 @@ struct rpcrdma_frmr {
};
struct rpcrdma_fmr {
- struct ib_fmr *fmr;
- u64 *physaddrs;
+ struct ib_fmr *fm_mr;
+ u64 *fm_physaddrs;
};
struct rpcrdma_mw {
+ struct list_head mw_list;
+ struct scatterlist *mw_sg;
+ int mw_nents;
+ enum dma_data_direction mw_dir;
union {
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
};
- struct work_struct mw_work;
struct rpcrdma_xprt *mw_xprt;
- struct list_head mw_list;
+ u32 mw_handle;
+ u32 mw_length;
+ u64 mw_offset;
struct list_head mw_all;
};
@@ -266,33 +258,30 @@ struct rpcrdma_mw {
* of iovs for send operations. The reason is that the iovs passed to
* ib_post_{send,recv} must not be modified until the work request
* completes.
- *
- * NOTES:
- * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we
- * marshal. The number needed varies depending on the iov lists that
- * are passed to us, the memory registration mode we are in, and if
- * physical addressing is used, the layout.
*/
+/* Maximum number of page-sized "segments" per chunk list to be
+ * registered or invalidated. Must handle a Reply chunk:
+ */
+enum {
+ RPCRDMA_MAX_IOV_SEGS = 3,
+ RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
+ RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS +
+ RPCRDMA_MAX_IOV_SEGS,
+};
+
struct rpcrdma_mr_seg { /* chunk descriptors */
- struct rpcrdma_mw *rl_mw; /* registered MR */
- u64 mr_base; /* registration result */
- u32 mr_rkey; /* registration result */
u32 mr_len; /* length of chunk or segment */
- int mr_nsegs; /* number of segments in chunk or 0 */
- enum dma_data_direction mr_dir; /* segment mapping direction */
- dma_addr_t mr_dma; /* segment mapping address */
- size_t mr_dmalen; /* segment mapping length */
struct page *mr_page; /* owning page, if any */
char *mr_offset; /* kva if no page, else offset */
};
#define RPCRDMA_MAX_IOVS (2)
+struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_free;
unsigned int rl_niovs;
- unsigned int rl_nchunks;
unsigned int rl_connect_cookie;
struct rpc_task *rl_task;
struct rpcrdma_buffer *rl_buffer;
@@ -300,12 +289,13 @@ struct rpcrdma_req {
struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
- struct rpcrdma_mr_seg *rl_nextseg;
struct ib_cqe rl_cqe;
struct list_head rl_all;
bool rl_backchannel;
+
+ struct list_head rl_registered; /* registered segments */
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
static inline struct rpcrdma_req *
@@ -341,6 +331,11 @@ struct rpcrdma_buffer {
struct list_head rb_allreqs;
u32 rb_bc_max_requests;
+
+ spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */
+ struct list_head rb_stale_mrs;
+ struct delayed_work rb_recovery_worker;
+ struct delayed_work rb_refresh_worker;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -387,6 +382,9 @@ struct rpcrdma_stats {
unsigned long bad_reply_count;
unsigned long nomsg_call_count;
unsigned long bcall_count;
+ unsigned long mrs_recovered;
+ unsigned long mrs_orphaned;
+ unsigned long mrs_allocated;
};
/*
@@ -395,23 +393,25 @@ struct rpcrdma_stats {
struct rpcrdma_xprt;
struct rpcrdma_memreg_ops {
int (*ro_map)(struct rpcrdma_xprt *,
- struct rpcrdma_mr_seg *, int, bool);
+ struct rpcrdma_mr_seg *, int, bool,
+ struct rpcrdma_mw **);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
struct rpcrdma_req *);
void (*ro_unmap_safe)(struct rpcrdma_xprt *,
struct rpcrdma_req *, bool);
+ void (*ro_recover_mr)(struct rpcrdma_mw *);
int (*ro_open)(struct rpcrdma_ia *,
struct rpcrdma_ep *,
struct rpcrdma_create_data_internal *);
size_t (*ro_maxpages)(struct rpcrdma_xprt *);
- int (*ro_init)(struct rpcrdma_xprt *);
- void (*ro_destroy)(struct rpcrdma_buffer *);
+ int (*ro_init_mr)(struct rpcrdma_ia *,
+ struct rpcrdma_mw *);
+ void (*ro_release_mr)(struct rpcrdma_mw *);
const char *ro_displayname;
};
extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
-extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
/*
* RPCRDMA transport -- encapsulates the structures above for
@@ -446,6 +446,8 @@ extern int xprt_rdma_pad_optimize;
*/
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);
+bool frwr_is_supported(struct rpcrdma_ia *);
+bool fmr_is_supported(struct rpcrdma_ia *);
/*
* Endpoint calls - xprtrdma/verbs.c
@@ -477,6 +479,8 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
+
struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
size_t, gfp_t);
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
@@ -484,9 +488,6 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
-int frwr_alloc_recovery_wq(void);
-void frwr_destroy_recovery_wq(void);
-
int rpcrdma_alloc_wq(void);
void rpcrdma_destroy_wq(void);
@@ -494,45 +495,12 @@ void rpcrdma_destroy_wq(void);
* Wrappers for chunk registration, shared by read/write chunk code.
*/
-void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
-
static inline enum dma_data_direction
rpcrdma_data_dir(bool writing)
{
return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
}
-static inline void
-rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
- enum dma_data_direction direction)
-{
- seg->mr_dir = direction;
- seg->mr_dmalen = seg->mr_len;
-
- if (seg->mr_page)
- seg->mr_dma = ib_dma_map_page(device,
- seg->mr_page, offset_in_page(seg->mr_offset),
- seg->mr_dmalen, seg->mr_dir);
- else
- seg->mr_dma = ib_dma_map_single(device,
- seg->mr_offset,
- seg->mr_dmalen, seg->mr_dir);
-
- if (ib_dma_mapping_error(device, seg->mr_dma))
- rpcrdma_mapping_error(seg);
-}
-
-static inline void
-rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
-{
- if (seg->mr_page)
- ib_dma_unmap_page(device,
- seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
- else
- ib_dma_unmap_single(device,
- seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-}
-
/*
* RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
*/
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7e2b2fa189c3..111767ab124a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -124,7 +124,7 @@ static struct ctl_table xs_tunables_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &xprt_min_resvport_limit,
- .extra2 = &xprt_max_resvport_limit
+ .extra2 = &xprt_max_resvport
},
{
.procname = "max_resvport",
@@ -132,7 +132,7 @@ static struct ctl_table xs_tunables_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &xprt_min_resvport_limit,
+ .extra1 = &xprt_min_resvport,
.extra2 = &xprt_max_resvport_limit
},
{
@@ -642,6 +642,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
bool zerocopy = true;
+ bool vm_wait = false;
int status;
int sent;
@@ -677,15 +678,33 @@ static int xs_tcp_send_request(struct rpc_task *task)
return 0;
}
+ WARN_ON_ONCE(sent == 0 && status == 0);
+
+ if (status == -EAGAIN ) {
+ /*
+ * Return EAGAIN if we're sure we're hitting the
+ * socket send buffer limits.
+ */
+ if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
+ break;
+ /*
+ * Did we hit a memory allocation failure?
+ */
+ if (sent == 0) {
+ status = -ENOBUFS;
+ if (vm_wait)
+ break;
+ /* Retry, knowing now that we're below the
+ * socket send buffer limit
+ */
+ vm_wait = true;
+ }
+ continue;
+ }
if (status < 0)
break;
- if (sent == 0) {
- status = -EAGAIN;
- break;
- }
+ vm_wait = false;
}
- if (status == -EAGAIN && sk_stream_is_writeable(transport->inet))
- status = -ENOBUFS;
switch (status) {
case -ENOTSOCK:
@@ -755,11 +774,19 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
sk->sk_error_report = transport->old_error_report;
}
+static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+}
+
static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
{
smp_mb__before_atomic();
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
clear_bit(XPRT_CLOSING, &xprt->state);
+ xs_sock_reset_state_flags(xprt);
smp_mb__after_atomic();
}
@@ -962,10 +989,13 @@ static void xs_local_data_receive(struct sock_xprt *transport)
goto out;
for (;;) {
skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb == NULL)
+ if (skb != NULL) {
+ xs_local_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ continue;
+ }
+ if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break;
- xs_local_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
}
out:
mutex_unlock(&transport->recv_mutex);
@@ -1043,10 +1073,13 @@ static void xs_udp_data_receive(struct sock_xprt *transport)
goto out;
for (;;) {
skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb == NULL)
+ if (skb != NULL) {
+ xs_udp_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ continue;
+ }
+ if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
break;
- xs_udp_data_read_skb(&transport->xprt, sk, skb);
- skb_free_datagram(sk, skb);
}
out:
mutex_unlock(&transport->recv_mutex);
@@ -1074,7 +1107,14 @@ static void xs_data_ready(struct sock *sk)
if (xprt != NULL) {
struct sock_xprt *transport = container_of(xprt,
struct sock_xprt, xprt);
- queue_work(rpciod_workqueue, &transport->recv_worker);
+ transport->old_data_ready(sk);
+ /* Any data means we had a useful conversation, so
+ * then we don't need to delay the next reconnect
+ */
+ if (xprt->reestablish_timeout)
+ xprt->reestablish_timeout = 0;
+ if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ queue_work(xprtiod_workqueue, &transport->recv_worker);
}
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1474,10 +1514,15 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
for (;;) {
lock_sock(sk);
read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
- release_sock(sk);
- if (read <= 0)
- break;
- total += read;
+ if (read <= 0) {
+ clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+ release_sock(sk);
+ if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ break;
+ } else {
+ release_sock(sk);
+ total += read;
+ }
rd_desc.count = 65536;
}
out:
@@ -1493,34 +1538,6 @@ static void xs_tcp_data_receive_workfn(struct work_struct *work)
}
/**
- * xs_tcp_data_ready - "data ready" callback for TCP sockets
- * @sk: socket with data to read
- *
- */
-static void xs_tcp_data_ready(struct sock *sk)
-{
- struct sock_xprt *transport;
- struct rpc_xprt *xprt;
-
- dprintk("RPC: xs_tcp_data_ready...\n");
-
- read_lock_bh(&sk->sk_callback_lock);
- if (!(xprt = xprt_from_sock(sk)))
- goto out;
- transport = container_of(xprt, struct sock_xprt, xprt);
-
- /* Any data means we had a useful conversation, so
- * the we don't need to delay the next reconnect
- */
- if (xprt->reestablish_timeout)
- xprt->reestablish_timeout = 0;
- queue_work(rpciod_workqueue, &transport->recv_worker);
-
-out:
- read_unlock_bh(&sk->sk_callback_lock);
-}
-
-/**
* xs_tcp_state_change - callback to handle TCP socket state changes
* @sk: socket whose state has changed
*
@@ -1714,7 +1731,7 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
static unsigned short xs_get_random_port(void)
{
- unsigned short range = xprt_max_resvport - xprt_min_resvport;
+ unsigned short range = xprt_max_resvport - xprt_min_resvport + 1;
unsigned short rand = (unsigned short) prandom_u32() % range;
return rand + xprt_min_resvport;
}
@@ -2241,7 +2258,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
- sk->sk_data_ready = xs_tcp_data_ready;
+ sk->sk_data_ready = xs_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
sock_set_flag(sk, SOCK_FASYNC);
@@ -2380,7 +2397,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
/* Start by resetting any existing state */
xs_reset_transport(transport);
- queue_delayed_work(rpciod_workqueue,
+ queue_delayed_work(xprtiod_workqueue,
&transport->connect_worker,
xprt->reestablish_timeout);
xprt->reestablish_timeout <<= 1;
@@ -2390,7 +2407,7 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
} else {
dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
- queue_delayed_work(rpciod_workqueue,
+ queue_delayed_work(xprtiod_workqueue,
&transport->connect_worker, 0);
}
}
@@ -3153,8 +3170,12 @@ static int param_set_uint_minmax(const char *val,
static int param_set_portnr(const char *val, const struct kernel_param *kp)
{
- return param_set_uint_minmax(val, kp,
+ if (kp->arg == &xprt_min_resvport)
+ return param_set_uint_minmax(val, kp,
RPC_MIN_RESVPORT,
+ xprt_max_resvport);
+ return param_set_uint_minmax(val, kp,
+ xprt_min_resvport,
RPC_MAX_RESVPORT);
}
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index ed98c1fc3de1..46a71c701e7c 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -46,7 +46,7 @@ static int net_ctl_permissions(struct ctl_table_header *head,
kgid_t root_gid = make_kgid(net->user_ns, 0);
/* Allow network administrator to have same access as root. */
- if (ns_capable(net->user_ns, CAP_NET_ADMIN) ||
+ if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN) ||
uid_eq(root_uid, current_euid())) {
int mode = (table->mode >> 6) & 7;
return (mode << 6) | (mode << 3) | mode;
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index be70a57c1ff9..b62caa1c770c 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -794,10 +794,10 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
return 0;
attr_msg_full:
+ read_unlock_bh(&mon->lock);
nla_nest_cancel(msg->skb, attrs);
msg_full:
genlmsg_cancel(msg->skb, hdr);
- read_unlock_bh(&mon->lock);
return -EMSGSIZE;
}
diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig
index 14810abedc2e..8831e7c42167 100644
--- a/net/vmw_vsock/Kconfig
+++ b/net/vmw_vsock/Kconfig
@@ -26,3 +26,23 @@ config VMWARE_VMCI_VSOCKETS
To compile this driver as a module, choose M here: the module
will be called vmw_vsock_vmci_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS
+ tristate "virtio transport for Virtual Sockets"
+ depends on VSOCKETS && VIRTIO
+ select VIRTIO_VSOCKETS_COMMON
+ help
+ This module implements a virtio transport for Virtual Sockets.
+
+ Enable this transport if your Virtual Machine host supports Virtual
+ Sockets over virtio.
+
+ To compile this driver as a module, choose M here: the module will be
+ called vmw_vsock_virtio_transport. If unsure, say N.
+
+config VIRTIO_VSOCKETS_COMMON
+ tristate
+ help
+ This option is selected by any driver which needs to access
+ the virtio_vsock. The module will be called
+ vmw_vsock_virtio_transport_common.
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 2ce52d70f224..bc27c70e0e59 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -1,7 +1,13 @@
obj-$(CONFIG_VSOCKETS) += vsock.o
obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o
+obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o
vsock-y += af_vsock.o vsock_addr.o
vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \
vmci_transport_notify_qstate.o
+
+vmw_vsock_virtio_transport-y += virtio_transport.o
+
+vmw_vsock_virtio_transport_common-y += virtio_transport_common.o
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index b96ac918e0ba..17dbbe64cd73 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -344,6 +344,16 @@ static bool vsock_in_connected_table(struct vsock_sock *vsk)
return ret;
}
+void vsock_remove_sock(struct vsock_sock *vsk)
+{
+ if (vsock_in_bound_table(vsk))
+ vsock_remove_bound(vsk);
+
+ if (vsock_in_connected_table(vsk))
+ vsock_remove_connected(vsk);
+}
+EXPORT_SYMBOL_GPL(vsock_remove_sock);
+
void vsock_for_each_connected_socket(void (*fn)(struct sock *sk))
{
int i;
@@ -660,12 +670,6 @@ static void __vsock_release(struct sock *sk)
vsk = vsock_sk(sk);
pending = NULL; /* Compiler warning. */
- if (vsock_in_bound_table(vsk))
- vsock_remove_bound(vsk);
-
- if (vsock_in_connected_table(vsk))
- vsock_remove_connected(vsk);
-
transport->release(vsk);
lock_sock(sk);
@@ -1995,6 +1999,15 @@ void vsock_core_exit(void)
}
EXPORT_SYMBOL_GPL(vsock_core_exit);
+const struct vsock_transport *vsock_core_get_transport(void)
+{
+ /* vsock_register_mutex not taken since only the transport uses this
+ * function and only while registered.
+ */
+ return transport;
+}
+EXPORT_SYMBOL_GPL(vsock_core_get_transport);
+
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Virtual Socket Family");
MODULE_VERSION("1.0.1.0-k");
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
new file mode 100644
index 000000000000..699dfabdbccd
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport.c
@@ -0,0 +1,624 @@
+/*
+ * virtio transport for vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * Some of the code is take from Gerd Hoffmann <kraxel@redhat.com>'s
+ * early virtio-vsock proof-of-concept bits.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+#include <net/sock.h>
+#include <linux/mutex.h>
+#include <net/af_vsock.h>
+
+static struct workqueue_struct *virtio_vsock_workqueue;
+static struct virtio_vsock *the_virtio_vsock;
+static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */
+
+struct virtio_vsock {
+ struct virtio_device *vdev;
+ struct virtqueue *vqs[VSOCK_VQ_MAX];
+
+ /* Virtqueue processing is deferred to a workqueue */
+ struct work_struct tx_work;
+ struct work_struct rx_work;
+ struct work_struct event_work;
+
+ /* The following fields are protected by tx_lock. vqs[VSOCK_VQ_TX]
+ * must be accessed with tx_lock held.
+ */
+ struct mutex tx_lock;
+
+ struct work_struct send_pkt_work;
+ spinlock_t send_pkt_list_lock;
+ struct list_head send_pkt_list;
+
+ atomic_t queued_replies;
+
+ /* The following fields are protected by rx_lock. vqs[VSOCK_VQ_RX]
+ * must be accessed with rx_lock held.
+ */
+ struct mutex rx_lock;
+ int rx_buf_nr;
+ int rx_buf_max_nr;
+
+ /* The following fields are protected by event_lock.
+ * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held.
+ */
+ struct mutex event_lock;
+ struct virtio_vsock_event event_list[8];
+
+ u32 guest_cid;
+};
+
+static struct virtio_vsock *virtio_vsock_get(void)
+{
+ return the_virtio_vsock;
+}
+
+static u32 virtio_transport_get_local_cid(void)
+{
+ struct virtio_vsock *vsock = virtio_vsock_get();
+
+ return vsock->guest_cid;
+}
+
+static void
+virtio_transport_send_pkt_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, send_pkt_work);
+ struct virtqueue *vq;
+ bool added = false;
+ bool restart_rx = false;
+
+ mutex_lock(&vsock->tx_lock);
+
+ vq = vsock->vqs[VSOCK_VQ_TX];
+
+ /* Avoid unnecessary interrupts while we're processing the ring */
+ virtqueue_disable_cb(vq);
+
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ struct scatterlist hdr, buf, *sgs[2];
+ int ret, in_sg = 0, out_sg = 0;
+ bool reply;
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ if (list_empty(&vsock->send_pkt_list)) {
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+ virtqueue_enable_cb(vq);
+ break;
+ }
+
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del_init(&pkt->list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ reply = pkt->reply;
+
+ sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+ sgs[out_sg++] = &hdr;
+ if (pkt->buf) {
+ sg_init_one(&buf, pkt->buf, pkt->len);
+ sgs[out_sg++] = &buf;
+ }
+
+ ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL);
+ if (ret < 0) {
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ if (!virtqueue_enable_cb(vq) && ret == -ENOSPC)
+ continue; /* retry now that we have more space */
+ break;
+ }
+
+ if (reply) {
+ struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+ int val;
+
+ val = atomic_dec_return(&vsock->queued_replies);
+
+ /* Do we now have resources to resume rx processing? */
+ if (val + 1 == virtqueue_get_vring_size(rx_vq))
+ restart_rx = true;
+ }
+
+ added = true;
+ }
+
+ if (added)
+ virtqueue_kick(vq);
+
+ mutex_unlock(&vsock->tx_lock);
+
+ if (restart_rx)
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+}
+
+static int
+virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock *vsock;
+ int len = pkt->len;
+
+ vsock = virtio_vsock_get();
+ if (!vsock) {
+ virtio_transport_free_pkt(pkt);
+ return -ENODEV;
+ }
+
+ if (pkt->reply)
+ atomic_inc(&vsock->queued_replies);
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ list_add_tail(&pkt->list, &vsock->send_pkt_list);
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+ return len;
+}
+
+static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
+{
+ int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+ struct virtio_vsock_pkt *pkt;
+ struct scatterlist hdr, buf, *sgs[2];
+ struct virtqueue *vq;
+ int ret;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ do {
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ break;
+
+ pkt->buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!pkt->buf) {
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+
+ pkt->len = buf_len;
+
+ sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+ sgs[0] = &hdr;
+
+ sg_init_one(&buf, pkt->buf, buf_len);
+ sgs[1] = &buf;
+ ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
+ if (ret) {
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+ vsock->rx_buf_nr++;
+ } while (vq->num_free);
+ if (vsock->rx_buf_nr > vsock->rx_buf_max_nr)
+ vsock->rx_buf_max_nr = vsock->rx_buf_nr;
+ virtqueue_kick(vq);
+}
+
+static void virtio_transport_tx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, tx_work);
+ struct virtqueue *vq;
+ bool added = false;
+
+ vq = vsock->vqs[VSOCK_VQ_TX];
+ mutex_lock(&vsock->tx_lock);
+ do {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ virtqueue_disable_cb(vq);
+ while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) {
+ virtio_transport_free_pkt(pkt);
+ added = true;
+ }
+ } while (!virtqueue_enable_cb(vq));
+ mutex_unlock(&vsock->tx_lock);
+
+ if (added)
+ queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+}
+
+/* Is there space left for replies to rx packets? */
+static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
+{
+ struct virtqueue *vq = vsock->vqs[VSOCK_VQ_RX];
+ int val;
+
+ smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */
+ val = atomic_read(&vsock->queued_replies);
+
+ return val < virtqueue_get_vring_size(vq);
+}
+
+static void virtio_transport_rx_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, rx_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_RX];
+
+ mutex_lock(&vsock->rx_lock);
+
+ do {
+ virtqueue_disable_cb(vq);
+ for (;;) {
+ struct virtio_vsock_pkt *pkt;
+ unsigned int len;
+
+ if (!virtio_transport_more_replies(vsock)) {
+ /* Stop rx until the device processes already
+ * pending replies. Leave rx virtqueue
+ * callbacks disabled.
+ */
+ goto out;
+ }
+
+ pkt = virtqueue_get_buf(vq, &len);
+ if (!pkt) {
+ break;
+ }
+
+ vsock->rx_buf_nr--;
+
+ /* Drop short/long packets */
+ if (unlikely(len < sizeof(pkt->hdr) ||
+ len > sizeof(pkt->hdr) + pkt->len)) {
+ virtio_transport_free_pkt(pkt);
+ continue;
+ }
+
+ pkt->len = len - sizeof(pkt->hdr);
+ virtio_transport_recv_pkt(pkt);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+out:
+ if (vsock->rx_buf_nr < vsock->rx_buf_max_nr / 2)
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+}
+
+/* event_lock must be held */
+static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock,
+ struct virtio_vsock_event *event)
+{
+ struct scatterlist sg;
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_EVENT];
+
+ sg_init_one(&sg, event, sizeof(*event));
+
+ return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL);
+}
+
+/* event_lock must be held */
+static void virtio_vsock_event_fill(struct virtio_vsock *vsock)
+{
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(vsock->event_list); i++) {
+ struct virtio_vsock_event *event = &vsock->event_list[i];
+
+ virtio_vsock_event_fill_one(vsock, event);
+ }
+
+ virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+}
+
+static void virtio_vsock_reset_sock(struct sock *sk)
+{
+ lock_sock(sk);
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = ECONNRESET;
+ sk->sk_error_report(sk);
+ release_sock(sk);
+}
+
+static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)
+{
+ struct virtio_device *vdev = vsock->vdev;
+ u64 guest_cid;
+
+ vdev->config->get(vdev, offsetof(struct virtio_vsock_config, guest_cid),
+ &guest_cid, sizeof(guest_cid));
+ vsock->guest_cid = le64_to_cpu(guest_cid);
+}
+
+/* event_lock must be held */
+static void virtio_vsock_event_handle(struct virtio_vsock *vsock,
+ struct virtio_vsock_event *event)
+{
+ switch (le32_to_cpu(event->id)) {
+ case VIRTIO_VSOCK_EVENT_TRANSPORT_RESET:
+ virtio_vsock_update_guest_cid(vsock);
+ vsock_for_each_connected_socket(virtio_vsock_reset_sock);
+ break;
+ }
+}
+
+static void virtio_transport_event_work(struct work_struct *work)
+{
+ struct virtio_vsock *vsock =
+ container_of(work, struct virtio_vsock, event_work);
+ struct virtqueue *vq;
+
+ vq = vsock->vqs[VSOCK_VQ_EVENT];
+
+ mutex_lock(&vsock->event_lock);
+
+ do {
+ struct virtio_vsock_event *event;
+ unsigned int len;
+
+ virtqueue_disable_cb(vq);
+ while ((event = virtqueue_get_buf(vq, &len)) != NULL) {
+ if (len == sizeof(*event))
+ virtio_vsock_event_handle(vsock, event);
+
+ virtio_vsock_event_fill_one(vsock, event);
+ }
+ } while (!virtqueue_enable_cb(vq));
+
+ virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+
+ mutex_unlock(&vsock->event_lock);
+}
+
+static void virtio_vsock_event_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->event_work);
+}
+
+static void virtio_vsock_tx_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->tx_work);
+}
+
+static void virtio_vsock_rx_done(struct virtqueue *vq)
+{
+ struct virtio_vsock *vsock = vq->vdev->priv;
+
+ if (!vsock)
+ return;
+ queue_work(virtio_vsock_workqueue, &vsock->rx_work);
+}
+
+static struct virtio_transport virtio_transport = {
+ .transport = {
+ .get_local_cid = virtio_transport_get_local_cid,
+
+ .init = virtio_transport_do_socket_init,
+ .destruct = virtio_transport_destruct,
+ .release = virtio_transport_release,
+ .connect = virtio_transport_connect,
+ .shutdown = virtio_transport_shutdown,
+
+ .dgram_bind = virtio_transport_dgram_bind,
+ .dgram_dequeue = virtio_transport_dgram_dequeue,
+ .dgram_enqueue = virtio_transport_dgram_enqueue,
+ .dgram_allow = virtio_transport_dgram_allow,
+
+ .stream_dequeue = virtio_transport_stream_dequeue,
+ .stream_enqueue = virtio_transport_stream_enqueue,
+ .stream_has_data = virtio_transport_stream_has_data,
+ .stream_has_space = virtio_transport_stream_has_space,
+ .stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
+ .stream_is_active = virtio_transport_stream_is_active,
+ .stream_allow = virtio_transport_stream_allow,
+
+ .notify_poll_in = virtio_transport_notify_poll_in,
+ .notify_poll_out = virtio_transport_notify_poll_out,
+ .notify_recv_init = virtio_transport_notify_recv_init,
+ .notify_recv_pre_block = virtio_transport_notify_recv_pre_block,
+ .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue,
+ .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue,
+ .notify_send_init = virtio_transport_notify_send_init,
+ .notify_send_pre_block = virtio_transport_notify_send_pre_block,
+ .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue,
+ .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
+
+ .set_buffer_size = virtio_transport_set_buffer_size,
+ .set_min_buffer_size = virtio_transport_set_min_buffer_size,
+ .set_max_buffer_size = virtio_transport_set_max_buffer_size,
+ .get_buffer_size = virtio_transport_get_buffer_size,
+ .get_min_buffer_size = virtio_transport_get_min_buffer_size,
+ .get_max_buffer_size = virtio_transport_get_max_buffer_size,
+ },
+
+ .send_pkt = virtio_transport_send_pkt,
+};
+
+static int virtio_vsock_probe(struct virtio_device *vdev)
+{
+ vq_callback_t *callbacks[] = {
+ virtio_vsock_rx_done,
+ virtio_vsock_tx_done,
+ virtio_vsock_event_done,
+ };
+ static const char * const names[] = {
+ "rx",
+ "tx",
+ "event",
+ };
+ struct virtio_vsock *vsock = NULL;
+ int ret;
+
+ ret = mutex_lock_interruptible(&the_virtio_vsock_mutex);
+ if (ret)
+ return ret;
+
+ /* Only one virtio-vsock device per guest is supported */
+ if (the_virtio_vsock) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ vsock = kzalloc(sizeof(*vsock), GFP_KERNEL);
+ if (!vsock) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ vsock->vdev = vdev;
+
+ ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
+ vsock->vqs, callbacks, names);
+ if (ret < 0)
+ goto out;
+
+ virtio_vsock_update_guest_cid(vsock);
+
+ ret = vsock_core_init(&virtio_transport.transport);
+ if (ret < 0)
+ goto out_vqs;
+
+ vsock->rx_buf_nr = 0;
+ vsock->rx_buf_max_nr = 0;
+ atomic_set(&vsock->queued_replies, 0);
+
+ vdev->priv = vsock;
+ the_virtio_vsock = vsock;
+ mutex_init(&vsock->tx_lock);
+ mutex_init(&vsock->rx_lock);
+ mutex_init(&vsock->event_lock);
+ spin_lock_init(&vsock->send_pkt_list_lock);
+ INIT_LIST_HEAD(&vsock->send_pkt_list);
+ INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
+ INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
+ INIT_WORK(&vsock->event_work, virtio_transport_event_work);
+ INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
+
+ mutex_lock(&vsock->rx_lock);
+ virtio_vsock_rx_fill(vsock);
+ mutex_unlock(&vsock->rx_lock);
+
+ mutex_lock(&vsock->event_lock);
+ virtio_vsock_event_fill(vsock);
+ mutex_unlock(&vsock->event_lock);
+
+ mutex_unlock(&the_virtio_vsock_mutex);
+ return 0;
+
+out_vqs:
+ vsock->vdev->config->del_vqs(vsock->vdev);
+out:
+ kfree(vsock);
+ mutex_unlock(&the_virtio_vsock_mutex);
+ return ret;
+}
+
+static void virtio_vsock_remove(struct virtio_device *vdev)
+{
+ struct virtio_vsock *vsock = vdev->priv;
+ struct virtio_vsock_pkt *pkt;
+
+ flush_work(&vsock->rx_work);
+ flush_work(&vsock->tx_work);
+ flush_work(&vsock->event_work);
+ flush_work(&vsock->send_pkt_work);
+
+ vdev->config->reset(vdev);
+
+ mutex_lock(&vsock->rx_lock);
+ while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX])))
+ virtio_transport_free_pkt(pkt);
+ mutex_unlock(&vsock->rx_lock);
+
+ mutex_lock(&vsock->tx_lock);
+ while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX])))
+ virtio_transport_free_pkt(pkt);
+ mutex_unlock(&vsock->tx_lock);
+
+ spin_lock_bh(&vsock->send_pkt_list_lock);
+ while (!list_empty(&vsock->send_pkt_list)) {
+ pkt = list_first_entry(&vsock->send_pkt_list,
+ struct virtio_vsock_pkt, list);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ spin_unlock_bh(&vsock->send_pkt_list_lock);
+
+ mutex_lock(&the_virtio_vsock_mutex);
+ the_virtio_vsock = NULL;
+ vsock_core_exit();
+ mutex_unlock(&the_virtio_vsock_mutex);
+
+ vdev->config->del_vqs(vdev);
+
+ kfree(vsock);
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_VSOCK, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static unsigned int features[] = {
+};
+
+static struct virtio_driver virtio_vsock_driver = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = virtio_vsock_probe,
+ .remove = virtio_vsock_remove,
+};
+
+static int __init virtio_vsock_init(void)
+{
+ int ret;
+
+ virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0);
+ if (!virtio_vsock_workqueue)
+ return -ENOMEM;
+ ret = register_virtio_driver(&virtio_vsock_driver);
+ if (ret)
+ destroy_workqueue(virtio_vsock_workqueue);
+ return ret;
+}
+
+static void __exit virtio_vsock_exit(void)
+{
+ unregister_virtio_driver(&virtio_vsock_driver);
+ destroy_workqueue(virtio_vsock_workqueue);
+}
+
+module_init(virtio_vsock_init);
+module_exit(virtio_vsock_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("virtio transport for vsock");
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
new file mode 100644
index 000000000000..a53b3a16b4f1
--- /dev/null
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -0,0 +1,992 @@
+/*
+ * common code for virtio vsock
+ *
+ * Copyright (C) 2013-2015 Red Hat, Inc.
+ * Author: Asias He <asias@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/virtio.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_vsock.h>
+
+#include <net/sock.h>
+#include <net/af_vsock.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/vsock_virtio_transport_common.h>
+
+/* How long to wait for graceful shutdown of a connection */
+#define VSOCK_CLOSE_TIMEOUT (8 * HZ)
+
+static const struct virtio_transport *virtio_transport_get_ops(void)
+{
+ const struct vsock_transport *t = vsock_core_get_transport();
+
+ return container_of(t, struct virtio_transport, transport);
+}
+
+struct virtio_vsock_pkt *
+virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
+ size_t len,
+ u32 src_cid,
+ u32 src_port,
+ u32 dst_cid,
+ u32 dst_port)
+{
+ struct virtio_vsock_pkt *pkt;
+ int err;
+
+ pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+ if (!pkt)
+ return NULL;
+
+ pkt->hdr.type = cpu_to_le16(info->type);
+ pkt->hdr.op = cpu_to_le16(info->op);
+ pkt->hdr.src_cid = cpu_to_le64(src_cid);
+ pkt->hdr.dst_cid = cpu_to_le64(dst_cid);
+ pkt->hdr.src_port = cpu_to_le32(src_port);
+ pkt->hdr.dst_port = cpu_to_le32(dst_port);
+ pkt->hdr.flags = cpu_to_le32(info->flags);
+ pkt->len = len;
+ pkt->hdr.len = cpu_to_le32(len);
+ pkt->reply = info->reply;
+
+ if (info->msg && len > 0) {
+ pkt->buf = kmalloc(len, GFP_KERNEL);
+ if (!pkt->buf)
+ goto out_pkt;
+ err = memcpy_from_msg(pkt->buf, info->msg, len);
+ if (err)
+ goto out;
+ }
+
+ trace_virtio_transport_alloc_pkt(src_cid, src_port,
+ dst_cid, dst_port,
+ len,
+ info->type,
+ info->op,
+ info->flags);
+
+ return pkt;
+
+out:
+ kfree(pkt->buf);
+out_pkt:
+ kfree(pkt);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_alloc_pkt);
+
+static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt_info *info)
+{
+ u32 src_cid, src_port, dst_cid, dst_port;
+ struct virtio_vsock_sock *vvs;
+ struct virtio_vsock_pkt *pkt;
+ u32 pkt_len = info->pkt_len;
+
+ src_cid = vm_sockets_get_local_cid();
+ src_port = vsk->local_addr.svm_port;
+ if (!info->remote_cid) {
+ dst_cid = vsk->remote_addr.svm_cid;
+ dst_port = vsk->remote_addr.svm_port;
+ } else {
+ dst_cid = info->remote_cid;
+ dst_port = info->remote_port;
+ }
+
+ vvs = vsk->trans;
+
+ /* we can send less than pkt_len bytes */
+ if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
+ pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+
+ /* virtio_transport_get_credit might return less than pkt_len credit */
+ pkt_len = virtio_transport_get_credit(vvs, pkt_len);
+
+ /* Do not send zero length OP_RW pkt */
+ if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
+ return pkt_len;
+
+ pkt = virtio_transport_alloc_pkt(info, pkt_len,
+ src_cid, src_port,
+ dst_cid, dst_port);
+ if (!pkt) {
+ virtio_transport_put_credit(vvs, pkt_len);
+ return -ENOMEM;
+ }
+
+ virtio_transport_inc_tx_pkt(vvs, pkt);
+
+ return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
+static void virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
+ struct virtio_vsock_pkt *pkt)
+{
+ vvs->rx_bytes += pkt->len;
+}
+
+static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
+ struct virtio_vsock_pkt *pkt)
+{
+ vvs->rx_bytes -= pkt->len;
+ vvs->fwd_cnt += pkt->len;
+}
+
+void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
+{
+ spin_lock_bh(&vvs->tx_lock);
+ pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
+ pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
+ spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
+
+u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+ u32 ret;
+
+ spin_lock_bh(&vvs->tx_lock);
+ ret = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+ if (ret > credit)
+ ret = credit;
+ vvs->tx_cnt += ret;
+ spin_unlock_bh(&vvs->tx_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_credit);
+
+void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit)
+{
+ spin_lock_bh(&vvs->tx_lock);
+ vvs->tx_cnt -= credit;
+ spin_unlock_bh(&vvs->tx_lock);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_put_credit);
+
+static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
+ int type,
+ struct virtio_vsock_hdr *hdr)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
+ .type = type,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+static ssize_t
+virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ struct virtio_vsock_pkt *pkt;
+ size_t bytes, total = 0;
+ int err = -EFAULT;
+
+ spin_lock_bh(&vvs->rx_lock);
+ while (total < len && !list_empty(&vvs->rx_queue)) {
+ pkt = list_first_entry(&vvs->rx_queue,
+ struct virtio_vsock_pkt, list);
+
+ bytes = len - total;
+ if (bytes > pkt->len - pkt->off)
+ bytes = pkt->len - pkt->off;
+
+ /* sk_lock is held by caller so no one else can dequeue.
+ * Unlock rx_lock since memcpy_to_msg() may sleep.
+ */
+ spin_unlock_bh(&vvs->rx_lock);
+
+ err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
+ if (err)
+ goto out;
+
+ spin_lock_bh(&vvs->rx_lock);
+
+ total += bytes;
+ pkt->off += bytes;
+ if (pkt->off == pkt->len) {
+ virtio_transport_dec_rx_pkt(vvs, pkt);
+ list_del(&pkt->list);
+ virtio_transport_free_pkt(pkt);
+ }
+ }
+ spin_unlock_bh(&vvs->rx_lock);
+
+ /* Send a credit pkt to peer */
+ virtio_transport_send_credit_update(vsk, VIRTIO_VSOCK_TYPE_STREAM,
+ NULL);
+
+ return total;
+
+out:
+ if (total)
+ err = total;
+ return err;
+}
+
+ssize_t
+virtio_transport_stream_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len, int flags)
+{
+ if (flags & MSG_PEEK)
+ return -EOPNOTSUPP;
+
+ return virtio_transport_stream_do_dequeue(vsk, msg, len);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_dequeue);
+
+int
+virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len, int flags)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue);
+
+s64 virtio_transport_stream_has_data(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ spin_lock_bh(&vvs->rx_lock);
+ bytes = vvs->rx_bytes;
+ spin_unlock_bh(&vvs->rx_lock);
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_data);
+
+static s64 virtio_transport_has_space(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ bytes = vvs->peer_buf_alloc - (vvs->tx_cnt - vvs->peer_fwd_cnt);
+ if (bytes < 0)
+ bytes = 0;
+
+ return bytes;
+}
+
+s64 virtio_transport_stream_has_space(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ s64 bytes;
+
+ spin_lock_bh(&vvs->tx_lock);
+ bytes = virtio_transport_has_space(vsk);
+ spin_unlock_bh(&vvs->tx_lock);
+
+ return bytes;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_has_space);
+
+int virtio_transport_do_socket_init(struct vsock_sock *vsk,
+ struct vsock_sock *psk)
+{
+ struct virtio_vsock_sock *vvs;
+
+ vvs = kzalloc(sizeof(*vvs), GFP_KERNEL);
+ if (!vvs)
+ return -ENOMEM;
+
+ vsk->trans = vvs;
+ vvs->vsk = vsk;
+ if (psk) {
+ struct virtio_vsock_sock *ptrans = psk->trans;
+
+ vvs->buf_size = ptrans->buf_size;
+ vvs->buf_size_min = ptrans->buf_size_min;
+ vvs->buf_size_max = ptrans->buf_size_max;
+ vvs->peer_buf_alloc = ptrans->peer_buf_alloc;
+ } else {
+ vvs->buf_size = VIRTIO_VSOCK_DEFAULT_BUF_SIZE;
+ vvs->buf_size_min = VIRTIO_VSOCK_DEFAULT_MIN_BUF_SIZE;
+ vvs->buf_size_max = VIRTIO_VSOCK_DEFAULT_MAX_BUF_SIZE;
+ }
+
+ vvs->buf_alloc = vvs->buf_size;
+
+ spin_lock_init(&vvs->rx_lock);
+ spin_lock_init(&vvs->tx_lock);
+ INIT_LIST_HEAD(&vvs->rx_queue);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_do_socket_init);
+
+u64 virtio_transport_get_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_buffer_size);
+
+u64 virtio_transport_get_min_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size_min;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_min_buffer_size);
+
+u64 virtio_transport_get_max_buffer_size(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size_max;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_get_max_buffer_size);
+
+void virtio_transport_set_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val < vvs->buf_size_min)
+ vvs->buf_size_min = val;
+ if (val > vvs->buf_size_max)
+ vvs->buf_size_max = val;
+ vvs->buf_size = val;
+ vvs->buf_alloc = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_buffer_size);
+
+void virtio_transport_set_min_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val > vvs->buf_size)
+ vvs->buf_size = val;
+ vvs->buf_size_min = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_min_buffer_size);
+
+void virtio_transport_set_max_buffer_size(struct vsock_sock *vsk, u64 val)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ if (val > VIRTIO_VSOCK_MAX_BUF_SIZE)
+ val = VIRTIO_VSOCK_MAX_BUF_SIZE;
+ if (val < vvs->buf_size)
+ vvs->buf_size = val;
+ vvs->buf_size_max = val;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_set_max_buffer_size);
+
+int
+virtio_transport_notify_poll_in(struct vsock_sock *vsk,
+ size_t target,
+ bool *data_ready_now)
+{
+ if (vsock_stream_has_data(vsk))
+ *data_ready_now = true;
+ else
+ *data_ready_now = false;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_in);
+
+int
+virtio_transport_notify_poll_out(struct vsock_sock *vsk,
+ size_t target,
+ bool *space_avail_now)
+{
+ s64 free_space;
+
+ free_space = vsock_stream_has_space(vsk);
+ if (free_space > 0)
+ *space_avail_now = true;
+ else if (free_space == 0)
+ *space_avail_now = false;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_poll_out);
+
+int virtio_transport_notify_recv_init(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_init);
+
+int virtio_transport_notify_recv_pre_block(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_block);
+
+int virtio_transport_notify_recv_pre_dequeue(struct vsock_sock *vsk,
+ size_t target, struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_pre_dequeue);
+
+int virtio_transport_notify_recv_post_dequeue(struct vsock_sock *vsk,
+ size_t target, ssize_t copied, bool data_read,
+ struct vsock_transport_recv_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_recv_post_dequeue);
+
+int virtio_transport_notify_send_init(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_init);
+
+int virtio_transport_notify_send_pre_block(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_block);
+
+int virtio_transport_notify_send_pre_enqueue(struct vsock_sock *vsk,
+ struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_pre_enqueue);
+
+int virtio_transport_notify_send_post_enqueue(struct vsock_sock *vsk,
+ ssize_t written, struct vsock_transport_send_notify_data *data)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_notify_send_post_enqueue);
+
+u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ return vvs->buf_size;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_rcvhiwat);
+
+bool virtio_transport_stream_is_active(struct vsock_sock *vsk)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active);
+
+bool virtio_transport_stream_allow(u32 cid, u32 port)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
+
+int virtio_transport_dgram_bind(struct vsock_sock *vsk,
+ struct sockaddr_vm *addr)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind);
+
+bool virtio_transport_dgram_allow(u32 cid, u32 port)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow);
+
+int virtio_transport_connect(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_REQUEST,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_connect);
+
+int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_SHUTDOWN,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .flags = (mode & RCV_SHUTDOWN ?
+ VIRTIO_VSOCK_SHUTDOWN_RCV : 0) |
+ (mode & SEND_SHUTDOWN ?
+ VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_shutdown);
+
+int
+virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
+ struct sockaddr_vm *remote_addr,
+ struct msghdr *msg,
+ size_t dgram_len)
+{
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue);
+
+ssize_t
+virtio_transport_stream_enqueue(struct vsock_sock *vsk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RW,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .msg = msg,
+ .pkt_len = len,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_stream_enqueue);
+
+void virtio_transport_destruct(struct vsock_sock *vsk)
+{
+ struct virtio_vsock_sock *vvs = vsk->trans;
+
+ kfree(vvs);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_destruct);
+
+static int virtio_transport_reset(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RST,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .reply = !!pkt,
+ };
+
+ /* Send RST only if the original pkt is not a RST pkt */
+ if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ return 0;
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+/* Normally packets are associated with a socket. There may be no socket if an
+ * attempt was made to connect to a socket that does not exist.
+ */
+static int virtio_transport_reset_no_sock(struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RST,
+ .type = le16_to_cpu(pkt->hdr.type),
+ .reply = true,
+ };
+
+ /* Send RST only if the original pkt is not a RST pkt */
+ if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ return 0;
+
+ pkt = virtio_transport_alloc_pkt(&info, 0,
+ le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port),
+ le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+ if (!pkt)
+ return -ENOMEM;
+
+ return virtio_transport_get_ops()->send_pkt(pkt);
+}
+
+static void virtio_transport_wait_close(struct sock *sk, long timeout)
+{
+ if (timeout) {
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(sk_sleep(sk), &wait,
+ TASK_INTERRUPTIBLE);
+ if (sk_wait_event(sk, &timeout,
+ sock_flag(sk, SOCK_DONE)))
+ break;
+ } while (!signal_pending(current) && timeout);
+
+ finish_wait(sk_sleep(sk), &wait);
+ }
+}
+
+static void virtio_transport_do_close(struct vsock_sock *vsk,
+ bool cancel_timeout)
+{
+ struct sock *sk = sk_vsock(vsk);
+
+ sock_set_flag(sk, SOCK_DONE);
+ vsk->peer_shutdown = SHUTDOWN_MASK;
+ if (vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+ sk->sk_state_change(sk);
+
+ if (vsk->close_work_scheduled &&
+ (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) {
+ vsk->close_work_scheduled = false;
+
+ vsock_remove_sock(vsk);
+
+ /* Release refcnt obtained when we scheduled the timeout */
+ sock_put(sk);
+ }
+}
+
+static void virtio_transport_close_timeout(struct work_struct *work)
+{
+ struct vsock_sock *vsk =
+ container_of(work, struct vsock_sock, close_work.work);
+ struct sock *sk = sk_vsock(vsk);
+
+ sock_hold(sk);
+ lock_sock(sk);
+
+ if (!sock_flag(sk, SOCK_DONE)) {
+ (void)virtio_transport_reset(vsk, NULL);
+
+ virtio_transport_do_close(vsk, false);
+ }
+
+ vsk->close_work_scheduled = false;
+
+ release_sock(sk);
+ sock_put(sk);
+}
+
+/* User context, vsk->sk is locked */
+static bool virtio_transport_close(struct vsock_sock *vsk)
+{
+ struct sock *sk = &vsk->sk;
+
+ if (!(sk->sk_state == SS_CONNECTED ||
+ sk->sk_state == SS_DISCONNECTING))
+ return true;
+
+ /* Already received SHUTDOWN from peer, reply with RST */
+ if ((vsk->peer_shutdown & SHUTDOWN_MASK) == SHUTDOWN_MASK) {
+ (void)virtio_transport_reset(vsk, NULL);
+ return true;
+ }
+
+ if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
+ (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
+
+ if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
+ virtio_transport_wait_close(sk, sk->sk_lingertime);
+
+ if (sock_flag(sk, SOCK_DONE)) {
+ return true;
+ }
+
+ sock_hold(sk);
+ INIT_DELAYED_WORK(&vsk->close_work,
+ virtio_transport_close_timeout);
+ vsk->close_work_scheduled = true;
+ schedule_delayed_work(&vsk->close_work, VSOCK_CLOSE_TIMEOUT);
+ return false;
+}
+
+void virtio_transport_release(struct vsock_sock *vsk)
+{
+ struct sock *sk = &vsk->sk;
+ bool remove_sock = true;
+
+ lock_sock(sk);
+ if (sk->sk_type == SOCK_STREAM)
+ remove_sock = virtio_transport_close(vsk);
+ release_sock(sk);
+
+ if (remove_sock)
+ vsock_remove_sock(vsk);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_release);
+
+static int
+virtio_transport_recv_connecting(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ int err;
+ int skerr;
+
+ switch (le16_to_cpu(pkt->hdr.op)) {
+ case VIRTIO_VSOCK_OP_RESPONSE:
+ sk->sk_state = SS_CONNECTED;
+ sk->sk_socket->state = SS_CONNECTED;
+ vsock_insert_connected(vsk);
+ sk->sk_state_change(sk);
+ break;
+ case VIRTIO_VSOCK_OP_INVALID:
+ break;
+ case VIRTIO_VSOCK_OP_RST:
+ skerr = ECONNRESET;
+ err = 0;
+ goto destroy;
+ default:
+ skerr = EPROTO;
+ err = -EINVAL;
+ goto destroy;
+ }
+ return 0;
+
+destroy:
+ virtio_transport_reset(vsk, pkt);
+ sk->sk_state = SS_UNCONNECTED;
+ sk->sk_err = skerr;
+ sk->sk_error_report(sk);
+ return err;
+}
+
+static int
+virtio_transport_recv_connected(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ int err = 0;
+
+ switch (le16_to_cpu(pkt->hdr.op)) {
+ case VIRTIO_VSOCK_OP_RW:
+ pkt->len = le32_to_cpu(pkt->hdr.len);
+ pkt->off = 0;
+
+ spin_lock_bh(&vvs->rx_lock);
+ virtio_transport_inc_rx_pkt(vvs, pkt);
+ list_add_tail(&pkt->list, &vvs->rx_queue);
+ spin_unlock_bh(&vvs->rx_lock);
+
+ sk->sk_data_ready(sk);
+ return err;
+ case VIRTIO_VSOCK_OP_CREDIT_UPDATE:
+ sk->sk_write_space(sk);
+ break;
+ case VIRTIO_VSOCK_OP_SHUTDOWN:
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
+ vsk->peer_shutdown |= RCV_SHUTDOWN;
+ if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
+ vsk->peer_shutdown |= SEND_SHUTDOWN;
+ if (vsk->peer_shutdown == SHUTDOWN_MASK &&
+ vsock_stream_has_data(vsk) <= 0)
+ sk->sk_state = SS_DISCONNECTING;
+ if (le32_to_cpu(pkt->hdr.flags))
+ sk->sk_state_change(sk);
+ break;
+ case VIRTIO_VSOCK_OP_RST:
+ virtio_transport_do_close(vsk, true);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ virtio_transport_free_pkt(pkt);
+ return err;
+}
+
+static void
+virtio_transport_recv_disconnecting(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+
+ if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
+ virtio_transport_do_close(vsk, true);
+}
+
+static int
+virtio_transport_send_response(struct vsock_sock *vsk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct virtio_vsock_pkt_info info = {
+ .op = VIRTIO_VSOCK_OP_RESPONSE,
+ .type = VIRTIO_VSOCK_TYPE_STREAM,
+ .remote_cid = le32_to_cpu(pkt->hdr.src_cid),
+ .remote_port = le32_to_cpu(pkt->hdr.src_port),
+ .reply = true,
+ };
+
+ return virtio_transport_send_pkt_info(vsk, &info);
+}
+
+/* Handle server socket */
+static int
+virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct vsock_sock *vchild;
+ struct sock *child;
+
+ if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
+ virtio_transport_reset(vsk, pkt);
+ return -EINVAL;
+ }
+
+ if (sk_acceptq_is_full(sk)) {
+ virtio_transport_reset(vsk, pkt);
+ return -ENOMEM;
+ }
+
+ child = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL,
+ sk->sk_type, 0);
+ if (!child) {
+ virtio_transport_reset(vsk, pkt);
+ return -ENOMEM;
+ }
+
+ sk->sk_ack_backlog++;
+
+ lock_sock_nested(child, SINGLE_DEPTH_NESTING);
+
+ child->sk_state = SS_CONNECTED;
+
+ vchild = vsock_sk(child);
+ vsock_addr_init(&vchild->local_addr, le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port));
+ vsock_addr_init(&vchild->remote_addr, le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+
+ vsock_insert_connected(vchild);
+ vsock_enqueue_accept(sk, child);
+ virtio_transport_send_response(vchild, pkt);
+
+ release_sock(child);
+
+ sk->sk_data_ready(sk);
+ return 0;
+}
+
+static bool virtio_transport_space_update(struct sock *sk,
+ struct virtio_vsock_pkt *pkt)
+{
+ struct vsock_sock *vsk = vsock_sk(sk);
+ struct virtio_vsock_sock *vvs = vsk->trans;
+ bool space_available;
+
+ /* buf_alloc and fwd_cnt is always included in the hdr */
+ spin_lock_bh(&vvs->tx_lock);
+ vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+ vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
+ space_available = virtio_transport_has_space(vsk);
+ spin_unlock_bh(&vvs->tx_lock);
+ return space_available;
+}
+
+/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
+ * lock.
+ */
+void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
+{
+ struct sockaddr_vm src, dst;
+ struct vsock_sock *vsk;
+ struct sock *sk;
+ bool space_available;
+
+ vsock_addr_init(&src, le32_to_cpu(pkt->hdr.src_cid),
+ le32_to_cpu(pkt->hdr.src_port));
+ vsock_addr_init(&dst, le32_to_cpu(pkt->hdr.dst_cid),
+ le32_to_cpu(pkt->hdr.dst_port));
+
+ trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
+ dst.svm_cid, dst.svm_port,
+ le32_to_cpu(pkt->hdr.len),
+ le16_to_cpu(pkt->hdr.type),
+ le16_to_cpu(pkt->hdr.op),
+ le32_to_cpu(pkt->hdr.flags),
+ le32_to_cpu(pkt->hdr.buf_alloc),
+ le32_to_cpu(pkt->hdr.fwd_cnt));
+
+ if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
+ (void)virtio_transport_reset_no_sock(pkt);
+ goto free_pkt;
+ }
+
+ /* The socket must be in connected or bound table
+ * otherwise send reset back
+ */
+ sk = vsock_find_connected_socket(&src, &dst);
+ if (!sk) {
+ sk = vsock_find_bound_socket(&dst);
+ if (!sk) {
+ (void)virtio_transport_reset_no_sock(pkt);
+ goto free_pkt;
+ }
+ }
+
+ vsk = vsock_sk(sk);
+
+ space_available = virtio_transport_space_update(sk, pkt);
+
+ lock_sock(sk);
+
+ /* Update CID in case it has changed after a transport reset event */
+ vsk->local_addr.svm_cid = dst.svm_cid;
+
+ if (space_available)
+ sk->sk_write_space(sk);
+
+ switch (sk->sk_state) {
+ case VSOCK_SS_LISTEN:
+ virtio_transport_recv_listen(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ case SS_CONNECTING:
+ virtio_transport_recv_connecting(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ case SS_CONNECTED:
+ virtio_transport_recv_connected(sk, pkt);
+ break;
+ case SS_DISCONNECTING:
+ virtio_transport_recv_disconnecting(sk, pkt);
+ virtio_transport_free_pkt(pkt);
+ break;
+ default:
+ virtio_transport_free_pkt(pkt);
+ break;
+ }
+ release_sock(sk);
+
+ /* Release refcnt obtained when we fetched this socket out of the
+ * bound or connected list.
+ */
+ sock_put(sk);
+ return;
+
+free_pkt:
+ virtio_transport_free_pkt(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
+
+void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
+{
+ kfree(pkt->buf);
+ kfree(pkt);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Asias He");
+MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 4120b7a538be..4be4fbbc0b50 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1644,6 +1644,8 @@ static void vmci_transport_destruct(struct vsock_sock *vsk)
static void vmci_transport_release(struct vsock_sock *vsk)
{
+ vsock_remove_sock(vsk);
+
if (!vmci_handle_is_invalid(vmci_trans(vsk)->dg_handle)) {
vmci_datagram_destroy_handle(vmci_trans(vsk)->dg_handle);
vmci_trans(vsk)->dg_handle = VMCI_INVALID_HANDLE;
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index da49c0b1fd32..b0e11b6dc994 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -715,7 +715,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy,
ASSERT_RTNL();
- if (!config_enabled(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
+ if (!IS_ENABLED(CONFIG_CFG80211_REG_RELAX_NO_IR) ||
!(wiphy->regulatory_flags & REGULATORY_ENABLE_RELAX_NO_IR))
return false;