diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/fib_frontend.c | 2 | ||||
-rw-r--r-- | net/ipv4/fib_hash.c | 54 | ||||
-rw-r--r-- | net/ipv4/fib_lookup.h | 5 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 7 | ||||
-rw-r--r-- | net/ipv4/gre.c | 5 | ||||
-rw-r--r-- | net/ipv4/icmp.c | 3 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 4 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 27 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 3 | ||||
-rw-r--r-- | net/ipv4/inetpeer.c | 138 | ||||
-rw-r--r-- | net/ipv4/ip_gre.c | 7 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 10 | ||||
-rw-r--r-- | net/ipv4/ipip.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/arp_tables.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_tables.c | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_nat_core.c | 40 | ||||
-rw-r--r-- | net/ipv4/proc.c | 9 | ||||
-rw-r--r-- | net/ipv4/protocol.c | 8 | ||||
-rw-r--r-- | net/ipv4/route.c | 75 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 6 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 11 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 12 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 42 | ||||
-rw-r--r-- | net/ipv4/tunnel4.c | 29 | ||||
-rw-r--r-- | net/ipv4/udp.c | 6 |
27 files changed, 309 insertions, 210 deletions
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 36e27c2107de..eb6f69a8f27a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1052,7 +1052,7 @@ static void ip_fib_net_exit(struct net *net) hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { hlist_del(node); fib_table_flush(tb); - kfree(tb); + fib_free_table(tb); } } kfree(net->ipv4.fib_table_hash); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 43e1c594ce8f..b3acb0417b21 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -120,11 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz, struct fib_node *f; hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { - struct hlist_head __rcu *new_head; + struct hlist_head *new_head; hlist_del_rcu(&f->fn_hash); - new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + new_head = rcu_dereference_protected(fz->fz_hash, 1) + + fn_hash(f->fn_key, fz); hlist_add_head_rcu(&f->fn_hash, new_head); } } @@ -179,8 +180,8 @@ static void fn_rehash_zone(struct fn_zone *fz) memcpy(&nfz, fz, sizeof(nfz)); write_seqlock_bh(&fz->fz_lock); - old_ht = fz->fz_hash; - nfz.fz_hash = ht; + old_ht = rcu_dereference_protected(fz->fz_hash, 1); + RCU_INIT_POINTER(nfz.fz_hash, ht); nfz.fz_hashmask = new_hashmask; nfz.fz_divisor = new_divisor; fn_rebuild_zone(&nfz, old_ht, old_divisor); @@ -236,7 +237,7 @@ fn_new_zone(struct fn_hash *table, int z) seqlock_init(&fz->fz_lock); fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; fz->fz_hashmask = fz->fz_divisor - 1; - fz->fz_hash = fz->fz_embedded_hash; + RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); fz->fz_order = z; fz->fz_revorder = 32 - z; fz->fz_mask = inet_make_mask(z); @@ -272,7 +273,7 @@ int fib_table_lookup(struct fib_table *tb, for (fz = rcu_dereference(t->fn_zone_list); fz != NULL; fz = rcu_dereference(fz->fz_next)) { - struct hlist_head __rcu *head; + struct hlist_head *head; struct hlist_node *node; struct fib_node *f; __be32 k; @@ -282,7 +283,7 @@ int fib_table_lookup(struct fib_table *tb, seq = read_seqbegin(&fz->fz_lock); k = fz_key(flp->fl4_dst, fz); - head = &fz->fz_hash[fn_hash(k, fz)]; + head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); hlist_for_each_entry_rcu(f, node, head, fn_hash) { if (f->fn_key != k) continue; @@ -311,6 +312,7 @@ void fib_table_select_default(struct fib_table *tb, struct fib_info *last_resort; struct fn_hash *t = (struct fn_hash *)tb->tb_data; struct fn_zone *fz = t->fn_zones[0]; + struct hlist_head *head; if (fz == NULL) return; @@ -320,7 +322,8 @@ void fib_table_select_default(struct fib_table *tb, order = -1; rcu_read_lock(); - hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) { + head = rcu_dereference(fz->fz_hash); + hlist_for_each_entry_rcu(f, node, head, fn_hash) { struct fib_alias *fa; list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { @@ -374,7 +377,7 @@ out: /* Insert node F to FZ. */ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) { - struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); hlist_add_head_rcu(&f->fn_hash, head); } @@ -382,7 +385,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) /* Return the node in FZ matching KEY. */ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) { - struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); struct hlist_node *node; struct fib_node *f; @@ -662,7 +665,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) static int fn_flush_list(struct fn_zone *fz, int idx) { - struct hlist_head *head = &fz->fz_hash[idx]; + struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; struct hlist_node *node, *n; struct fib_node *f; int found = 0; @@ -713,6 +716,24 @@ int fib_table_flush(struct fib_table *tb) return found; } +void fib_free_table(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz, *next; + + next = table->fn_zone_list; + while (next != NULL) { + fz = next; + next = fz->fz_next; + + if (fz->fz_hash != fz->fz_embedded_hash) + fz_hash_free(fz->fz_hash, fz->fz_divisor); + + kfree(fz); + } + + kfree(tb); +} static inline int fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, @@ -761,14 +782,15 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, struct fn_zone *fz) { int h, s_h; + struct hlist_head *head = rcu_dereference(fz->fz_hash); - if (fz->fz_hash == NULL) + if (head == NULL) return skb->len; s_h = cb->args[3]; for (h = s_h; h < fz->fz_divisor; h++) { - if (hlist_empty(&fz->fz_hash[h])) + if (hlist_empty(head + h)) continue; - if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h]) < 0) { + if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { cb->args[3] = h; return -1; } @@ -872,7 +894,7 @@ static struct fib_alias *fib_get_first(struct seq_file *seq) if (!iter->zone->fz_nent) continue; - iter->hash_head = iter->zone->fz_hash; + iter->hash_head = rcu_dereference(iter->zone->fz_hash); maxslot = iter->zone->fz_divisor; for (iter->bucket = 0; iter->bucket < maxslot; @@ -957,7 +979,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq) goto out; iter->bucket = 0; - iter->hash_head = iter->zone->fz_hash; + iter->hash_head = rcu_dereference(iter->zone->fz_hash); hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { list_for_each_entry(fa, &fn->fn_alias, fa_list) { diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a29edf2219c8..c079cc0ec651 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -47,11 +47,8 @@ extern int fib_detect_death(struct fib_info *fi, int order, static inline void fib_result_assign(struct fib_result *res, struct fib_info *fi) { - if (res->fi != NULL) - fib_info_put(res->fi); + /* we used to play games with refcounts, but we now use RCU */ res->fi = fi; - if (fi != NULL) - atomic_inc(&fi->fib_clntref); } #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b14450895102..0f280348e0fd 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -365,7 +365,7 @@ static struct tnode *tnode_alloc(size_t size) if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else - return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); + return vzalloc(size); } static void __tnode_vfree(struct work_struct *arg) @@ -1797,6 +1797,11 @@ int fib_table_flush(struct fib_table *tb) return found; } +void fib_free_table(struct fib_table *tb) +{ + kfree(tb); +} + void fib_table_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index caea6885fdbd..c6933f2ea310 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -22,7 +22,7 @@ #include <net/gre.h> -static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly; +static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; static DEFINE_SPINLOCK(gre_proto_lock); int gre_add_protocol(const struct gre_protocol *proto, u8 version) @@ -51,7 +51,8 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) goto err_out; spin_lock(&gre_proto_lock); - if (gre_proto[version] != proto) + if (rcu_dereference_protected(gre_proto[version], + lockdep_is_held(&gre_proto_lock)) != proto) goto err_out_unlock; rcu_assign_pointer(gre_proto[version], NULL); spin_unlock(&gre_proto_lock); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 96bc7f9475a3..e5d1a44bcbdf 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -569,6 +569,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* No need to clone since we're just using its address. */ rt2 = rt; + if (!fl.nl_u.ip4_u.saddr) + fl.nl_u.ip4_u.saddr = rt->rt_src; + err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); switch (err) { case 0: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c8877c6c7216..3c53c2d89e3b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2306,10 +2306,8 @@ void ip_mc_drop_socket(struct sock *sk) in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); - if (in_dev != NULL) { + if (in_dev != NULL) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); - in_dev_put(in_dev); - } /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); call_rcu(&iml->rcu, ip_mc_socklist_reclaim); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ba8042665849..2ada17129fce 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -490,9 +490,11 @@ static int inet_csk_diag_dump(struct sock *sk, { struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { struct inet_diag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); + const struct nlattr *bc = nlmsg_find_attr(cb->nlh, + sizeof(*r), + INET_DIAG_REQ_BYTECODE); struct inet_sock *inet = inet_sk(sk); entry.family = sk->sk_family; @@ -512,7 +514,7 @@ static int inet_csk_diag_dump(struct sock *sk, entry.dport = ntohs(inet->inet_dport); entry.userlocks = sk->sk_userlocks; - if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) return 0; } @@ -527,9 +529,11 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, { struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { struct inet_diag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); + const struct nlattr *bc = nlmsg_find_attr(cb->nlh, + sizeof(*r), + INET_DIAG_REQ_BYTECODE); entry.family = tw->tw_family; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) @@ -548,7 +552,7 @@ static int inet_twsk_diag_dump(struct inet_timewait_sock *tw, entry.dport = ntohs(tw->tw_dport); entry.userlocks = 0; - if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), nla_len(bc), &entry)) return 0; } @@ -618,7 +622,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, struct inet_diag_req *r = NLMSG_DATA(cb->nlh); struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt; - struct rtattr *bc = NULL; + const struct nlattr *bc = NULL; struct inet_sock *inet = inet_sk(sk); int j, s_j; int reqnum, s_reqnum; @@ -638,8 +642,9 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (!lopt || !lopt->qlen) goto out; - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - bc = (struct rtattr *)(r + 1); + if (nlmsg_attrlen(cb->nlh, sizeof(*r))) { + bc = nlmsg_find_attr(cb->nlh, sizeof(*r), + INET_DIAG_REQ_BYTECODE); entry.sport = inet->inet_num; entry.userlocks = sk->sk_userlocks; } @@ -672,8 +677,8 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); - if (!inet_diag_bc_run(RTA_DATA(bc), - RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(nla_data(bc), + nla_len(bc), &entry)) continue; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 1b344f30b463..3c0369a3a663 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -133,8 +133,7 @@ int __inet_inherit_port(struct sock *sk, struct sock *child) } } } - sk_add_bind_node(child, &tb->owners); - inet_csk(child)->icsk_bind_hash = tb; + inet_bind_hash(child, tb, port); spin_unlock(&head->lock); return 0; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 9ffa24b9a804..9e94d7cf4f8a 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -72,18 +72,19 @@ static struct kmem_cache *peer_cachep __read_mostly; #define node_height(x) x->avl_height #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) +#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node) static const struct inet_peer peer_fake_node = { - .avl_left = peer_avl_empty, - .avl_right = peer_avl_empty, + .avl_left = peer_avl_empty_rcu, + .avl_right = peer_avl_empty_rcu, .avl_height = 0 }; static struct { - struct inet_peer *root; + struct inet_peer __rcu *root; spinlock_t lock; int total; } peers = { - .root = peer_avl_empty, + .root = peer_avl_empty_rcu, .lock = __SPIN_LOCK_UNLOCKED(peers.lock), .total = 0, }; @@ -156,11 +157,14 @@ static void unlink_from_unused(struct inet_peer *p) */ #define lookup(_daddr, _stack) \ ({ \ - struct inet_peer *u, **v; \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ \ stackptr = _stack; \ *stackptr++ = &peers.root; \ - for (u = peers.root; u != peer_avl_empty; ) { \ + for (u = rcu_dereference_protected(peers.root, \ + lockdep_is_held(&peers.lock)); \ + u != peer_avl_empty; ) { \ if (_daddr == u->v4daddr) \ break; \ if ((__force __u32)_daddr < (__force __u32)u->v4daddr) \ @@ -168,7 +172,8 @@ static void unlink_from_unused(struct inet_peer *p) else \ v = &u->avl_right; \ *stackptr++ = v; \ - u = *v; \ + u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ } \ u; \ }) @@ -209,13 +214,17 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) /* Called with local BH disabled and the pool lock held. */ #define lookup_rightempty(start) \ ({ \ - struct inet_peer *u, **v; \ + struct inet_peer *u; \ + struct inet_peer __rcu **v; \ *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ - for (u = *v; u->avl_right != peer_avl_empty; ) { \ + for (u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ + u->avl_right != peer_avl_empty_rcu; ) { \ v = &u->avl_right; \ *stackptr++ = v; \ - u = *v; \ + u = rcu_dereference_protected(*v, \ + lockdep_is_held(&peers.lock)); \ } \ u; \ }) @@ -224,74 +233,86 @@ static struct inet_peer *lookup_rcu_bh(__be32 daddr) * Variable names are the proof of operation correctness. * Look into mm/map_avl.c for more detail description of the ideas. */ -static void peer_avl_rebalance(struct inet_peer **stack[], - struct inet_peer ***stackend) +static void peer_avl_rebalance(struct inet_peer __rcu **stack[], + struct inet_peer __rcu ***stackend) { - struct inet_peer **nodep, *node, *l, *r; + struct inet_peer __rcu **nodep; + struct inet_peer *node, *l, *r; int lh, rh; while (stackend > stack) { nodep = *--stackend; - node = *nodep; - l = node->avl_left; - r = node->avl_right; + node = rcu_dereference_protected(*nodep, + lockdep_is_held(&peers.lock)); + l = rcu_dereference_protected(node->avl_left, + lockdep_is_held(&peers.lock)); + r = rcu_dereference_protected(node->avl_right, + lockdep_is_held(&peers.lock)); lh = node_height(l); rh = node_height(r); if (lh > rh + 1) { /* l: RH+2 */ struct inet_peer *ll, *lr, *lrl, *lrr; int lrh; - ll = l->avl_left; - lr = l->avl_right; + ll = rcu_dereference_protected(l->avl_left, + lockdep_is_held(&peers.lock)); + lr = rcu_dereference_protected(l->avl_right, + lockdep_is_held(&peers.lock)); lrh = node_height(lr); if (lrh <= node_height(ll)) { /* ll: RH+1 */ - node->avl_left = lr; /* lr: RH or RH+1 */ - node->avl_right = r; /* r: RH */ + RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = lrh + 1; /* RH+1 or RH+2 */ - l->avl_left = ll; /* ll: RH+1 */ - l->avl_right = node; /* node: RH+1 or RH+2 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH+1 */ + RCU_INIT_POINTER(l->avl_right, node); /* node: RH+1 or RH+2 */ l->avl_height = node->avl_height + 1; - *nodep = l; + RCU_INIT_POINTER(*nodep, l); } else { /* ll: RH, lr: RH+1 */ - lrl = lr->avl_left; /* lrl: RH or RH-1 */ - lrr = lr->avl_right; /* lrr: RH or RH-1 */ - node->avl_left = lrr; /* lrr: RH or RH-1 */ - node->avl_right = r; /* r: RH */ + lrl = rcu_dereference_protected(lr->avl_left, + lockdep_is_held(&peers.lock)); /* lrl: RH or RH-1 */ + lrr = rcu_dereference_protected(lr->avl_right, + lockdep_is_held(&peers.lock)); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ + RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = rh + 1; /* node: RH+1 */ - l->avl_left = ll; /* ll: RH */ - l->avl_right = lrl; /* lrl: RH or RH-1 */ + RCU_INIT_POINTER(l->avl_left, ll); /* ll: RH */ + RCU_INIT_POINTER(l->avl_right, lrl); /* lrl: RH or RH-1 */ l->avl_height = rh + 1; /* l: RH+1 */ - lr->avl_left = l; /* l: RH+1 */ - lr->avl_right = node; /* node: RH+1 */ + RCU_INIT_POINTER(lr->avl_left, l); /* l: RH+1 */ + RCU_INIT_POINTER(lr->avl_right, node); /* node: RH+1 */ lr->avl_height = rh + 2; - *nodep = lr; + RCU_INIT_POINTER(*nodep, lr); } } else if (rh > lh + 1) { /* r: LH+2 */ struct inet_peer *rr, *rl, *rlr, *rll; int rlh; - rr = r->avl_right; - rl = r->avl_left; + rr = rcu_dereference_protected(r->avl_right, + lockdep_is_held(&peers.lock)); + rl = rcu_dereference_protected(r->avl_left, + lockdep_is_held(&peers.lock)); rlh = node_height(rl); if (rlh <= node_height(rr)) { /* rr: LH+1 */ - node->avl_right = rl; /* rl: LH or LH+1 */ - node->avl_left = l; /* l: LH */ + RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = rlh + 1; /* LH+1 or LH+2 */ - r->avl_right = rr; /* rr: LH+1 */ - r->avl_left = node; /* node: LH+1 or LH+2 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH+1 */ + RCU_INIT_POINTER(r->avl_left, node); /* node: LH+1 or LH+2 */ r->avl_height = node->avl_height + 1; - *nodep = r; + RCU_INIT_POINTER(*nodep, r); } else { /* rr: RH, rl: RH+1 */ - rlr = rl->avl_right; /* rlr: LH or LH-1 */ - rll = rl->avl_left; /* rll: LH or LH-1 */ - node->avl_right = rll; /* rll: LH or LH-1 */ - node->avl_left = l; /* l: LH */ + rlr = rcu_dereference_protected(rl->avl_right, + lockdep_is_held(&peers.lock)); /* rlr: LH or LH-1 */ + rll = rcu_dereference_protected(rl->avl_left, + lockdep_is_held(&peers.lock)); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ + RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = lh + 1; /* node: LH+1 */ - r->avl_right = rr; /* rr: LH */ - r->avl_left = rlr; /* rlr: LH or LH-1 */ + RCU_INIT_POINTER(r->avl_right, rr); /* rr: LH */ + RCU_INIT_POINTER(r->avl_left, rlr); /* rlr: LH or LH-1 */ r->avl_height = lh + 1; /* r: LH+1 */ - rl->avl_right = r; /* r: LH+1 */ - rl->avl_left = node; /* node: LH+1 */ + RCU_INIT_POINTER(rl->avl_right, r); /* r: LH+1 */ + RCU_INIT_POINTER(rl->avl_left, node); /* node: LH+1 */ rl->avl_height = lh + 2; - *nodep = rl; + RCU_INIT_POINTER(*nodep, rl); } } else { node->avl_height = (lh > rh ? lh : rh) + 1; @@ -303,10 +324,10 @@ static void peer_avl_rebalance(struct inet_peer **stack[], #define link_to_pool(n) \ do { \ n->avl_height = 1; \ - n->avl_left = peer_avl_empty; \ - n->avl_right = peer_avl_empty; \ - smp_wmb(); /* lockless readers can catch us now */ \ - **--stackptr = n; \ + n->avl_left = peer_avl_empty_rcu; \ + n->avl_right = peer_avl_empty_rcu; \ + /* lockless readers can catch us now */ \ + rcu_assign_pointer(**--stackptr, n); \ peer_avl_rebalance(stack, stackptr); \ } while (0) @@ -330,24 +351,25 @@ static void unlink_from_pool(struct inet_peer *p) * We use refcnt=-1 to alert lockless readers this entry is deleted. */ if (atomic_cmpxchg(&p->refcnt, 1, -1) == 1) { - struct inet_peer **stack[PEER_MAXDEPTH]; - struct inet_peer ***stackptr, ***delp; + struct inet_peer __rcu **stack[PEER_MAXDEPTH]; + struct inet_peer __rcu ***stackptr, ***delp; if (lookup(p->v4daddr, stack) != p) BUG(); delp = stackptr - 1; /* *delp[0] == p */ - if (p->avl_left == peer_avl_empty) { + if (p->avl_left == peer_avl_empty_rcu) { *delp[0] = p->avl_right; --stackptr; } else { /* look for a node to insert instead of p */ struct inet_peer *t; t = lookup_rightempty(p); - BUG_ON(*stackptr[-1] != t); + BUG_ON(rcu_dereference_protected(*stackptr[-1], + lockdep_is_held(&peers.lock)) != t); **--stackptr = t->avl_left; /* t is removed, t->v4daddr > x->v4daddr for any * x in p->avl_left subtree. * Put t in the old place of p. */ - *delp[0] = t; + RCU_INIT_POINTER(*delp[0], t); t->avl_left = p->avl_left; t->avl_right = p->avl_right; t->avl_height = p->avl_height; @@ -414,7 +436,7 @@ static int cleanup_once(unsigned long ttl) struct inet_peer *inet_getpeer(__be32 daddr, int create) { struct inet_peer *p; - struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; + struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; /* Look up for the address quickly, lockless. * Because of a concurrent writer, we might not find an existing entry. diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d0ffcbe369b7..70ff77f02eee 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1072,6 +1072,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } ipgre_tunnel_unlink(ign, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; @@ -1324,7 +1325,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id); tunnel->dev = dev; strcpy(tunnel->parms.name, dev->name); @@ -1335,7 +1335,6 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) tunnel->hlen = sizeof(struct iphdr) + 4; dev_hold(dev); - rcu_assign_pointer(ign->tunnels_wc[0], tunnel); } @@ -1382,10 +1381,12 @@ static int __net_init ipgre_init_net(struct net *net) if ((err = register_netdev(ign->fb_tunnel_dev))) goto err_reg_dev; + rcu_assign_pointer(ign->tunnels_wc[0], + netdev_priv(ign->fb_tunnel_dev)); return 0; err_reg_dev: - free_netdev(ign->fb_tunnel_dev); + ipgre_dev_free(ign->fb_tunnel_dev); err_alloc_dev: return err; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 64b70ad162e3..3948c86e59ca 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -238,7 +238,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) but receiver should be enough clever f.e. to forward mtrace requests, sent to multicast group to reach destination designated router. */ -struct ip_ra_chain *ip_ra_chain; +struct ip_ra_chain __rcu *ip_ra_chain; static DEFINE_SPINLOCK(ip_ra_lock); @@ -253,7 +253,8 @@ static void ip_ra_destroy_rcu(struct rcu_head *head) int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) { - struct ip_ra_chain *ra, *new_ra, **rap; + struct ip_ra_chain *ra, *new_ra; + struct ip_ra_chain __rcu **rap; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; @@ -261,7 +262,10 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; spin_lock_bh(&ip_ra_lock); - for (rap = &ip_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { + for (rap = &ip_ra_chain; + (ra = rcu_dereference_protected(*rap, + lockdep_is_held(&ip_ra_lock))) != NULL; + rap = &ra->next) { if (ra->sk == sk) { if (on) { spin_unlock_bh(&ip_ra_lock); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e9b816e6cd73..cd300aaee78f 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -676,6 +676,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) } t = netdev_priv(dev); ipip_tunnel_unlink(ipn, t); + synchronize_net(); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; memcpy(dev->dev_addr, &p.iph.saddr, 4); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3cad2591ace0..3fac340a28d5 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -927,6 +927,7 @@ static int get_info(struct net *net, void __user *user, private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d31b007a6d80..a846d633b3b6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1124,6 +1124,7 @@ static int get_info(struct net *net, void __user *user, private = &tmp; } #endif + memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 295c97431e43..c04787ce1a71 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -47,26 +47,6 @@ __nf_nat_proto_find(u_int8_t protonum) return rcu_dereference(nf_nat_protos[protonum]); } -static const struct nf_nat_protocol * -nf_nat_proto_find_get(u_int8_t protonum) -{ - const struct nf_nat_protocol *p; - - rcu_read_lock(); - p = __nf_nat_proto_find(protonum); - if (!try_module_get(p->me)) - p = &nf_nat_unknown_protocol; - rcu_read_unlock(); - - return p; -} - -static void -nf_nat_proto_put(const struct nf_nat_protocol *p) -{ - module_put(p->me); -} - /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int hash_by_src(const struct net *net, u16 zone, @@ -588,6 +568,26 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> +static const struct nf_nat_protocol * +nf_nat_proto_find_get(u_int8_t protonum) +{ + const struct nf_nat_protocol *p; + + rcu_read_lock(); + p = __nf_nat_proto_find(protonum); + if (!try_module_get(p->me)) + p = &nf_nat_unknown_protocol; + rcu_read_unlock(); + + return p; +} + +static void +nf_nat_proto_put(const struct nf_nat_protocol *p) +{ + module_put(p->me); +} + static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4ae1f203f7cb..b14ec7d03b6e 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,13 +59,13 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) local_bh_enable(); socket_seq_show(seq); - seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", + seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, tcp_death_row.tw_count, sockets, - atomic_read(&tcp_memory_allocated)); - seq_printf(seq, "UDP: inuse %d mem %d\n", + atomic_long_read(&tcp_memory_allocated)); + seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), - atomic_read(&udp_memory_allocated)); + atomic_long_read(&udp_memory_allocated)); seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", @@ -253,6 +253,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP), SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER), + SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 65699c24411c..9ae5c01cd0b2 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,7 +28,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly; +const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; /* * Add a protocol handler to the hash tables @@ -38,7 +38,8 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { int hash = protocol & (MAX_INET_PROTOS - 1); - return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1; + return !cmpxchg((const struct net_protocol **)&inet_protos[hash], + NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); @@ -50,7 +51,8 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1; + ret = (cmpxchg((const struct net_protocol **)&inet_protos[hash], + prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d6cb2bfcd8e1..987bf9adb318 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -198,7 +198,7 @@ const __u8 ip_tos2prio[16] = { */ struct rt_hash_bucket { - struct rtable *chain; + struct rtable __rcu *chain; }; #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ @@ -280,7 +280,7 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) struct rtable *r = NULL; for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { - if (!rt_hash_table[st->bucket].chain) + if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) continue; rcu_read_lock_bh(); r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); @@ -300,17 +300,17 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, { struct rt_cache_iter_state *st = seq->private; - r = r->dst.rt_next; + r = rcu_dereference_bh(r->dst.rt_next); while (!r) { rcu_read_unlock_bh(); do { if (--st->bucket < 0) return NULL; - } while (!rt_hash_table[st->bucket].chain); + } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); rcu_read_lock_bh(); - r = rt_hash_table[st->bucket].chain; + r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); } - return rcu_dereference_bh(r); + return r; } static struct rtable *rt_cache_get_next(struct seq_file *seq, @@ -721,19 +721,23 @@ static void rt_do_flush(int process_context) for (i = 0; i <= rt_hash_mask; i++) { if (process_context && need_resched()) cond_resched(); - rth = rt_hash_table[i].chain; + rth = rcu_dereference_raw(rt_hash_table[i].chain); if (!rth) continue; spin_lock_bh(rt_hash_lock_addr(i)); #ifdef CONFIG_NET_NS { - struct rtable ** prev, * p; + struct rtable __rcu **prev; + struct rtable *p; - rth = rt_hash_table[i].chain; + rth = rcu_dereference_protected(rt_hash_table[i].chain, + lockdep_is_held(rt_hash_lock_addr(i))); /* defer releasing the head of the list after spin_unlock */ - for (tail = rth; tail; tail = tail->dst.rt_next) + for (tail = rth; tail; + tail = rcu_dereference_protected(tail->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i)))) if (!rt_is_expired(tail)) break; if (rth != tail) @@ -741,8 +745,12 @@ static void rt_do_flush(int process_context) /* call rt_free on entries after the tail requiring flush */ prev = &rt_hash_table[i].chain; - for (p = *prev; p; p = next) { - next = p->dst.rt_next; + for (p = rcu_dereference_protected(*prev, + lockdep_is_held(rt_hash_lock_addr(i))); + p != NULL; + p = next) { + next = rcu_dereference_protected(p->dst.rt_next, + lockdep_is_held(rt_hash_lock_addr(i))); if (!rt_is_expired(p)) { prev = &p->dst.rt_next; } else { @@ -752,14 +760,15 @@ static void rt_do_flush(int process_context) } } #else - rth = rt_hash_table[i].chain; - rt_hash_table[i].chain = NULL; + rth = rcu_dereference_protected(rt_hash_table[i].chain, + lockdep_is_held(rt_hash_lock_addr(i))); + rcu_assign_pointer(rt_hash_table[i].chain, NULL); tail = NULL; #endif spin_unlock_bh(rt_hash_lock_addr(i)); for (; rth != tail; rth = next) { - next = rth->dst.rt_next; + next = rcu_dereference_protected(rth->dst.rt_next, 1); rt_free(rth); } } @@ -790,7 +799,7 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) while (aux != rth) { if (compare_hash_inputs(&aux->fl, &rth->fl)) return 0; - aux = aux->dst.rt_next; + aux = rcu_dereference_protected(aux->dst.rt_next, 1); } return ONE; } @@ -799,7 +808,8 @@ static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; unsigned long delta; @@ -825,11 +835,12 @@ static void rt_check_expire(void) samples++; - if (*rthp == NULL) + if (rcu_dereference_raw(*rthp) == NULL) continue; length = 0; spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { prefetch(rth->dst.rt_next); if (rt_is_expired(rth)) { *rthp = rth->dst.rt_next; @@ -941,7 +952,8 @@ static int rt_garbage_collect(struct dst_ops *ops) static unsigned long last_gc; static int rover; static int equilibrium; - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; unsigned long now = jiffies; int goal; int entries = dst_entries_get_fast(&ipv4_dst_ops); @@ -995,7 +1007,8 @@ static int rt_garbage_collect(struct dst_ops *ops) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; spin_lock_bh(rt_hash_lock_addr(k)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) { if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -1071,7 +1084,7 @@ static int slow_chain_length(const struct rtable *head) while (rth) { length += has_noalias(head, rth); - rth = rth->dst.rt_next; + rth = rcu_dereference_protected(rth->dst.rt_next, 1); } return length >> FRACT_BITS; } @@ -1079,9 +1092,9 @@ static int slow_chain_length(const struct rtable *head) static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp, struct sk_buff *skb, int ifindex) { - struct rtable *rth, **rthp; + struct rtable *rth, *cand; + struct rtable __rcu **rthp, **candp; unsigned long now; - struct rtable *cand, **candp; u32 min_score; int chain_length; int attempts = !in_softirq(); @@ -1128,7 +1141,8 @@ restart: rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); - while ((rth = *rthp) != NULL) { + while ((rth = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (rt_is_expired(rth)) { *rthp = rth->dst.rt_next; rt_free(rth); @@ -1324,12 +1338,14 @@ EXPORT_SYMBOL(__ip_select_ident); static void rt_del(unsigned hash, struct rtable *rt) { - struct rtable **rthp, *aux; + struct rtable __rcu **rthp; + struct rtable *aux; rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); - while ((aux = *rthp) != NULL) { + while ((aux = rcu_dereference_protected(*rthp, + lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { if (aux == rt || rt_is_expired(aux)) { *rthp = aux->dst.rt_next; rt_free(aux); @@ -1346,7 +1362,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, { int i, k; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rth, **rthp; + struct rtable *rth; + struct rtable __rcu **rthp; __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; struct netevent_redirect netevent; @@ -1379,7 +1396,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], rt_genid(net)); - rthp=&rt_hash_table[hash].chain; + rthp = &rt_hash_table[hash].chain; while ((rth = rcu_dereference(*rthp)) != NULL) { struct rtable *rt; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d96c1da4b17c..1b4ec21497a4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -26,6 +26,8 @@ static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; +static int tcp_adv_win_scale_min = -31; +static int tcp_adv_win_scale_max = 31; /* Update system visible IP port range */ static void set_local_port_range(int range[2]) @@ -398,7 +400,7 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_mem, .maxlen = sizeof(sysctl_tcp_mem), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_doulongvec_minmax }, { .procname = "tcp_wmem", @@ -426,7 +428,9 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_adv_win_scale_min, + .extra2 = &tcp_adv_win_scale_max, }, { .procname = "tcp_tw_reuse", @@ -602,8 +606,7 @@ static struct ctl_table ipv4_table[] = { .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero + .proc_handler = proc_doulongvec_minmax, }, { .procname = "udp_rmem_min", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1664a0590bb8..f15c36a706ec 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,7 +282,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); -int sysctl_tcp_mem[3] __read_mostly; +long sysctl_tcp_mem[3] __read_mostly; int sysctl_tcp_wmem[3] __read_mostly; int sysctl_tcp_rmem[3] __read_mostly; @@ -290,7 +290,7 @@ EXPORT_SYMBOL(sysctl_tcp_mem); EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); -atomic_t tcp_memory_allocated; /* Current allocated memory. */ +atomic_long_t tcp_memory_allocated; /* Current allocated memory. */ EXPORT_SYMBOL(tcp_memory_allocated); /* @@ -2246,7 +2246,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ - if (val < 8 || val > MAX_TCP_WINDOW) { + if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { err = -EINVAL; break; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3357f69e353d..6d8ab1c4efc3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -259,8 +259,11 @@ static void tcp_fixup_sndbuf(struct sock *sk) int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); - if (sk->sk_sndbuf < 3 * sndmem) - sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); + if (sk->sk_sndbuf < 3 * sndmem) { + sk->sk_sndbuf = 3 * sndmem; + if (sk->sk_sndbuf > sysctl_tcp_wmem[2]) + sk->sk_sndbuf = sysctl_tcp_wmem[2]; + } } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -396,7 +399,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); } @@ -4861,7 +4864,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk) return 0; /* If we are under soft global TCP memory pressure, do not expand. */ - if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) + if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) return 0; /* If we filled the congestion window, do not expand. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8f8527d41682..e13da6de1fc7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -415,6 +415,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) !icsk->icsk_backoff) break; + if (sock_owned_by_user(sk)) + break; + icsk->icsk_backoff--; inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << icsk->icsk_backoff; @@ -429,11 +432,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); - } else if (sock_owned_by_user(sk)) { - /* RTO revert clocked out retransmission, - * but socket is locked. Will defer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - HZ/20, TCP_RTO_MAX); } else { /* RTO revert clocked out retransmission. * Will retransmit now */ @@ -2045,7 +2043,9 @@ get_req: } get_sk: sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) { + if (!net_eq(sock_net(sk), net)) + continue; + if (sk->sk_family == st->family) { cur = sk; goto out; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 43cf901d7659..a66735f75963 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -347,7 +347,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * socket up. We've got bigger problems than * non-graceful socket closings. */ - LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 05b1ecf36763..61c2463e2753 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -231,11 +231,10 @@ void tcp_select_initial_window(int __space, __u32 mss, /* when initializing use the value from init_rcv_wnd * rather than the default from above */ - if (init_rcv_wnd && - (*rcv_wnd > init_rcv_wnd * mss)) - *rcv_wnd = init_rcv_wnd * mss; - else if (*rcv_wnd > init_cwnd * mss) - *rcv_wnd = init_cwnd * mss; + if (init_rcv_wnd) + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); + else + *rcv_wnd = min(*rcv_wnd, init_cwnd * mss); } /* Set the clamp no higher than max representable value */ @@ -386,27 +385,30 @@ struct tcp_out_options { */ static u8 tcp_cookie_size_check(u8 desired) { - if (desired > 0) { + int cookie_size; + + if (desired > 0) /* previously specified */ return desired; - } - if (sysctl_tcp_cookie_size <= 0) { + + cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); + if (cookie_size <= 0) /* no default specified */ return 0; - } - if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { + + if (cookie_size <= TCP_COOKIE_MIN) /* value too small, specify minimum */ return TCP_COOKIE_MIN; - } - if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { + + if (cookie_size >= TCP_COOKIE_MAX) /* value too large, specify maximum */ return TCP_COOKIE_MAX; - } - if (0x1 & sysctl_tcp_cookie_size) { + + if (cookie_size & 1) /* 8-bit multiple, illegal, fix it */ - return (u8)(sysctl_tcp_cookie_size + 0x1); - } - return (u8)sysctl_tcp_cookie_size; + cookie_size++; + + return (u8)cookie_size; } /* Write previously computed TCP options to the packet. @@ -1513,6 +1515,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; + int win_divisor; if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) goto send_now; @@ -1544,13 +1547,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len)) goto send_now; - if (sysctl_tcp_tso_win_divisor) { + win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor); + if (win_divisor) { u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. */ - chunk /= sysctl_tcp_tso_win_divisor; + chunk /= win_divisor; if (limit >= chunk) goto send_now; } else { diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 9a17bd2a0a37..ac3b3ee4b07c 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -14,27 +14,32 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm_tunnel *tunnel4_handlers __read_mostly; -static struct xfrm_tunnel *tunnel64_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); -static inline struct xfrm_tunnel **fam_handlers(unsigned short family) +static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : &tunnel64_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { - struct xfrm_tunnel **pprev; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; + int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); - for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { - if ((*pprev)->priority > priority) + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t->priority > priority) break; - if ((*pprev)->priority == priority) + if (t->priority == priority) goto err; } @@ -52,13 +57,17 @@ EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { - struct xfrm_tunnel **pprev; + struct xfrm_tunnel __rcu **pprev; + struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); - for (pprev = fam_handlers(family); *pprev; pprev = &(*pprev)->next) { - if (*pprev == handler) { + for (pprev = fam_handlers(family); + (t = rcu_dereference_protected(*pprev, + lockdep_is_held(&tunnel4_mutex))) != NULL; + pprev = &t->next) { + if (t == handler) { *pprev = handler->next; ret = 0; break; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b3f7e8cf18ac..5e0a3a582a59 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -110,7 +110,7 @@ struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); -int sysctl_udp_mem[3] __read_mostly; +long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); int sysctl_udp_rmem_min __read_mostly; @@ -119,7 +119,7 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min); int sysctl_udp_wmem_min __read_mostly; EXPORT_SYMBOL(sysctl_udp_wmem_min); -atomic_t udp_memory_allocated; +atomic_long_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 @@ -1413,7 +1413,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (sk->sk_filter) { + if (rcu_dereference_raw(sk->sk_filter)) { if (udp_lib_checksum_complete(skb)) goto drop; } |