summary refs log tree commit diff stats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/802/psnap.c2
-rw-r--r--net/8021q/vlan_core.c5
-rw-r--r--net/Kconfig4
-rw-r--r--net/batman-adv/bat_iv_ogm.c4
-rw-r--r--net/batman-adv/distributed-arp-table.c2
-rw-r--r--net/batman-adv/main.h2
-rw-r--r--net/batman-adv/translation-table.c8
-rw-r--r--net/bluetooth/a2mp.h10
-rw-r--r--net/bluetooth/bnep/bnep.h6
-rw-r--r--net/bluetooth/hci_conn.c146
-rw-r--r--net/bluetooth/hci_core.c143
-rw-r--r--net/bluetooth/hci_event.c114
-rw-r--r--net/bluetooth/hci_request.c350
-rw-r--r--net/bluetooth/hci_request.h2
-rw-r--r--net/bluetooth/hidp/core.c2
-rw-r--r--net/bluetooth/l2cap_core.c734
-rw-r--r--net/bluetooth/l2cap_sock.c67
-rw-r--r--net/bluetooth/mgmt.c113
-rw-r--r--net/bluetooth/rfcomm/core.c13
-rw-r--r--net/bluetooth/rfcomm/tty.c4
-rw-r--r--net/bluetooth/sco.c13
-rw-r--r--net/bluetooth/smp.c29
-rw-r--r--net/bpf/test_run.c76
-rw-r--r--net/bpfilter/main.c13
-rw-r--r--net/bridge/br_device.c6
-rw-r--r--net/bridge/br_netlink_tunnel.c12
-rw-r--r--net/bridge/br_private.h4
-rw-r--r--net/bridge/br_private_tunnel.h17
-rw-r--r--net/bridge/br_stp.c3
-rw-r--r--net/bridge/br_vlan.c76
-rw-r--r--net/bridge/br_vlan_options.c112
-rw-r--r--net/bridge/br_vlan_tunnel.c5
-rw-r--r--net/bridge/netfilter/ebtables.c2
-rw-r--r--net/caif/caif_dev.c3
-rw-r--r--net/ceph/messenger.c9
-rw-r--r--net/ceph/osd_client.c14
-rw-r--r--net/ceph/osdmap.c9
-rw-r--r--net/compat.c30
-rw-r--r--net/core/bpf_sk_storage.c285
-rw-r--r--net/core/datagram.c39
-rw-r--r--net/core/dev.c91
-rw-r--r--net/core/dev_ioctl.c6
-rw-r--r--net/core/devlink.c1338
-rw-r--r--net/core/drop_monitor.c35
-rw-r--r--net/core/fib_rules.c2
-rw-r--r--net/core/filter.c234
-rw-r--r--net/core/flow_dissector.c4
-rw-r--r--net/core/flow_offload.c34
-rw-r--r--net/core/lwt_bpf.c2
-rw-r--r--net/core/lwtunnel.c6
-rw-r--r--net/core/neighbour.c3
-rw-r--r--net/core/net-sysfs.c133
-rw-r--r--net/core/net-sysfs.h2
-rw-r--r--net/core/net_namespace.c15
-rw-r--r--net/core/netclassid_cgroup.c47
-rw-r--r--net/core/page_pool.c100
-rw-r--r--net/core/pktgen.c6
-rw-r--r--net/core/rtnetlink.c58
-rw-r--r--net/core/skbuff.c30
-rw-r--r--net/core/skmsg.c10
-rw-r--r--net/core/sock.c31
-rw-r--r--net/core/sock_map.c318
-rw-r--r--net/core/sock_reuseport.c50
-rw-r--r--net/core/xdp.c2
-rw-r--r--net/dccp/ccid.h2
-rw-r--r--net/dccp/diag.c9
-rw-r--r--net/dccp/minisocks.c1
-rw-r--r--net/decnet/dn_route.c4
-rw-r--r--net/dsa/dsa.c6
-rw-r--r--net/dsa/dsa2.c2
-rw-r--r--net/dsa/dsa_priv.h17
-rw-r--r--net/dsa/master.c21
-rw-r--r--net/dsa/port.c71
-rw-r--r--net/dsa/slave.c437
-rw-r--r--net/dsa/switch.c37
-rw-r--r--net/dsa/tag_8021q.c43
-rw-r--r--net/dsa/tag_ar9331.c2
-rw-r--r--net/dsa/tag_brcm.c25
-rw-r--r--net/dsa/tag_ocelot.c3
-rw-r--r--net/dsa/tag_qca.c2
-rw-r--r--net/dsa/tag_sja1105.c19
-rw-r--r--net/ethtool/Makefile3
-rw-r--r--net/ethtool/bitset.c100
-rw-r--r--net/ethtool/bitset.h6
-rw-r--r--net/ethtool/channels.c227
-rw-r--r--net/ethtool/coalesce.c353
-rw-r--r--net/ethtool/common.c114
-rw-r--r--net/ethtool/common.h9
-rw-r--r--net/ethtool/debug.c10
-rw-r--r--net/ethtool/eee.c206
-rw-r--r--net/ethtool/features.c304
-rw-r--r--net/ethtool/ioctl.c195
-rw-r--r--net/ethtool/linkinfo.c10
-rw-r--r--net/ethtool/linkmodes.c11
-rw-r--r--net/ethtool/netlink.c168
-rw-r--r--net/ethtool/netlink.h22
-rw-r--r--net/ethtool/pause.c145
-rw-r--r--net/ethtool/privflags.c211
-rw-r--r--net/ethtool/rings.c200
-rw-r--r--net/ethtool/strset.c15
-rw-r--r--net/ethtool/tsinfo.c143
-rw-r--r--net/ethtool/wol.c9
-rw-r--r--net/hsr/hsr_debugfs.c5
-rw-r--r--net/hsr/hsr_device.c64
-rw-r--r--net/hsr/hsr_device.h3
-rw-r--r--net/hsr/hsr_framereg.c15
-rw-r--r--net/hsr/hsr_main.c3
-rw-r--r--net/hsr/hsr_main.h1
-rw-r--r--net/hsr/hsr_netlink.c119
-rw-r--r--net/hsr/hsr_slave.c71
-rw-r--r--net/hsr/hsr_slave.h2
-rw-r--r--net/ieee802154/nl_policy.c6
-rw-r--r--net/ipv4/Kconfig7
-rw-r--r--net/ipv4/Makefile1
-rw-r--r--net/ipv4/af_inet.c8
-rw-r--r--net/ipv4/ah4.c2
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/bpf_tcp_ca.c40
-rw-r--r--net/ipv4/cipso_ipv4.c7
-rw-r--r--net/ipv4/devinet.c6
-rw-r--r--net/ipv4/esp4.c16
-rw-r--r--net/ipv4/esp4_offload.c32
-rw-r--r--net/ipv4/fib_frontend.c2
-rw-r--r--net/ipv4/fib_lookup.h2
-rw-r--r--net/ipv4/fib_semantics.c26
-rw-r--r--net/ipv4/fib_trie.c10
-rw-r--r--net/ipv4/gre_demux.c12
-rw-r--r--net/ipv4/icmp.c35
-rw-r--r--net/ipv4/igmp.c2
-rw-r--r--net/ipv4/inet_connection_sock.c56
-rw-r--r--net/ipv4/inet_diag.c343
-rw-r--r--net/ipv4/ip_gre.c105
-rw-r--r--net/ipv4/ip_input.c3
-rw-r--r--net/ipv4/ip_output.c4
-rw-r--r--net/ipv4/ip_tunnel.c6
-rw-r--r--net/ipv4/ip_tunnel_core.c4
-rw-r--r--net/ipv4/ip_vti.c38
-rw-r--r--net/ipv4/ipconfig.c2
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/netfilter/arp_tables.c4
-rw-r--r--net/ipv4/netfilter/ip_tables.c4
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c4
-rw-r--r--net/ipv4/nexthop.c2
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/raw.c2
-rw-r--r--net/ipv4/raw_diag.c29
-rw-r--r--net/ipv4/route.c61
-rw-r--r--net/ipv4/sysctl_net_ipv4.c33
-rw-r--r--net/ipv4/tcp.c33
-rw-r--r--net/ipv4/tcp_bic.c11
-rw-r--r--net/ipv4/tcp_bpf.c272
-rw-r--r--net/ipv4/tcp_diag.c8
-rw-r--r--net/ipv4/tcp_input.c12
-rw-r--r--net/ipv4/tcp_ipv4.c10
-rw-r--r--net/ipv4/tcp_minisocks.c9
-rw-r--r--net/ipv4/tcp_output.c12
-rw-r--r--net/ipv4/tcp_scalable.c17
-rw-r--r--net/ipv4/tcp_ulp.c9
-rw-r--r--net/ipv4/tcp_veno.c47
-rw-r--r--net/ipv4/tcp_yeah.c41
-rw-r--r--net/ipv4/udp.c30
-rw-r--r--net/ipv4/udp_bpf.c53
-rw-r--r--net/ipv4/udp_diag.c46
-rw-r--r--net/ipv4/udp_offload.c1
-rw-r--r--net/ipv6/Kconfig10
-rw-r--r--net/ipv6/Makefile3
-rw-r--r--net/ipv6/addrconf.c120
-rw-r--r--net/ipv6/af_inet6.c7
-rw-r--r--net/ipv6/ah6.c4
-rw-r--r--net/ipv6/esp6.c16
-rw-r--r--net/ipv6/esp6_offload.c36
-rw-r--r--net/ipv6/exthdrs.c203
-rw-r--r--net/ipv6/icmp.c2
-rw-r--r--net/ipv6/ila/ila_lwt.c2
-rw-r--r--net/ipv6/ip6_fib.c15
-rw-r--r--net/ipv6/ip6_gre.c8
-rw-r--r--net/ipv6/ip6_icmp.c34
-rw-r--r--net/ipv6/ip6_input.c3
-rw-r--r--net/ipv6/ip6_output.c70
-rw-r--r--net/ipv6/ip6_tunnel.c81
-rw-r--r--net/ipv6/ip6_vti.c34
-rw-r--r--net/ipv6/ip6mr.c5
-rw-r--r--net/ipv6/ipv6_sockglue.c10
-rw-r--r--net/ipv6/ndisc.c3
-rw-r--r--net/ipv6/netfilter/ip6_tables.c4
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c2
-rw-r--r--net/ipv6/raw.c8
-rw-r--r--net/ipv6/route.c11
-rw-r--r--net/ipv6/rpl.c123
-rw-r--r--net/ipv6/rpl_iptunnel.c382
-rw-r--r--net/ipv6/seg6_iptunnel.c6
-rw-r--r--net/ipv6/seg6_local.c7
-rw-r--r--net/ipv6/sysctl_net_ipv6.c21
-rw-r--r--net/ipv6/tcp_ipv6.c2
-rw-r--r--net/ipv6/udp.c9
-rw-r--r--net/ipv6/xfrm6_tunnel.c2
-rw-r--r--net/kcm/kcmproc.c2
-rw-r--r--net/kcm/kcmsock.c4
-rw-r--r--net/l2tp/l2tp_core.h2
-rw-r--r--net/llc/af_llc.c2
-rw-r--r--net/llc/llc_proc.c2
-rw-r--r--net/mac80211/aes_cmac.c21
-rw-r--r--net/mac80211/aes_gmac.c24
-rw-r--r--net/mac80211/cfg.c195
-rw-r--r--net/mac80211/debugfs.c56
-rw-r--r--net/mac80211/debugfs_key.c31
-rw-r--r--net/mac80211/debugfs_key.h10
-rw-r--r--net/mac80211/debugfs_netdev.c13
-rw-r--r--net/mac80211/debugfs_sta.c9
-rw-r--r--net/mac80211/driver-ops.h27
-rw-r--r--net/mac80211/he.c4
-rw-r--r--net/mac80211/ht.c64
-rw-r--r--net/mac80211/ieee80211_i.h32
-rw-r--r--net/mac80211/iface.c86
-rw-r--r--net/mac80211/key.c79
-rw-r--r--net/mac80211/key.h3
-rw-r--r--net/mac80211/main.c39
-rw-r--r--net/mac80211/mesh_hwmp.c3
-rw-r--r--net/mac80211/mlme.c154
-rw-r--r--net/mac80211/rx.c98
-rw-r--r--net/mac80211/scan.c3
-rw-r--r--net/mac80211/sta_info.c56
-rw-r--r--net/mac80211/sta_info.h5
-rw-r--r--net/mac80211/status.c91
-rw-r--r--net/mac80211/tx.c246
-rw-r--r--net/mac80211/util.c121
-rw-r--r--net/mac80211/vht.c58
-rw-r--r--net/mac80211/wep.c4
-rw-r--r--net/mac80211/wep.h2
-rw-r--r--net/mpls/internal.h4
-rw-r--r--net/mpls/mpls_iptunnel.c2
-rw-r--r--net/mptcp/Kconfig1
-rw-r--r--net/mptcp/Makefile3
-rw-r--r--net/mptcp/crypto.c17
-rw-r--r--net/mptcp/diag.c104
-rw-r--r--net/mptcp/mib.c69
-rw-r--r--net/mptcp/mib.h40
-rw-r--r--net/mptcp/options.c542
-rw-r--r--net/mptcp/pm.c242
-rw-r--r--net/mptcp/pm_netlink.c857
-rw-r--r--net/mptcp/protocol.c1108
-rw-r--r--net/mptcp/protocol.h208
-rw-r--r--net/mptcp/subflow.c403
-rw-r--r--net/mptcp/token.c38
-rw-r--r--net/netfilter/Kconfig8
-rw-r--r--net/netfilter/Makefile13
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c2
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c2
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c2
-rw-r--r--net/netfilter/ipset/ip_set_core.c34
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h639
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c45
-rw-r--r--net/netfilter/nf_conntrack_core.c211
-rw-r--r--net/netfilter/nf_conntrack_netlink.c3
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c20
-rw-r--r--net/netfilter/nf_conntrack_standalone.c21
-rw-r--r--net/netfilter/nf_flow_table_core.c62
-rw-r--r--net/netfilter/nf_flow_table_ip.c36
-rw-r--r--net/netfilter/nf_flow_table_offload.c337
-rw-r--r--net/netfilter/nf_queue.c96
-rw-r--r--net/netfilter/nf_synproxy_core.c2
-rw-r--r--net/netfilter/nf_tables_api.c265
-rw-r--r--net/netfilter/nf_tables_offload.c2
-rw-r--r--net/netfilter/nf_tables_set_core.c31
-rw-r--r--net/netfilter/nfnetlink_acct.c2
-rw-r--r--net/netfilter/nfnetlink_cthelper.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c10
-rw-r--r--net/netfilter/nft_bitwise.c14
-rw-r--r--net/netfilter/nft_chain_nat.c1
-rw-r--r--net/netfilter/nft_dynset.c45
-rw-r--r--net/netfilter/nft_exthdr.c8
-rw-r--r--net/netfilter/nft_fwd_netdev.c12
-rw-r--r--net/netfilter/nft_lookup.c1
-rw-r--r--net/netfilter/nft_payload.c1
-rw-r--r--net/netfilter/nft_set_bitmap.c6
-rw-r--r--net/netfilter/nft_set_hash.c9
-rw-r--r--net/netfilter/nft_set_pipapo.c683
-rw-r--r--net/netfilter/nft_set_pipapo.h280
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c1223
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.h14
-rw-r--r--net/netfilter/nft_set_rbtree.c90
-rw-r--r--net/netfilter/nft_tunnel.c112
-rw-r--r--net/netfilter/x_tables.c6
-rw-r--r--net/netfilter/xt_IDLETIMER.c248
-rw-r--r--net/netfilter/xt_SECMARK.c2
-rw-r--r--net/netfilter/xt_hashlimit.c40
-rw-r--r--net/netfilter/xt_recent.c6
-rw-r--r--net/netlabel/netlabel_domainhash.c3
-rw-r--r--net/netlabel/netlabel_unlabeled.c3
-rw-r--r--net/netlink/af_netlink.c51
-rw-r--r--net/netlink/genetlink.c5
-rw-r--r--net/netrom/af_netrom.c2
-rw-r--r--net/netrom/nr_route.c4
-rw-r--r--net/nfc/digital_dep.c4
-rw-r--r--net/nfc/hci/core.c19
-rw-r--r--net/nfc/netlink.c4
-rw-r--r--net/openvswitch/actions.c67
-rw-r--r--net/openvswitch/datapath.c12
-rw-r--r--net/openvswitch/flow_netlink.c88
-rw-r--r--net/openvswitch/flow_table.c6
-rw-r--r--net/openvswitch/meter.c3
-rw-r--r--net/openvswitch/vport.c3
-rw-r--r--net/packet/af_packet.c34
-rw-r--r--net/packet/internal.h5
-rw-r--r--net/qrtr/Makefile2
-rw-r--r--net/qrtr/ns.c757
-rw-r--r--net/qrtr/qrtr.c43
-rw-r--r--net/qrtr/qrtr.h4
-rw-r--r--net/rds/rdma.c24
-rw-r--r--net/rxrpc/af_rxrpc.c37
-rw-r--r--net/rxrpc/ar-internal.h5
-rw-r--r--net/rxrpc/call_object.c3
-rw-r--r--net/rxrpc/conn_client.c13
-rw-r--r--net/rxrpc/input.c1
-rw-r--r--net/rxrpc/sendmsg.c75
-rw-r--r--net/sched/Kconfig2
-rw-r--r--net/sched/act_api.c44
-rw-r--r--net/sched/act_bpf.c3
-rw-r--r--net/sched/act_ct.c571
-rw-r--r--net/sched/act_mirred.c6
-rw-r--r--net/sched/act_pedit.c11
-rw-r--r--net/sched/act_sample.c2
-rw-r--r--net/sched/act_skbedit.c11
-rw-r--r--net/sched/cls_api.c168
-rw-r--r--net/sched/cls_flower.c72
-rw-r--r--net/sched/cls_matchall.c9
-rw-r--r--net/sched/cls_route.c4
-rw-r--r--net/sched/cls_tcindex.c3
-rw-r--r--net/sched/em_ipt.c2
-rw-r--r--net/sched/em_nbyte.c2
-rw-r--r--net/sched/sch_api.c21
-rw-r--r--net/sched/sch_atm.c2
-rw-r--r--net/sched/sch_cbs.c12
-rw-r--r--net/sched/sch_fifo.c97
-rw-r--r--net/sched/sch_fq.c22
-rw-r--r--net/sched/sch_fq_pie.c1
-rw-r--r--net/sched/sch_generic.c8
-rw-r--r--net/sched/sch_ingress.c11
-rw-r--r--net/sched/sch_netem.c2
-rw-r--r--net/sched/sch_pie.c49
-rw-r--r--net/sched/sch_red.c69
-rw-r--r--net/sched/sch_taprio.c13
-rw-r--r--net/sctp/diag.c15
-rw-r--r--net/sctp/input.c1
-rw-r--r--net/sctp/ipv6.c20
-rw-r--r--net/sctp/protocol.c28
-rw-r--r--net/sctp/sm_statefuns.c29
-rw-r--r--net/sctp/socket.c35
-rw-r--r--net/smc/af_smc.c27
-rw-r--r--net/smc/smc_clc.c7
-rw-r--r--net/smc/smc_core.c38
-rw-r--r--net/smc/smc_core.h10
-rw-r--r--net/smc/smc_diag.c5
-rw-r--r--net/smc/smc_ib.c66
-rw-r--r--net/smc/smc_ib.h1
-rw-r--r--net/smc/smc_llc.c2
-rw-r--r--net/smc/smc_tx.c2
-rw-r--r--net/socket.c33
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c13
-rw-r--r--net/switchdev/switchdev.c11
-rw-r--r--net/tipc/monitor.c12
-rw-r--r--net/tipc/msg.c3
-rw-r--r--net/tipc/msg.h5
-rw-r--r--net/tipc/netlink.c1
-rw-r--r--net/tipc/node.c10
-rw-r--r--net/tipc/socket.c4
-rw-r--r--net/tls/tls_device.c22
-rw-r--r--net/tls/tls_main.c31
-rw-r--r--net/unix/af_unix.c33
-rw-r--r--net/vmw_vsock/af_vsock.c20
-rw-r--r--net/vmw_vsock/hyperv_transport.c3
-rw-r--r--net/vmw_vsock/virtio_transport_common.c3
-rw-r--r--net/wireless/core.c6
-rw-r--r--net/wireless/core.h2
-rw-r--r--net/wireless/ethtool.c8
-rw-r--r--net/wireless/mlme.c33
-rw-r--r--net/wireless/nl80211.c429
-rw-r--r--net/wireless/pmsr.c32
-rw-r--r--net/wireless/rdev-ops.h37
-rw-r--r--net/wireless/reg.c4
-rw-r--r--net/wireless/scan.c17
-rw-r--r--net/wireless/sme.c11
-rw-r--r--net/wireless/trace.h54
-rw-r--r--net/wireless/util.c9
-rw-r--r--net/x25/x25_forward.c12
-rw-r--r--net/xdp/xsk.c2
-rw-r--r--net/xdp/xsk_queue.h7
-rw-r--r--net/xfrm/espintcp.c2
-rw-r--r--net/xfrm/xfrm_device.c37
-rw-r--r--net/xfrm/xfrm_interface.c6
-rw-r--r--net/xfrm/xfrm_output.c4
-rw-r--r--net/xfrm/xfrm_policy.c5
-rw-r--r--net/xfrm/xfrm_state.c2
-rw-r--r--net/xfrm/xfrm_user.c6
395 files changed, 20699 insertions, 4106 deletions
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 40ab2aea7b31..4492e8d7ad20 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -30,7 +30,7 @@ static struct datalink_proto *find_snap_client(const unsigned char *desc)
{
struct datalink_proto *proto = NULL, *p;
- list_for_each_entry_rcu(p, &snap_list, node) {
+ list_for_each_entry_rcu(p, &snap_list, node, lockdep_is_held(&snap_lock)) {
if (!memcmp(p->type, desc, 5)) {
proto = p;
break;
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index a313165e7a67..78ec2e1b14d1 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -359,9 +359,8 @@ static void __vlan_vid_del(struct vlan_info *vlan_info,
int err;
err = vlan_kill_rx_filter_info(dev, proto, vid);
- if (err)
- pr_warn("failed to kill vid %04x/%d for device %s\n",
- proto, vid, dev->name);
+ if (err && dev->reg_state != NETREG_UNREGISTERING)
+ netdev_warn(dev, "failed to kill vid %04x/%d\n", proto, vid);
list_del(&vid_info->list);
kfree(vid_info);
diff --git a/net/Kconfig b/net/Kconfig
index b0937a700f01..df8d8c9bd021 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -52,6 +52,9 @@ config NET_INGRESS
config NET_EGRESS
bool
+config NET_REDIRECT
+ bool
+
config SKB_EXTENSIONS
bool
@@ -189,7 +192,6 @@ config BRIDGE_NETFILTER
depends on NETFILTER_ADVANCED
select NETFILTER_FAMILY_BRIDGE
select SKB_EXTENSIONS
- default m
---help---
Enabling this option will let arptables resp. iptables see bridged
ARP resp. IP traffic. If you want a bridging firewall, you probably
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index f0209505e41a..a7c8dd7ae513 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -789,6 +789,10 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface)
lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex);
+ /* interface already disabled by batadv_iv_ogm_iface_disable */
+ if (!*ogm_buff)
+ return;
+
/* the interface gets activated here to avoid race conditions between
* the moment of activating the interface in
* hardif_activate_interface() where the originator mac is set and
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 3d21dd83f8cc..b85da4b7a77b 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -88,7 +88,7 @@ struct batadv_dhcp_packet {
__u8 sname[64];
__u8 file[128];
__be32 magic;
- __u8 options[0];
+ __u8 options[];
};
#define BATADV_DHCP_YIADDR_LEN sizeof(((struct batadv_dhcp_packet *)0)->yiaddr)
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 692306df7b6f..2a234d0ad445 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -13,7 +13,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2020.0"
+#define BATADV_SOURCE_VERSION "2020.1"
#endif
/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 852932838ddc..a9635c882fe0 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -862,7 +862,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
u8 *tt_change_ptr;
spin_lock_bh(&orig_node->vlan_list_lock);
- hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
num_vlan++;
num_entries += atomic_read(&vlan->tt.num_entries);
}
@@ -888,7 +888,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
(*tt_data)->num_vlan = htons(num_vlan);
tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
- hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
tt_vlan->vid = htons(vlan->vid);
tt_vlan->crc = htonl(vlan->tt.crc);
@@ -937,7 +937,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
int change_offset;
spin_lock_bh(&bat_priv->softif_vlan_list_lock);
- hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) {
vlan_entries = atomic_read(&vlan->tt.num_entries);
if (vlan_entries < 1)
continue;
@@ -967,7 +967,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
(*tt_data)->num_vlan = htons(num_vlan);
tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
- hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) {
vlan_entries = atomic_read(&vlan->tt.num_entries);
if (vlan_entries < 1)
continue;
diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h
index 0029d5119be6..2fd253a61a2a 100644
--- a/net/bluetooth/a2mp.h
+++ b/net/bluetooth/a2mp.h
@@ -36,14 +36,14 @@ struct a2mp_cmd {
__u8 code;
__u8 ident;
__le16 len;
- __u8 data[0];
+ __u8 data[];
} __packed;
/* A2MP command codes */
#define A2MP_COMMAND_REJ 0x01
struct a2mp_cmd_rej {
__le16 reason;
- __u8 data[0];
+ __u8 data[];
} __packed;
#define A2MP_DISCOVER_REQ 0x02
@@ -62,7 +62,7 @@ struct a2mp_cl {
struct a2mp_discov_rsp {
__le16 mtu;
__le16 ext_feat;
- struct a2mp_cl cl[0];
+ struct a2mp_cl cl[];
} __packed;
#define A2MP_CHANGE_NOTIFY 0x04
@@ -93,7 +93,7 @@ struct a2mp_amp_assoc_req {
struct a2mp_amp_assoc_rsp {
__u8 id;
__u8 status;
- __u8 amp_assoc[0];
+ __u8 amp_assoc[];
} __packed;
#define A2MP_CREATEPHYSLINK_REQ 0x0A
@@ -101,7 +101,7 @@ struct a2mp_amp_assoc_rsp {
struct a2mp_physlink_req {
__u8 local_id;
__u8 remote_id;
- __u8 amp_assoc[0];
+ __u8 amp_assoc[];
} __packed;
#define A2MP_CREATEPHYSLINK_RSP 0x0B
diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h
index 24f18b133959..9680473ed7ef 100644
--- a/net/bluetooth/bnep/bnep.h
+++ b/net/bluetooth/bnep/bnep.h
@@ -74,14 +74,14 @@ struct bnep_setup_conn_req {
__u8 type;
__u8 ctrl;
__u8 uuid_size;
- __u8 service[0];
+ __u8 service[];
} __packed;
struct bnep_set_filter_req {
__u8 type;
__u8 ctrl;
__be16 len;
- __u8 list[0];
+ __u8 list[];
} __packed;
struct bnep_control_rsp {
@@ -93,7 +93,7 @@ struct bnep_control_rsp {
struct bnep_ext_hdr {
__u8 type;
__u8 len;
- __u8 data[0];
+ __u8 data[];
} __packed;
/* BNEP ioctl defines */
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 87691404d0c6..e245bc155cc2 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -467,6 +467,23 @@ static void hci_conn_auto_accept(struct work_struct *work)
&conn->dst);
}
+static void le_disable_advertising(struct hci_dev *hdev)
+{
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_adv_enable cp;
+
+ cp.enable = 0x00;
+ cp.num_of_sets = 0x00;
+
+ hci_send_cmd(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp),
+ &cp);
+ } else {
+ u8 enable = 0x00;
+ hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
+ &enable);
+ }
+}
+
static void le_conn_timeout(struct work_struct *work)
{
struct hci_conn *conn = container_of(work, struct hci_conn,
@@ -481,9 +498,8 @@ static void le_conn_timeout(struct work_struct *work)
* (which doesn't have a timeout of its own).
*/
if (conn->role == HCI_ROLE_SLAVE) {
- u8 enable = 0x00;
- hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
- &enable);
+ /* Disable LE Advertising */
+ le_disable_advertising(hdev);
hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT);
return;
}
@@ -898,6 +914,16 @@ static void hci_req_directed_advertising(struct hci_request *req,
cp.peer_addr_type = conn->dst_type;
bacpy(&cp.peer_addr, &conn->dst);
+ /* As per Core Spec 5.2 Vol 2, PART E, Sec 7.8.53, the
+ * advertising_event_property LE_LEGACY_ADV_DIRECT_IND
+ * does not support advertising data. When the advertising set already
+ * contains some, the controller shall return error code 'Invalid
+ * HCI Command Parameters' (0x12).
+ * So it is required to remove the adv set for handle 0x00, since we use
+ * instance 0 for directed adv.
+ */
+ hci_req_add(req, HCI_OP_LE_REMOVE_ADV_SET, sizeof(cp.handle), &cp.handle);
+
hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
if (own_addr_type == ADDR_LE_DEV_RANDOM &&
@@ -1029,11 +1055,8 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
* anyway have to disable it in order to start directed
* advertising.
*/
- if (hci_dev_test_flag(hdev, HCI_LE_ADV)) {
- u8 enable = 0x00;
- hci_req_add(&req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
- &enable);
- }
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ __hci_req_disable_advertising(&req);
/* If requested to connect as slave use directed advertising */
if (conn->role == HCI_ROLE_SLAVE) {
@@ -1725,3 +1748,110 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle)
return hchan;
}
+
+u32 hci_conn_get_phy(struct hci_conn *conn)
+{
+ u32 phys = 0;
+
+ hci_dev_lock(conn->hdev);
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471:
+ * Table 6.2: Packets defined for synchronous, asynchronous, and
+ * CSB logical transport types.
+ */
+ switch (conn->type) {
+ case SCO_LINK:
+ /* SCO logical transport (1 Mb/s):
+ * HV1, HV2, HV3 and DV.
+ */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ break;
+
+ case ACL_LINK:
+ /* ACL logical transport (1 Mb/s) ptt=0:
+ * DH1, DM3, DH3, DM5 and DH5.
+ */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ if (conn->pkt_type & (HCI_DM3 | HCI_DH3))
+ phys |= BT_PHY_BR_1M_3SLOT;
+
+ if (conn->pkt_type & (HCI_DM5 | HCI_DH5))
+ phys |= BT_PHY_BR_1M_5SLOT;
+
+ /* ACL logical transport (2 Mb/s) ptt=1:
+ * 2-DH1, 2-DH3 and 2-DH5.
+ */
+ if (!(conn->pkt_type & HCI_2DH1))
+ phys |= BT_PHY_EDR_2M_1SLOT;
+
+ if (!(conn->pkt_type & HCI_2DH3))
+ phys |= BT_PHY_EDR_2M_3SLOT;
+
+ if (!(conn->pkt_type & HCI_2DH5))
+ phys |= BT_PHY_EDR_2M_5SLOT;
+
+ /* ACL logical transport (3 Mb/s) ptt=1:
+ * 3-DH1, 3-DH3 and 3-DH5.
+ */
+ if (!(conn->pkt_type & HCI_3DH1))
+ phys |= BT_PHY_EDR_3M_1SLOT;
+
+ if (!(conn->pkt_type & HCI_3DH3))
+ phys |= BT_PHY_EDR_3M_3SLOT;
+
+ if (!(conn->pkt_type & HCI_3DH5))
+ phys |= BT_PHY_EDR_3M_5SLOT;
+
+ break;
+
+ case ESCO_LINK:
+ /* eSCO logical transport (1 Mb/s): EV3, EV4 and EV5 */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ if (!(conn->pkt_type & (ESCO_EV4 | ESCO_EV5)))
+ phys |= BT_PHY_BR_1M_3SLOT;
+
+ /* eSCO logical transport (2 Mb/s): 2-EV3, 2-EV5 */
+ if (!(conn->pkt_type & ESCO_2EV3))
+ phys |= BT_PHY_EDR_2M_1SLOT;
+
+ if (!(conn->pkt_type & ESCO_2EV5))
+ phys |= BT_PHY_EDR_2M_3SLOT;
+
+ /* eSCO logical transport (3 Mb/s): 3-EV3, 3-EV5 */
+ if (!(conn->pkt_type & ESCO_3EV3))
+ phys |= BT_PHY_EDR_3M_1SLOT;
+
+ if (!(conn->pkt_type & ESCO_3EV5))
+ phys |= BT_PHY_EDR_3M_3SLOT;
+
+ break;
+
+ case LE_LINK:
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_1M)
+ phys |= BT_PHY_LE_1M_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_1M)
+ phys |= BT_PHY_LE_1M_RX;
+
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_2M)
+ phys |= BT_PHY_LE_2M_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_2M)
+ phys |= BT_PHY_LE_2M_RX;
+
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED)
+ phys |= BT_PHY_LE_CODED_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED)
+ phys |= BT_PHY_LE_CODED_RX;
+
+ break;
+ }
+
+ hci_dev_unlock(conn->hdev);
+
+ return phys;
+}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index cbbc34a006d1..2e7bc2da8371 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -31,6 +31,8 @@
#include <linux/debugfs.h>
#include <linux/crypto.h>
#include <linux/property.h>
+#include <linux/suspend.h>
+#include <linux/wait.h>
#include <asm/unaligned.h>
#include <net/bluetooth/bluetooth.h>
@@ -603,6 +605,9 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
if (hdev->commands[8] & 0x01)
hci_req_add(req, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL);
+ if (hdev->commands[18] & 0x04)
+ hci_req_add(req, HCI_OP_READ_DEF_ERR_DATA_REPORTING, 0, NULL);
+
/* Some older Broadcom based Bluetooth 1.2 controllers do not
* support the Read Page Scan Type command. Check support for
* this command in the bit mask of supported commands.
@@ -838,6 +843,26 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt)
sizeof(support), &support);
}
+ /* Set erroneous data reporting if supported to the wideband speech
+ * setting value
+ */
+ if (hdev->commands[18] & 0x08) {
+ bool enabled = hci_dev_test_flag(hdev,
+ HCI_WIDEBAND_SPEECH_ENABLED);
+
+ if (enabled !=
+ (hdev->err_data_reporting == ERR_DATA_REPORTING_ENABLED)) {
+ struct hci_cp_write_def_err_data_reporting cp;
+
+ cp.err_data_reporting = enabled ?
+ ERR_DATA_REPORTING_ENABLED :
+ ERR_DATA_REPORTING_DISABLED;
+
+ hci_req_add(req, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING,
+ sizeof(cp), &cp);
+ }
+ }
+
/* Set Suggested Default Data Length to maximum if supported */
if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
struct hci_cp_le_write_def_data_len cp;
@@ -1764,6 +1789,9 @@ int hci_dev_do_close(struct hci_dev *hdev)
clear_bit(HCI_RUNNING, &hdev->flags);
hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
+ if (test_and_clear_bit(SUSPEND_POWERING_DOWN, hdev->suspend_tasks))
+ wake_up(&hdev->suspend_wait_q);
+
/* After this point our queues are empty
* and no tasks are scheduled. */
hdev->close(hdev);
@@ -2285,7 +2313,7 @@ void hci_link_keys_clear(struct hci_dev *hdev)
{
struct link_key *key;
- list_for_each_entry_rcu(key, &hdev->link_keys, list) {
+ list_for_each_entry(key, &hdev->link_keys, list) {
list_del_rcu(&key->list);
kfree_rcu(key, rcu);
}
@@ -2295,7 +2323,7 @@ void hci_smp_ltks_clear(struct hci_dev *hdev)
{
struct smp_ltk *k;
- list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ list_for_each_entry(k, &hdev->long_term_keys, list) {
list_del_rcu(&k->list);
kfree_rcu(k, rcu);
}
@@ -2305,7 +2333,7 @@ void hci_smp_irks_clear(struct hci_dev *hdev)
{
struct smp_irk *k;
- list_for_each_entry_rcu(k, &hdev->identity_resolving_keys, list) {
+ list_for_each_entry(k, &hdev->identity_resolving_keys, list) {
list_del_rcu(&k->list);
kfree_rcu(k, rcu);
}
@@ -2315,7 +2343,7 @@ void hci_blocked_keys_clear(struct hci_dev *hdev)
{
struct blocked_key *b;
- list_for_each_entry_rcu(b, &hdev->blocked_keys, list) {
+ list_for_each_entry(b, &hdev->blocked_keys, list) {
list_del_rcu(&b->list);
kfree_rcu(b, rcu);
}
@@ -2327,7 +2355,7 @@ bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16])
struct blocked_key *b;
rcu_read_lock();
- list_for_each_entry(b, &hdev->blocked_keys, list) {
+ list_for_each_entry_rcu(b, &hdev->blocked_keys, list) {
if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) {
blocked = true;
break;
@@ -3241,6 +3269,94 @@ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr,
}
}
+static int hci_suspend_wait_event(struct hci_dev *hdev)
+{
+#define WAKE_COND \
+ (find_first_bit(hdev->suspend_tasks, __SUSPEND_NUM_TASKS) == \
+ __SUSPEND_NUM_TASKS)
+
+ int i;
+ int ret = wait_event_timeout(hdev->suspend_wait_q,
+ WAKE_COND, SUSPEND_NOTIFIER_TIMEOUT);
+
+ if (ret == 0) {
+ bt_dev_dbg(hdev, "Timed out waiting for suspend");
+ for (i = 0; i < __SUSPEND_NUM_TASKS; ++i) {
+ if (test_bit(i, hdev->suspend_tasks))
+ bt_dev_dbg(hdev, "Bit %d is set", i);
+ clear_bit(i, hdev->suspend_tasks);
+ }
+
+ ret = -ETIMEDOUT;
+ } else {
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static void hci_prepare_suspend(struct work_struct *work)
+{
+ struct hci_dev *hdev =
+ container_of(work, struct hci_dev, suspend_prepare);
+
+ hci_dev_lock(hdev);
+ hci_req_prepare_suspend(hdev, hdev->suspend_state_next);
+ hci_dev_unlock(hdev);
+}
+
+static int hci_change_suspend_state(struct hci_dev *hdev,
+ enum suspended_state next)
+{
+ hdev->suspend_state_next = next;
+ set_bit(SUSPEND_PREPARE_NOTIFIER, hdev->suspend_tasks);
+ queue_work(hdev->req_workqueue, &hdev->suspend_prepare);
+ return hci_suspend_wait_event(hdev);
+}
+
+static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct hci_dev *hdev =
+ container_of(nb, struct hci_dev, suspend_notifier);
+ int ret = 0;
+
+ /* If powering down, wait for completion. */
+ if (mgmt_powering_down(hdev)) {
+ set_bit(SUSPEND_POWERING_DOWN, hdev->suspend_tasks);
+ ret = hci_suspend_wait_event(hdev);
+ if (ret)
+ goto done;
+ }
+
+ /* Suspend notifier should only act on events when powered. */
+ if (!hdev_is_powered(hdev))
+ goto done;
+
+ if (action == PM_SUSPEND_PREPARE) {
+ /* Suspend consists of two actions:
+ * - First, disconnect everything and make the controller not
+ * connectable (disabling scanning)
+ * - Second, program event filter/whitelist and enable scan
+ */
+ ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT);
+
+ /* Only configure whitelist if disconnect succeeded */
+ if (!ret)
+ ret = hci_change_suspend_state(hdev,
+ BT_SUSPEND_COMPLETE);
+ } else if (action == PM_POST_SUSPEND) {
+ ret = hci_change_suspend_state(hdev, BT_RUNNING);
+ }
+
+ /* If suspend failed, restore it to running */
+ if (ret && action == PM_SUSPEND_PREPARE)
+ hci_change_suspend_state(hdev, BT_RUNNING);
+
+done:
+ return ret ? notifier_from_errno(-EBUSY) : NOTIFY_STOP;
+}
+
/* Alloc HCI device */
struct hci_dev *hci_alloc_dev(void)
{
@@ -3299,6 +3415,7 @@ struct hci_dev *hci_alloc_dev(void)
INIT_LIST_HEAD(&hdev->mgmt_pending);
INIT_LIST_HEAD(&hdev->blacklist);
INIT_LIST_HEAD(&hdev->whitelist);
+ INIT_LIST_HEAD(&hdev->wakeable);
INIT_LIST_HEAD(&hdev->uuids);
INIT_LIST_HEAD(&hdev->link_keys);
INIT_LIST_HEAD(&hdev->long_term_keys);
@@ -3318,6 +3435,7 @@ struct hci_dev *hci_alloc_dev(void)
INIT_WORK(&hdev->tx_work, hci_tx_work);
INIT_WORK(&hdev->power_on, hci_power_on);
INIT_WORK(&hdev->error_reset, hci_error_reset);
+ INIT_WORK(&hdev->suspend_prepare, hci_prepare_suspend);
INIT_DELAYED_WORK(&hdev->power_off, hci_power_off);
@@ -3326,6 +3444,7 @@ struct hci_dev *hci_alloc_dev(void)
skb_queue_head_init(&hdev->raw_q);
init_waitqueue_head(&hdev->req_wait_q);
+ init_waitqueue_head(&hdev->suspend_wait_q);
INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout);
@@ -3437,6 +3556,11 @@ int hci_register_dev(struct hci_dev *hdev)
hci_sock_dev_event(hdev, HCI_DEV_REG);
hci_dev_hold(hdev);
+ hdev->suspend_notifier.notifier_call = hci_suspend_notifier;
+ error = register_pm_notifier(&hdev->suspend_notifier);
+ if (error)
+ goto err_wqueue;
+
queue_work(hdev->req_workqueue, &hdev->power_on);
return id;
@@ -3470,6 +3594,8 @@ void hci_unregister_dev(struct hci_dev *hdev)
hci_dev_do_close(hdev);
+ unregister_pm_notifier(&hdev->suspend_notifier);
+
if (!test_bit(HCI_INIT, &hdev->flags) &&
!hci_dev_test_flag(hdev, HCI_SETUP) &&
!hci_dev_test_flag(hdev, HCI_CONFIG)) {
@@ -4387,13 +4513,16 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_sco_hdr *hdr = (void *) skb->data;
struct hci_conn *conn;
- __u16 handle;
+ __u16 handle, flags;
skb_pull(skb, HCI_SCO_HDR_SIZE);
handle = __le16_to_cpu(hdr->handle);
+ flags = hci_flags(handle);
+ handle = hci_handle(handle);
- BT_DBG("%s len %d handle 0x%4.4x", hdev->name, skb->len, handle);
+ BT_DBG("%s len %d handle 0x%4.4x flags 0x%4.4x", hdev->name, skb->len,
+ handle, flags);
hdev->stat.sco_rx++;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 6ddc4a74a5e4..0a591be8b0ae 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -901,6 +901,37 @@ static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev,
hdev->inq_tx_power = rp->tx_power;
}
+/* Command Complete handler for HCI_OP_READ_DEF_ERR_DATA_REPORTING: on
+ * success, cache the setting reported by the controller in hdev.
+ */
+static void hci_cc_read_def_err_data_reporting(struct hci_dev *hdev,
+					       struct sk_buff *skb)
+{
+	struct hci_rp_read_def_err_data_reporting *reply = (void *)skb->data;
+
+	BT_DBG("%s status 0x%2.2x", hdev->name, reply->status);
+
+	/* A non-zero status leaves the cached value untouched */
+	if (!reply->status)
+		hdev->err_data_reporting = reply->err_data_reporting;
+}
+
+/* Command Complete handler for HCI_OP_WRITE_DEF_ERR_DATA_REPORTING: on
+ * success, mirror the value that was sent to the controller into hdev.
+ */
+static void hci_cc_write_def_err_data_reporting(struct hci_dev *hdev,
+						struct sk_buff *skb)
+{
+	struct hci_cp_write_def_err_data_reporting *sent;
+	__u8 status = *((__u8 *)skb->data);
+
+	BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+	if (status)
+		return;
+
+	/* The new value is taken from the parameters of the sent command */
+	sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING);
+	if (sent)
+		hdev->err_data_reporting = sent->err_data_reporting;
+}
+
static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_rp_pin_code_reply *rp = (void *) skb->data;
@@ -2202,10 +2233,22 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
- if (conn)
+ if (conn) {
+ u8 type = conn->type;
+
mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
conn->dst_type, status);
+ /* If the disconnection failed for any reason, the upper layer
+ * does not retry to disconnect in current implementation.
+ * Hence, we need to do some basic cleanup here and re-enable
+ * advertising if necessary.
+ */
+ hci_conn_del(conn);
+ if (type == LE_LINK)
+ hci_req_reenable_advertising(hdev);
+ }
+
hci_dev_unlock(hdev);
}
@@ -2474,6 +2517,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_ev_conn_complete *ev = (void *) skb->data;
+ struct inquiry_entry *ie;
struct hci_conn *conn;
BT_DBG("%s", hdev->name);
@@ -2482,14 +2526,30 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
if (!conn) {
- if (ev->link_type != SCO_LINK)
- goto unlock;
+ /* Connection may not exist if auto-connected. Check the inquiry
+ * cache to see if we've already discovered this bdaddr before.
+ * If found and link is an ACL type, create a connection class
+ * automatically.
+ */
+ ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
+ if (ie && ev->link_type == ACL_LINK) {
+ conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr,
+ HCI_ROLE_SLAVE);
+ if (!conn) {
+ bt_dev_err(hdev, "no memory for new conn");
+ goto unlock;
+ }
+ } else {
+ if (ev->link_type != SCO_LINK)
+ goto unlock;
- conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr);
- if (!conn)
- goto unlock;
+ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK,
+ &ev->bdaddr);
+ if (!conn)
+ goto unlock;
- conn->type = SCO_LINK;
+ conn->type = SCO_LINK;
+ }
}
if (!ev->status) {
@@ -2743,6 +2803,14 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_disconn_cfm(conn, ev->reason);
hci_conn_del(conn);
+ /* The suspend notifier is waiting for all devices to disconnect so
+ * clear the bit from pending tasks and inform the wait queue.
+ */
+ if (list_empty(&hdev->conn_hash.list) &&
+ test_and_clear_bit(SUSPEND_DISCONNECTING, hdev->suspend_tasks)) {
+ wake_up(&hdev->suspend_wait_q);
+ }
+
/* Re-enable advertising if necessary, since it might
* have been disabled by the connection. From the
* HCI_LE_Set_Advertise_Enable command description in
@@ -2895,14 +2963,14 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status,
if (!conn)
goto unlock;
- /* If we fail to read the encryption key size, assume maximum
- * (which is the same we do also when this HCI command isn't
- * supported.
+ /* While unexpected, the read_enc_key_size command may fail. The most
+ * secure approach is to then assume the key size is 0 to force a
+ * disconnection.
*/
if (rp->status) {
bt_dev_err(hdev, "failed to read key size for handle %u",
handle);
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ conn->enc_key_size = 0;
} else {
conn->enc_key_size = rp->key_size;
}
@@ -3302,6 +3370,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_read_inq_rsp_tx_power(hdev, skb);
break;
+ case HCI_OP_READ_DEF_ERR_DATA_REPORTING:
+ hci_cc_read_def_err_data_reporting(hdev, skb);
+ break;
+
+ case HCI_OP_WRITE_DEF_ERR_DATA_REPORTING:
+ hci_cc_write_def_err_data_reporting(hdev, skb);
+ break;
+
case HCI_OP_PIN_CODE_REPLY:
hci_cc_pin_code_reply(hdev, skb);
break;
@@ -4557,6 +4633,16 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev,
goto confirm;
}
+ /* If there already exists link key in local host, leave the
+ * decision to user space since the remote device could be
+ * legitimate or malicious.
+ */
+ if (hci_find_link_key(hdev, &ev->bdaddr)) {
+ bt_dev_dbg(hdev, "Local host already has link key");
+ confirm_hint = 1;
+ goto confirm;
+ }
+
BT_DBG("Auto-accept of user confirmation with %ums delay",
hdev->auto_accept_delay);
@@ -5858,6 +5944,11 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
u8 status = 0, event = hdr->evt, req_evt = 0;
u16 opcode = HCI_OP_NOP;
+ if (!event) {
+ bt_dev_warn(hdev, "Received unexpected HCI Event 00000000");
+ goto done;
+ }
+
if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->hci.req_event == event) {
struct hci_command_hdr *cmd_hdr = (void *) hdev->sent_cmd->data;
opcode = __le16_to_cpu(cmd_hdr->opcode);
@@ -6069,6 +6160,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
req_complete_skb(hdev, status, opcode, orig_skb);
}
+done:
kfree_skb(orig_skb);
kfree_skb(skb);
hdev->stat.evt_rx++;
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 2a1b64dbf76e..649e1e5ed446 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -34,6 +34,9 @@
#define HCI_REQ_PEND 1
#define HCI_REQ_CANCELED 2
+#define LE_SUSPEND_SCAN_WINDOW 0x0012
+#define LE_SUSPEND_SCAN_INTERVAL 0x0060
+
void hci_req_init(struct hci_request *req, struct hci_dev *hdev)
{
skb_queue_head_init(&req->cmd_q);
@@ -654,6 +657,11 @@ void hci_req_add_le_scan_disable(struct hci_request *req)
{
struct hci_dev *hdev = req->hdev;
+ if (hdev->scanning_paused) {
+ bt_dev_dbg(hdev, "Scanning is paused for suspend");
+ return;
+ }
+
if (use_ext_scan(hdev)) {
struct hci_cp_le_set_ext_scan_enable cp;
@@ -670,15 +678,55 @@ void hci_req_add_le_scan_disable(struct hci_request *req)
}
}
-static void add_to_white_list(struct hci_request *req,
- struct hci_conn_params *params)
+/* Queue an HCI_OP_LE_DEL_FROM_WHITE_LIST command on @req for the given
+ * address and address type.
+ */
+static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr,
+				u8 bdaddr_type)
+{
+	struct hci_cp_le_del_from_white_list cmd;
+
+	bacpy(&cmd.bdaddr, bdaddr);
+	cmd.bdaddr_type = bdaddr_type;
+
+	bt_dev_dbg(req->hdev, "Remove %pMR (0x%x) from whitelist", &cmd.bdaddr,
+		   cmd.bdaddr_type);
+	hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, sizeof(cmd), &cmd);
+}
+
+/* Queue an "add to white list" command on @req for @params if needed.
+ *
+ * @num_entries counts the white list entries in use and is incremented
+ * when a command is queued. @allow_rpa skips the IRK check below.
+ *
+ * Returns 0 when the address is already listed, when it is skipped (not
+ * wakeable while suspended) or when the command was queued. Returns -1
+ * when the white list is full or the address has an IRK and RPAs are not
+ * allowed.
+ */
+static int add_to_white_list(struct hci_request *req,
+ struct hci_conn_params *params, u8 *num_entries,
+ bool allow_rpa)
{
struct hci_cp_le_add_to_white_list cp;
+ struct hci_dev *hdev = req->hdev;
+
+ /* Already in white list */
+ if (hci_bdaddr_list_lookup(&hdev->le_white_list, &params->addr,
+ params->addr_type))
+ return 0;
+
+ /* White list is full: fail so the caller can select the filter
+ * policy that accepts all advertising.
+ */
+ if (*num_entries >= hdev->le_white_list_size)
+ return -1;
+
+ /* White list can not be used with RPAs */
+ if (!allow_rpa &&
+ hci_find_irk_by_addr(hdev, &params->addr, params->addr_type)) {
+ return -1;
+ }
+ /* During suspend, only wakeable devices can be in whitelist. This is
+ * not an error, the entry is silently skipped.
+ */
+ if (hdev->suspended && !params->wakeable)
+ return 0;
+
+ *num_entries += 1;
cp.bdaddr_type = params->addr_type;
bacpy(&cp.bdaddr, &params->addr);
+ bt_dev_dbg(hdev, "Add %pMR (0x%x) to whitelist", &cp.bdaddr,
+ cp.bdaddr_type);
hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp);
+
+ return 0;
}
static u8 update_white_list(struct hci_request *req)
@@ -686,7 +734,14 @@ static u8 update_white_list(struct hci_request *req)
struct hci_dev *hdev = req->hdev;
struct hci_conn_params *params;
struct bdaddr_list *b;
- uint8_t white_list_entries = 0;
+ u8 num_entries = 0;
+ bool pend_conn, pend_report;
+ /* We allow whitelisting even with RPAs in suspend. In the worst case,
+ * we won't be able to wake from devices that use the privacy1.2
+ * features. Additionally, once we support privacy1.2 and IRK
+ * offloading, we can update this to also check for those conditions.
+ */
+ bool allow_rpa = hdev->suspended;
/* Go through the current white list programmed into the
* controller one by one and check if that address is still
@@ -695,29 +750,28 @@ static u8 update_white_list(struct hci_request *req)
* command to remove it from the controller.
*/
list_for_each_entry(b, &hdev->le_white_list, list) {
- /* If the device is neither in pend_le_conns nor
- * pend_le_reports then remove it from the whitelist.
+ pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns,
+ &b->bdaddr,
+ b->bdaddr_type);
+ pend_report = hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ &b->bdaddr,
+ b->bdaddr_type);
+
+ /* If the device is not likely to connect or report,
+ * remove it from the whitelist.
*/
- if (!hci_pend_le_action_lookup(&hdev->pend_le_conns,
- &b->bdaddr, b->bdaddr_type) &&
- !hci_pend_le_action_lookup(&hdev->pend_le_reports,
- &b->bdaddr, b->bdaddr_type)) {
- struct hci_cp_le_del_from_white_list cp;
-
- cp.bdaddr_type = b->bdaddr_type;
- bacpy(&cp.bdaddr, &b->bdaddr);
-
- hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
- sizeof(cp), &cp);
+ if (!pend_conn && !pend_report) {
+ del_from_white_list(req, &b->bdaddr, b->bdaddr_type);
continue;
}
- if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) {
- /* White list can not be used with RPAs */
+ /* White list can not be used with RPAs */
+ if (!allow_rpa &&
+ hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) {
return 0x00;
}
- white_list_entries++;
+ num_entries++;
}
/* Since all no longer valid white list entries have been
@@ -731,47 +785,17 @@ static u8 update_white_list(struct hci_request *req)
* white list.
*/
list_for_each_entry(params, &hdev->pend_le_conns, action) {
- if (hci_bdaddr_list_lookup(&hdev->le_white_list,
- &params->addr, params->addr_type))
- continue;
-
- if (white_list_entries >= hdev->le_white_list_size) {
- /* Select filter policy to accept all advertising */
+ if (add_to_white_list(req, params, &num_entries, allow_rpa))
return 0x00;
- }
-
- if (hci_find_irk_by_addr(hdev, &params->addr,
- params->addr_type)) {
- /* White list can not be used with RPAs */
- return 0x00;
- }
-
- white_list_entries++;
- add_to_white_list(req, params);
}
/* After adding all new pending connections, walk through
* the list of pending reports and also add these to the
- * white list if there is still space.
+ * white list if there is still space. Abort if space runs out.
*/
list_for_each_entry(params, &hdev->pend_le_reports, action) {
- if (hci_bdaddr_list_lookup(&hdev->le_white_list,
- &params->addr, params->addr_type))
- continue;
-
- if (white_list_entries >= hdev->le_white_list_size) {
- /* Select filter policy to accept all advertising */
- return 0x00;
- }
-
- if (hci_find_irk_by_addr(hdev, &params->addr,
- params->addr_type)) {
- /* White list can not be used with RPAs */
+ if (add_to_white_list(req, params, &num_entries, allow_rpa))
return 0x00;
- }
-
- white_list_entries++;
- add_to_white_list(req, params);
}
/* Select filter policy to use white list */
@@ -866,6 +890,12 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
struct hci_dev *hdev = req->hdev;
u8 own_addr_type;
u8 filter_policy;
+ u8 window, interval;
+
+ if (hdev->scanning_paused) {
+ bt_dev_dbg(hdev, "Scanning is paused for suspend");
+ return;
+ }
/* Set require_privacy to false since no SCAN_REQ are send
* during passive scanning. Not using an non-resolvable address
@@ -896,8 +926,17 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
(hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY))
filter_policy |= 0x02;
- hci_req_start_scan(req, LE_SCAN_PASSIVE, hdev->le_scan_interval,
- hdev->le_scan_window, own_addr_type, filter_policy);
+ if (hdev->suspended) {
+ window = LE_SUSPEND_SCAN_WINDOW;
+ interval = LE_SUSPEND_SCAN_INTERVAL;
+ } else {
+ window = hdev->le_scan_window;
+ interval = hdev->le_scan_interval;
+ }
+
+ bt_dev_dbg(hdev, "LE passive scan with whitelist = %d", filter_policy);
+ hci_req_start_scan(req, LE_SCAN_PASSIVE, interval, window,
+ own_addr_type, filter_policy);
}
static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
@@ -918,6 +957,187 @@ static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
return adv_instance->scan_rsp_len;
}
+/* Queue a "clear all" HCI_OP_SET_EVENT_FLT command on @req, then let
+ * __hci_req_update_scan() queue whatever scan update it sees fit.
+ */
+static void hci_req_clear_event_filter(struct hci_request *req)
+{
+ struct hci_cp_set_event_filter f;
+
+ memset(&f, 0, sizeof(f));
+ f.flt_type = HCI_FLT_CLEAR_ALL;
+ /* Length 1: presumably only the filter type byte is sent for a
+ * clear-all request - confirm against the struct layout.
+ */
+ hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &f);
+
+ /* Update page scan state (since we may have modified it when setting
+ * the event filter).
+ */
+ __hci_req_update_scan(req);
+}
+
+/* Queue one connection-setup event filter per entry of hdev->wakeable,
+ * then enable page scan only if that list is non-empty.
+ */
+static void hci_req_set_event_filter(struct hci_request *req)
+{
+	struct hci_dev *hdev = req->hdev;
+	struct hci_cp_set_event_filter flt;
+	struct bdaddr_list *entry;
+	u8 scan = SCAN_DISABLED;
+
+	/* Always clear event filter when starting */
+	hci_req_clear_event_filter(req);
+
+	list_for_each_entry(entry, &hdev->wakeable, list) {
+		memset(&flt, 0, sizeof(flt));
+		flt.flt_type = HCI_FLT_CONN_SETUP;
+		flt.cond_type = HCI_CONN_SETUP_ALLOW_BDADDR;
+		bacpy(&flt.addr_conn_flt.bdaddr, &entry->bdaddr);
+		flt.addr_conn_flt.auto_accept = HCI_CONN_SETUP_AUTO_ON;
+
+		bt_dev_dbg(hdev, "Adding event filters for %pMR",
+			   &entry->bdaddr);
+		hci_req_add(req, HCI_OP_SET_EVENT_FLT, sizeof(flt), &flt);
+	}
+
+	/* Page scan is only needed when some device may wake us */
+	if (!list_empty(&hdev->wakeable))
+		scan = SCAN_PAGE;
+
+	hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+}
+
+/* Queue an LE scan disable followed by a passive scan enable on @req and
+ * mark SUSPEND_SCAN_ENABLE as pending in hdev->suspend_tasks.
+ */
+static void hci_req_config_le_suspend_scan(struct hci_request *req)
+{
+ /* Can't change params without disabling first */
+ hci_req_add_le_scan_disable(req);
+
+ /* Configure params and enable scanning */
+ hci_req_add_le_passive_scan(req);
+
+ /* Block suspend notifier on response */
+ set_bit(SUSPEND_SCAN_ENABLE, req->hdev->suspend_tasks);
+}
+
+/* Completion callback for the suspend requests: clears a pending scan
+ * enable/disable task bit and wakes the suspend wait queue if one was set.
+ */
+static void suspend_req_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+	bool was_pending;
+
+	bt_dev_dbg(hdev, "Request complete opcode=0x%x, status=0x%x", opcode,
+		   status);
+
+	/* Short-circuit on purpose: at most one bit is consumed per call */
+	was_pending = test_and_clear_bit(SUSPEND_SCAN_ENABLE,
+					 hdev->suspend_tasks) ||
+		      test_and_clear_bit(SUSPEND_SCAN_DISABLE,
+					 hdev->suspend_tasks);
+	if (was_pending)
+		wake_up(&hdev->suspend_wait_q);
+}
+
+/* Move @hdev to suspend state @next and queue the matching HCI work.
+ *
+ * BT_SUSPEND_DISCONNECT pauses discovery and advertising, disables page
+ * scan and LE scan, and disconnects every connection.
+ * BT_SUSPEND_COMPLETE installs the event filter and reconfigures the LE
+ * passive scan with scanning momentarily unpaused.
+ * Any other state undoes the above: it clears the event filter and
+ * resumes scanning, plus advertising and discovery if they were paused.
+ *
+ * Work that completes asynchronously is tracked as bits in
+ * hdev->suspend_tasks. SUSPEND_PREPARE_NOTIFIER is cleared and
+ * suspend_wait_q is woken before returning on every path.
+ *
+ * Call with hci_dev_lock
+ */
+void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next)
+{
+ int old_state;
+ struct hci_conn *conn;
+ struct hci_request req;
+ u8 page_scan;
+ int disconnect_counter;
+
+ if (next == hdev->suspend_state) {
+ bt_dev_dbg(hdev, "Same state before and after: %d", next);
+ goto done;
+ }
+
+ hdev->suspend_state = next;
+ hci_req_init(&req, hdev);
+
+ if (next == BT_SUSPEND_DISCONNECT) {
+ /* Mark device as suspended */
+ hdev->suspended = true;
+
+ /* Pause discovery if not already stopped */
+ old_state = hdev->discovery.state;
+ if (old_state != DISCOVERY_STOPPED) {
+ set_bit(SUSPEND_PAUSE_DISCOVERY, hdev->suspend_tasks);
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
+ queue_work(hdev->req_workqueue, &hdev->discov_update);
+ }
+
+ hdev->discovery_paused = true;
+ hdev->discovery_old_state = old_state;
+
+ /* Stop advertising. old_state is reused here to hold the
+ * HCI_ADVERTISING flag rather than a discovery state.
+ */
+ old_state = hci_dev_test_flag(hdev, HCI_ADVERTISING);
+ if (old_state) {
+ set_bit(SUSPEND_PAUSE_ADVERTISING, hdev->suspend_tasks);
+ cancel_delayed_work(&hdev->discov_off);
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->discov_off, 0);
+ }
+
+ hdev->advertising_paused = true;
+ hdev->advertising_old_state = old_state;
+ /* Disable page scan */
+ page_scan = SCAN_DISABLED;
+ hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &page_scan);
+
+ /* Disable LE passive scan */
+ hci_req_add_le_scan_disable(&req);
+
+ /* Mark task needing completion */
+ set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks);
+
+ /* Prevent disconnects from causing scanning to be re-enabled */
+ hdev->scanning_paused = true;
+
+ /* Run commands before disconnecting */
+ hci_req_run(&req, suspend_req_complete);
+
+ disconnect_counter = 0;
+ /* Soft disconnect everything (power off) */
+ list_for_each_entry(conn, &hdev->conn_hash.list, list) {
+ hci_disconnect(conn, HCI_ERROR_REMOTE_POWER_OFF);
+ disconnect_counter++;
+ }
+
+ if (disconnect_counter > 0) {
+ bt_dev_dbg(hdev,
+ "Had %d disconnects. Will wait on them",
+ disconnect_counter);
+ set_bit(SUSPEND_DISCONNECTING, hdev->suspend_tasks);
+ }
+ } else if (next == BT_SUSPEND_COMPLETE) {
+ /* Unpause to take care of updating scanning params */
+ hdev->scanning_paused = false;
+ /* Enable event filter for paired devices */
+ hci_req_set_event_filter(&req);
+ /* Enable passive scan at lower duty cycle */
+ hci_req_config_le_suspend_scan(&req);
+ /* Pause scan changes again. */
+ hdev->scanning_paused = true;
+ hci_req_run(&req, suspend_req_complete);
+ } else {
+ hdev->suspended = false;
+ hdev->scanning_paused = false;
+
+ hci_req_clear_event_filter(&req);
+ /* Reset passive/background scanning to normal */
+ hci_req_config_le_suspend_scan(&req);
+
+ /* Unpause advertising */
+ hdev->advertising_paused = false;
+ if (hdev->advertising_old_state) {
+ set_bit(SUSPEND_UNPAUSE_ADVERTISING,
+ hdev->suspend_tasks);
+ hci_dev_set_flag(hdev, HCI_ADVERTISING);
+ queue_work(hdev->req_workqueue,
+ &hdev->discoverable_update);
+ hdev->advertising_old_state = 0;
+ }
+
+ /* Unpause discovery */
+ hdev->discovery_paused = false;
+ if (hdev->discovery_old_state != DISCOVERY_STOPPED &&
+ hdev->discovery_old_state != DISCOVERY_STOPPING) {
+ set_bit(SUSPEND_UNPAUSE_DISCOVERY, hdev->suspend_tasks);
+ hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+ queue_work(hdev->req_workqueue, &hdev->discov_update);
+ }
+
+ hci_req_run(&req, suspend_req_complete);
+ }
+
+ /* NOTE(review): suspend_state was already set to next before the
+ * branches above, so this second store looks redundant.
+ */
+ hdev->suspend_state = next;
+
+done:
+ clear_bit(SUSPEND_PREPARE_NOTIFIER, hdev->suspend_tasks);
+ wake_up(&hdev->suspend_wait_q);
+}
+
static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
{
u8 instance = hdev->cur_adv_instance;
@@ -1499,7 +1719,7 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
if (err < 0) {
- BT_ERR("%s failed to generate new RPA", hdev->name);
+ bt_dev_err(hdev, "failed to generate new RPA");
return err;
}
@@ -2015,6 +2235,9 @@ void __hci_req_update_scan(struct hci_request *req)
if (mgmt_powering_down(hdev))
return;
+ if (hdev->scanning_paused)
+ return;
+
if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) ||
disconnected_whitelist_entries(hdev))
scan = SCAN_PAGE;
@@ -2504,23 +2727,6 @@ static int active_scan(struct hci_request *req, unsigned long opt)
BT_DBG("%s", hdev->name);
- if (hci_dev_test_flag(hdev, HCI_LE_ADV)) {
- hci_dev_lock(hdev);
-
- /* Don't let discovery abort an outgoing connection attempt
- * that's using directed advertising.
- */
- if (hci_lookup_le_connect(hdev)) {
- hci_dev_unlock(hdev);
- return -EBUSY;
- }
-
- cancel_adv_timeout(hdev);
- hci_dev_unlock(hdev);
-
- __hci_req_disable_advertising(req);
- }
-
/* If controller is scanning, it means the background scanning is
* running. Thus, we should temporarily stop it in order to set the
* discovery scanning parameters.
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index a7019fbeadd3..0e81614d235e 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -68,6 +68,8 @@ void __hci_req_update_eir(struct hci_request *req);
void hci_req_add_le_scan_disable(struct hci_request *req);
void hci_req_add_le_passive_scan(struct hci_request *req);
+void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next);
+
void hci_req_reenable_advertising(struct hci_dev *hdev);
void __hci_req_enable_advertising(struct hci_request *req);
void __hci_req_disable_advertising(struct hci_request *req);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index bef84b95e2c4..3b4fa27a44e6 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1279,7 +1279,7 @@ static int hidp_session_thread(void *arg)
add_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait);
/* This memory barrier is paired with wq_has_sleeper(). See
* sock_poll_wait() for more information why this is needed. */
- smp_mb();
+ smp_mb__before_atomic();
/* notify synchronous startup that we're ready */
atomic_inc(&session->state);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 195459a1e53e..fd9d0d08f9c9 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -45,6 +45,7 @@
#define LE_FLOWCTL_MAX_CREDITS 65535
bool disable_ertm;
+bool enable_ecred;
static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD;
@@ -419,6 +420,9 @@ static void l2cap_chan_timeout(struct work_struct *work)
BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
mutex_lock(&conn->chan_lock);
+ /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling
+ * this work. No need to call l2cap_chan_hold(chan) here again.
+ */
l2cap_chan_lock(chan);
if (chan->state == BT_CONNECTED || chan->state == BT_CONFIG)
@@ -431,12 +435,12 @@ static void l2cap_chan_timeout(struct work_struct *work)
l2cap_chan_close(chan, reason);
- l2cap_chan_unlock(chan);
-
chan->ops->close(chan);
- mutex_unlock(&conn->chan_lock);
+ l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
+
+ mutex_unlock(&conn->chan_lock);
}
struct l2cap_chan *l2cap_chan_create(void)
@@ -532,6 +536,17 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits)
skb_queue_head_init(&chan->tx_q);
}
+/* Initialise @chan like an LE flow-control channel, then raise its MPS
+ * to the enhanced-credit minimum if it is below it.
+ */
+static void l2cap_ecred_init(struct l2cap_chan *chan, u16 tx_credits)
+{
+	l2cap_le_flowctl_init(chan, tx_credits);
+
+	if (chan->mps >= L2CAP_ECRED_MIN_MPS)
+		return;
+
+	/* L2CAP implementations shall support a minimum MPS of 64 octets */
+	chan->mps = L2CAP_ECRED_MIN_MPS;
+	/* Receive credits depend on the MPS, so recompute them */
+	chan->rx_credits = (chan->imtu / chan->mps) + 1;
+}
+
void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
{
BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn,
@@ -638,6 +653,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err)
break;
case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
skb_queue_purge(&chan->tx_q);
break;
@@ -662,6 +678,29 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err)
}
EXPORT_SYMBOL_GPL(l2cap_chan_del);
+/* Call @func with @data on every channel in conn->chan_l. Takes no lock
+ * itself.
+ */
+static void __l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
+			      void *data)
+{
+	struct l2cap_chan *pos;
+
+	list_for_each_entry(pos, &conn->chan_l, list)
+		func(pos, data);
+}
+
+/* Call @func with @data on every channel of @conn while holding
+ * conn->chan_lock. A NULL @conn is a no-op.
+ */
+void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
+		     void *data)
+{
+	if (conn) {
+		mutex_lock(&conn->chan_lock);
+		__l2cap_chan_list(conn, func, data);
+		mutex_unlock(&conn->chan_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(l2cap_chan_list);
+
static void l2cap_conn_update_id_addr(struct work_struct *work)
{
struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
@@ -704,6 +743,27 @@ static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
&rsp);
}
+/* Reject a pending enhanced credit based connection on @chan.
+ *
+ * Moves the channel to BT_DISCONN and answers the request identified by
+ * chan->ident with a zeroed response carrying only the result:
+ * L2CAP_CR_LE_AUTHORIZATION when FLAG_DEFER_SETUP is set,
+ * L2CAP_CR_LE_BAD_PSM otherwise.
+ */
+static void l2cap_chan_ecred_connect_reject(struct l2cap_chan *chan)
+{
+	struct l2cap_conn *conn = chan->conn;
+	struct l2cap_ecred_conn_rsp rsp;
+	u16 result;
+
+	if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+		result = L2CAP_CR_LE_AUTHORIZATION;
+	else
+		result = L2CAP_CR_LE_BAD_PSM;
+
+	l2cap_state_change(chan, BT_DISCONN);
+
+	memset(&rsp, 0, sizeof(rsp));
+
+	rsp.result = cpu_to_le16(result);
+
+	/* An enhanced credit based request must be answered with the
+	 * matching ECRED response opcode, not the LE credit based one.
+	 */
+	l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_CONN_RSP, sizeof(rsp),
+		       &rsp);
+}
+
static void l2cap_chan_connect_reject(struct l2cap_chan *chan)
{
struct l2cap_conn *conn = chan->conn;
@@ -749,8 +809,16 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason)
if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) {
if (conn->hcon->type == ACL_LINK)
l2cap_chan_connect_reject(chan);
- else if (conn->hcon->type == LE_LINK)
- l2cap_chan_le_connect_reject(chan);
+ else if (conn->hcon->type == LE_LINK) {
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ l2cap_chan_le_connect_reject(chan);
+ break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ l2cap_chan_ecred_connect_reject(chan);
+ break;
+ }
+ }
}
l2cap_chan_del(chan, reason);
@@ -1273,8 +1341,13 @@ static void l2cap_chan_ready(struct l2cap_chan *chan)
chan->conf_state = 0;
__clear_chan_timer(chan);
- if (chan->mode == L2CAP_MODE_LE_FLOWCTL && !chan->tx_credits)
- chan->ops->suspend(chan);
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!chan->tx_credits)
+ chan->ops->suspend(chan);
+ break;
+ }
chan->state = BT_CONNECTED;
@@ -1306,6 +1379,81 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
sizeof(req), &req);
}
+/* Scratch state used while building one enhanced credit based connection
+ * request that may cover several channels.
+ */
+struct l2cap_ecred_conn_data {
+ /* Request header followed by up to five source CIDs */
+ struct {
+ struct l2cap_ecred_conn_req req;
+ __le16 scid[5];
+ } __packed pdu;
+ /* Channel that initiated the request */
+ struct l2cap_chan *chan;
+ /* Peer PID of the initiating channel, used to match deferred ones */
+ struct pid *pid;
+ /* Number of scid[] entries filled in so far */
+ int count;
+};
+
+/* l2cap_chan_func_t callback: fold a deferred channel into the pending
+ * enhanced credit based connection request described by @data.
+ *
+ * @chan is added only if it had FLAG_DEFER_SETUP set, shares PID and PSM
+ * with the initiating channel, has no ident yet, uses
+ * L2CAP_MODE_EXT_FLOWCTL, is in BT_CONNECT and has not already been part
+ * of a request. Added channels get the initiator's ident and their SCID
+ * appended to the PDU.
+ */
+static void l2cap_ecred_defer_connect(struct l2cap_chan *chan, void *data)
+{
+	struct l2cap_ecred_conn_data *conn = data;
+	struct pid *pid;
+
+	if (chan == conn->chan)
+		return;
+
+	/* The request PDU has a fixed number of SCID slots. Once they are
+	 * all used, bail out before touching any channel state so that
+	 * pdu.scid[] is never written out of bounds.
+	 */
+	if (conn->count >= ARRAY_SIZE(conn->pdu.scid))
+		return;
+
+	if (!test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags))
+		return;
+
+	pid = chan->ops->get_peer_pid(chan);
+
+	/* Only add deferred channels with the same PID/PSM */
+	if (conn->pid != pid || chan->psm != conn->chan->psm || chan->ident ||
+	    chan->mode != L2CAP_MODE_EXT_FLOWCTL || chan->state != BT_CONNECT)
+		return;
+
+	if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+		return;
+
+	l2cap_ecred_init(chan, 0);
+
+	/* Set the same ident so we can match on the rsp */
+	chan->ident = conn->chan->ident;
+
+	/* Include all channels deferred */
+	conn->pdu.scid[conn->count] = cpu_to_le16(chan->scid);
+
+	conn->count++;
+}
+
+/* Send an enhanced credit based connection request for @chan.
+ *
+ * Does nothing while @chan is still deferred or if a request was already
+ * sent for it. Otherwise the request is built from @chan's PSM, MTU, MPS
+ * and receive credits, and every other channel accepted by
+ * l2cap_ecred_defer_connect() is appended to the same PDU.
+ */
+static void l2cap_ecred_connect(struct l2cap_chan *chan)
+{
+	struct l2cap_conn *conn = chan->conn;
+	struct l2cap_ecred_conn_data data;
+
+	if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+		return;
+
+	if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+		return;
+
+	l2cap_ecred_init(chan, 0);
+
+	data.pdu.req.psm = chan->psm;
+	data.pdu.req.mtu = cpu_to_le16(chan->imtu);
+	data.pdu.req.mps = cpu_to_le16(chan->mps);
+	data.pdu.req.credits = cpu_to_le16(chan->rx_credits);
+	data.pdu.scid[0] = cpu_to_le16(chan->scid);
+
+	/* The ident must be set before walking the channel list: deferred
+	 * channels copy it from the initiating channel.
+	 */
+	chan->ident = l2cap_get_ident(conn);
+
+	data.count = 1;
+	data.chan = chan;
+	/* Looked up once; the callback compares it against each channel */
+	data.pid = chan->ops->get_peer_pid(chan);
+
+	__l2cap_chan_list(conn, l2cap_ecred_defer_connect, &data);
+
+	/* Only the header plus the SCIDs actually filled in are sent */
+	l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_CONN_REQ,
+		       sizeof(data.pdu.req) + data.count * sizeof(__le16),
+		       &data.pdu);
+}
+
static void l2cap_le_start(struct l2cap_chan *chan)
{
struct l2cap_conn *conn = chan->conn;
@@ -1318,8 +1466,12 @@ static void l2cap_le_start(struct l2cap_chan *chan)
return;
}
- if (chan->state == BT_CONNECT)
- l2cap_le_connect(chan);
+ if (chan->state == BT_CONNECT) {
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL)
+ l2cap_ecred_connect(chan);
+ else
+ l2cap_le_connect(chan);
+ }
}
static void l2cap_start_connection(struct l2cap_chan *chan)
@@ -1737,9 +1889,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
l2cap_chan_del(chan, err);
- l2cap_chan_unlock(chan);
-
chan->ops->close(chan);
+
+ l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
}
@@ -2505,6 +2657,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
switch (chan->mode) {
case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
/* Check outgoing MTU */
if (len > chan->omtu)
return -EMSGSIZE;
@@ -3773,6 +3926,45 @@ void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan)
&rsp);
}
+/* Send the deferred, successful enhanced credit based connection
+ * response for the request identified by chan->ident.
+ *
+ * MTU, MPS and credits are taken from @chan. The SCID of every channel
+ * on the connection that carries the same ident is reported as a DCID,
+ * and each of those channels has its ident reset. Does nothing when
+ * @chan has no ident.
+ */
+void __l2cap_ecred_conn_rsp_defer(struct l2cap_chan *chan)
+{
+ struct {
+ struct l2cap_ecred_conn_rsp rsp;
+ __le16 dcid[5];
+ } __packed pdu;
+ struct l2cap_conn *conn = chan->conn;
+ u16 ident = chan->ident;
+ int i = 0;
+
+ if (!ident)
+ return;
+
+ BT_DBG("chan %p ident %d", chan, ident);
+
+ pdu.rsp.mtu = cpu_to_le16(chan->imtu);
+ pdu.rsp.mps = cpu_to_le16(chan->mps);
+ pdu.rsp.credits = cpu_to_le16(chan->rx_credits);
+ pdu.rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS);
+
+ mutex_lock(&conn->chan_lock);
+
+ /* From here on chan is reused as the list cursor */
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ if (chan->ident != ident)
+ continue;
+
+ /* Reset ident so only one response is sent */
+ chan->ident = 0;
+
+ /* Include all channels pending with the same ident.
+ * NOTE(review): i is not checked against the five dcid slots;
+ * presumably the request path limits how many channels share
+ * an ident - confirm.
+ */
+ pdu.dcid[i++] = cpu_to_le16(chan->scid);
+ }
+
+ mutex_unlock(&conn->chan_lock);
+
+ /* Only the header plus the DCIDs actually filled in are sent */
+ l2cap_send_cmd(conn, ident, L2CAP_ECRED_CONN_RSP,
+ sizeof(pdu.rsp) + i * sizeof(__le16), &pdu);
+}
+
void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
{
struct l2cap_conn_rsp rsp;
@@ -4181,7 +4373,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
return 0;
}
- if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2) {
+ if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2 &&
+ chan->state != BT_CONNECTED) {
cmd_reject_invalid_cid(conn, cmd->ident, chan->scid,
chan->dcid);
goto unlock;
@@ -4405,6 +4598,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn,
return 0;
}
+ l2cap_chan_hold(chan);
l2cap_chan_lock(chan);
rsp.dcid = cpu_to_le16(chan->scid);
@@ -4413,12 +4607,11 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn,
chan->ops->set_shutdown(chan);
- l2cap_chan_hold(chan);
l2cap_chan_del(chan, ECONNRESET);
- l2cap_chan_unlock(chan);
-
chan->ops->close(chan);
+
+ l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
mutex_unlock(&conn->chan_lock);
@@ -4450,20 +4643,21 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn,
return 0;
}
+ l2cap_chan_hold(chan);
l2cap_chan_lock(chan);
if (chan->state != BT_DISCONN) {
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
mutex_unlock(&conn->chan_lock);
return 0;
}
- l2cap_chan_hold(chan);
l2cap_chan_del(chan, 0);
- l2cap_chan_unlock(chan);
-
chan->ops->close(chan);
+
+ l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
mutex_unlock(&conn->chan_lock);
@@ -5714,6 +5908,356 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn,
return 0;
}
+/* Handle an incoming L2CAP_ECRED_CONN_REQ (enhanced credit based connection
+ * request).
+ *
+ * The request carries a PSM, MTU, MPS and initial credits followed by a list
+ * of source CIDs. One channel is created per valid SCID on the listening
+ * channel found for the PSM; the response echoes one DCID slot per requested
+ * SCID (0x0000 for the ones that were refused).
+ *
+ * Returns -EINVAL when the mode is disabled through enable_ecred, 0
+ * otherwise. When at least one new channel has FLAG_DEFER_SETUP set no
+ * response is sent from here.
+ */
+static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
+				       struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+				       u8 *data)
+{
+	struct l2cap_ecred_conn_req *req = (void *) data;
+	struct {
+		struct l2cap_ecred_conn_rsp rsp;
+		__le16 dcid[5];
+	} __packed pdu;
+	struct l2cap_chan *chan, *pchan;
+	u16 mtu, mps;
+	__le16 psm;
+	u8 result, len = 0;
+	int i, num_scid;
+	bool defer = false;
+
+	if (!enable_ecred)
+		return -EINVAL;
+
+	/* Zero the response before the first "goto response" so that error
+	 * replies never carry uninitialised stack bytes.
+	 */
+	memset(&pdu, 0, sizeof(pdu));
+
+	/* The payload must be the fixed header plus a whole number of SCIDs.
+	 * The subtraction has to be parenthesised: '%' binds tighter than '-'.
+	 */
+	if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) {
+		result = L2CAP_CR_LE_INVALID_PARAMS;
+		goto response;
+	}
+
+	/* Strip the fixed header (the struct, not the pointer) to count SCIDs */
+	cmd_len -= sizeof(*req);
+	num_scid = cmd_len / sizeof(u16);
+
+	/* Never index past the DCID slots available in the response */
+	if (num_scid > ARRAY_SIZE(pdu.dcid)) {
+		result = L2CAP_CR_LE_INVALID_PARAMS;
+		goto response;
+	}
+
+	mtu = __le16_to_cpu(req->mtu);
+	mps = __le16_to_cpu(req->mps);
+
+	if (mtu < L2CAP_ECRED_MIN_MTU || mps < L2CAP_ECRED_MIN_MPS) {
+		result = L2CAP_CR_LE_UNACCEPT_PARAMS;
+		goto response;
+	}
+
+	psm = req->psm;
+
+	BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps);
+
+	/* Check if we have socket listening on psm */
+	pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
+					 &conn->hcon->dst, LE_LINK);
+	if (!pchan) {
+		result = L2CAP_CR_LE_BAD_PSM;
+		goto response;
+	}
+
+	mutex_lock(&conn->chan_lock);
+	l2cap_chan_lock(pchan);
+
+	if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
+				     SMP_ALLOW_STK)) {
+		result = L2CAP_CR_LE_AUTHENTICATION;
+		goto unlock;
+	}
+
+	result = L2CAP_CR_LE_SUCCESS;
+
+	for (i = 0; i < num_scid; i++) {
+		u16 scid = __le16_to_cpu(req->scid[i]);
+
+		BT_DBG("scid[%d] 0x%4.4x", i, scid);
+
+		/* Every requested SCID gets a slot; 0x0000 means refused */
+		pdu.dcid[i] = 0x0000;
+		len += sizeof(*pdu.dcid);
+
+		/* Check for valid dynamic CID range */
+		if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
+			result = L2CAP_CR_LE_INVALID_SCID;
+			continue;
+		}
+
+		/* Check if we already have channel with that dcid */
+		if (__l2cap_get_chan_by_dcid(conn, scid)) {
+			result = L2CAP_CR_LE_SCID_IN_USE;
+			continue;
+		}
+
+		chan = pchan->ops->new_connection(pchan);
+		if (!chan) {
+			result = L2CAP_CR_LE_NO_MEM;
+			continue;
+		}
+
+		bacpy(&chan->src, &conn->hcon->src);
+		bacpy(&chan->dst, &conn->hcon->dst);
+		chan->src_type = bdaddr_src_type(conn->hcon);
+		chan->dst_type = bdaddr_dst_type(conn->hcon);
+		chan->psm = psm;
+		chan->dcid = scid;
+		chan->omtu = mtu;
+		chan->remote_mps = mps;
+
+		__l2cap_chan_add(conn, chan);
+
+		l2cap_ecred_init(chan, __le16_to_cpu(req->credits));
+
+		/* Init response: filled once, from the first accepted channel */
+		if (!pdu.rsp.credits) {
+			pdu.rsp.mtu = cpu_to_le16(chan->imtu);
+			pdu.rsp.mps = cpu_to_le16(chan->mps);
+			pdu.rsp.credits = cpu_to_le16(chan->rx_credits);
+		}
+
+		pdu.dcid[i] = cpu_to_le16(chan->scid);
+
+		__set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+		/* Remember the request ident on the channel */
+		chan->ident = cmd->ident;
+
+		if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+			l2cap_state_change(chan, BT_CONNECT2);
+			defer = true;
+			chan->ops->defer(chan);
+		} else {
+			l2cap_chan_ready(chan);
+		}
+	}
+
+unlock:
+	l2cap_chan_unlock(pchan);
+	mutex_unlock(&conn->chan_lock);
+	l2cap_chan_put(pchan);
+
+response:
+	pdu.rsp.result = cpu_to_le16(result);
+
+	/* With deferred setup no response is sent from this function */
+	if (defer)
+		return 0;
+
+	l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_CONN_RSP,
+		       sizeof(pdu.rsp) + len, &pdu);
+
+	return 0;
+}
+
+/* Handle an incoming L2CAP_ECRED_CONN_RSP.
+ *
+ * Walks every channel of the connection that is still pending on the ident
+ * of this response (mode L2CAP_MODE_EXT_FLOWCTL, not yet BT_CONNECTED) and
+ * consumes one DCID from the response per channel, in list order. Channels
+ * are deleted when no DCID is left for them, when the DCID is already in
+ * use, or when the result/DCID indicates a refusal; otherwise the channel is
+ * configured from the response and marked ready.
+ *
+ * Returns -EPROTO when the payload is shorter than the fixed response
+ * header, 0 otherwise.
+ */
+static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn,
+				       struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+				       u8 *data)
+{
+	struct l2cap_ecred_conn_rsp *rsp = (void *) data;
+	struct hci_conn *hcon = conn->hcon;
+	u16 mtu, mps, credits, result;
+	struct l2cap_chan *chan, *tmp;
+	int err = 0, sec_level;
+	int i = 0;
+
+	if (cmd_len < sizeof(*rsp))
+		return -EPROTO;
+
+	mtu = __le16_to_cpu(rsp->mtu);
+	mps = __le16_to_cpu(rsp->mps);
+	credits = __le16_to_cpu(rsp->credits);
+	result = __le16_to_cpu(rsp->result);
+
+	BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits,
+	       result);
+
+	mutex_lock(&conn->chan_lock);
+
+	/* From here on cmd_len counts only the remaining DCID bytes */
+	cmd_len -= sizeof(*rsp);
+
+	/* Channels may be deleted while walking the list, so the next entry
+	 * has to be fetched before the loop body runs.
+	 */
+	list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+		u16 dcid;
+
+		if (chan->ident != cmd->ident ||
+		    chan->mode != L2CAP_MODE_EXT_FLOWCTL ||
+		    chan->state == BT_CONNECTED)
+			continue;
+
+		l2cap_chan_lock(chan);
+
+		/* Check that there is a dcid for each pending channel */
+		if (cmd_len < sizeof(dcid)) {
+			l2cap_chan_del(chan, ECONNREFUSED);
+			l2cap_chan_unlock(chan);
+			continue;
+		}
+
+		dcid = __le16_to_cpu(rsp->dcid[i++]);
+		cmd_len -= sizeof(u16);
+
+		BT_DBG("dcid[%d] 0x%4.4x", i, dcid);
+
+		/* Check if dcid is already in use */
+		if (dcid && __l2cap_get_chan_by_dcid(conn, dcid)) {
+			/* If a device receives a
+			 * L2CAP_CREDIT_BASED_CONNECTION_RSP packet with an
+			 * already-assigned Destination CID, then both the
+			 * original channel and the new channel shall be
+			 * immediately discarded and not used.
+			 */
+			l2cap_chan_del(chan, ECONNREFUSED);
+			l2cap_chan_unlock(chan);
+			chan = __l2cap_get_chan_by_dcid(conn, dcid);
+			l2cap_chan_lock(chan);
+			l2cap_chan_del(chan, ECONNRESET);
+			l2cap_chan_unlock(chan);
+			continue;
+		}
+
+		switch (result) {
+		case L2CAP_CR_LE_AUTHENTICATION:
+		case L2CAP_CR_LE_ENCRYPTION:
+			/* If we already have MITM protection we can't do
+			 * anything.
+			 */
+			if (hcon->sec_level > BT_SECURITY_MEDIUM) {
+				l2cap_chan_del(chan, ECONNREFUSED);
+				break;
+			}
+
+			sec_level = hcon->sec_level + 1;
+			if (chan->sec_level < sec_level)
+				chan->sec_level = sec_level;
+
+			/* We'll need to send a new Connect Request */
+			clear_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags);
+
+			smp_conn_security(hcon, chan->sec_level);
+			break;
+
+		case L2CAP_CR_LE_BAD_PSM:
+			l2cap_chan_del(chan, ECONNREFUSED);
+			break;
+
+		default:
+			/* If dcid was not set it means the channel was refused */
+			if (!dcid) {
+				l2cap_chan_del(chan, ECONNREFUSED);
+				break;
+			}
+
+			/* Accepted: take over the peer's parameters */
+			chan->ident = 0;
+			chan->dcid = dcid;
+			chan->omtu = mtu;
+			chan->remote_mps = mps;
+			chan->tx_credits = credits;
+			l2cap_chan_ready(chan);
+			break;
+		}
+
+		l2cap_chan_unlock(chan);
+	}
+
+	mutex_unlock(&conn->chan_lock);
+
+	return err;
+}
+
+/* Handle an incoming L2CAP_ECRED_RECONF_REQ.
+ *
+ * The request carries a new MTU and MPS followed by the list of SCIDs they
+ * apply to. Each listed channel that is found on the connection gets its
+ * outgoing MTU and remote MPS updated; a decreased MTU is reported back as
+ * L2CAP_RECONF_INVALID_MTU.
+ *
+ * Returns -EINVAL when the mode is disabled through enable_ecred, -EPROTO
+ * when a listed SCID is zero, 0 otherwise (a response has been sent).
+ */
+static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn,
+					 struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+					 u8 *data)
+{
+	struct l2cap_ecred_reconf_req *req = (void *) data;
+	struct l2cap_ecred_reconf_rsp rsp;
+	u16 mtu, mps, result;
+	struct l2cap_chan *chan;
+	int i, num_scid;
+
+	if (!enable_ecred)
+		return -EINVAL;
+
+	/* The payload must be the fixed header plus a whole number of SCIDs.
+	 * The subtraction has to be parenthesised: '%' binds tighter than '-'.
+	 */
+	if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) {
+		result = L2CAP_CR_LE_INVALID_PARAMS;
+		goto respond;
+	}
+
+	mtu = __le16_to_cpu(req->mtu);
+	mps = __le16_to_cpu(req->mps);
+
+	BT_DBG("mtu %u mps %u", mtu, mps);
+
+	if (mtu < L2CAP_ECRED_MIN_MTU) {
+		result = L2CAP_RECONF_INVALID_MTU;
+		goto respond;
+	}
+
+	if (mps < L2CAP_ECRED_MIN_MPS) {
+		result = L2CAP_RECONF_INVALID_MPS;
+		goto respond;
+	}
+
+	/* What remains after the fixed header is the SCID list */
+	cmd_len -= sizeof(*req);
+	num_scid = cmd_len / sizeof(u16);
+	result = L2CAP_RECONF_SUCCESS;
+
+	for (i = 0; i < num_scid; i++) {
+		u16 scid;
+
+		scid = __le16_to_cpu(req->scid[i]);
+		if (!scid)
+			return -EPROTO;
+
+		/* The peer's SCID is our DCID; unknown channels are skipped */
+		chan = __l2cap_get_chan_by_dcid(conn, scid);
+		if (!chan)
+			continue;
+
+		/* If the MTU value is decreased for any of the included
+		 * channels, then the receiver shall disconnect all
+		 * included channels.
+		 */
+		if (chan->omtu > mtu) {
+			BT_ERR("chan %p decreased MTU %u -> %u", chan,
+			       chan->omtu, mtu);
+			result = L2CAP_RECONF_INVALID_MTU;
+		}
+
+		chan->omtu = mtu;
+		chan->remote_mps = mps;
+	}
+
+respond:
+	rsp.result = cpu_to_le16(result);
+
+	l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_RECONF_RSP, sizeof(rsp),
+		       &rsp);
+
+	return 0;
+}
+
+/* Handle an incoming L2CAP_ECRED_RECONF_RSP.
+ *
+ * A zero result means the reconfiguration was accepted and nothing has to
+ * be done. Any other result deletes every channel of the connection that is
+ * still tagged with the ident of this response.
+ *
+ * Returns -EPROTO when the payload is too short to hold the result, 0
+ * otherwise.
+ */
+static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn,
+					 struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+					 u8 *data)
+{
+	struct l2cap_chan *chan, *tmp;
+	/* Parse as a reconfigure response: it only carries the result, so the
+	 * length check and the field offset must not be taken from the larger
+	 * connection response layout.
+	 */
+	struct l2cap_ecred_reconf_rsp *rsp = (void *) data;
+	u16 result;
+
+	if (cmd_len < sizeof(*rsp))
+		return -EPROTO;
+
+	result = __le16_to_cpu(rsp->result);
+
+	/* Log the host-order value, not the raw little-endian field */
+	BT_DBG("result 0x%4.4x", result);
+
+	if (!result)
+		return 0;
+
+	/* Channels are deleted while walking the list, so the next entry has
+	 * to be fetched before the loop body runs.
+	 */
+	list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+		if (chan->ident != cmd->ident)
+			continue;
+
+		l2cap_chan_del(chan, ECONNRESET);
+	}
+
+	return 0;
+}
+
static inline int l2cap_le_command_rej(struct l2cap_conn *conn,
struct l2cap_cmd_hdr *cmd, u16 cmd_len,
u8 *data)
@@ -5769,6 +6313,22 @@ static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn,
err = l2cap_le_credits(conn, cmd, cmd_len, data);
break;
+ case L2CAP_ECRED_CONN_REQ:
+ err = l2cap_ecred_conn_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_CONN_RSP:
+ err = l2cap_ecred_conn_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_RECONF_REQ:
+ err = l2cap_ecred_reconf_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_RECONF_RSP:
+ err = l2cap_ecred_reconf_rsp(conn, cmd, cmd_len, data);
+ break;
+
case L2CAP_DISCONN_REQ:
err = l2cap_disconnect_req(conn, cmd, cmd_len, data);
break;
@@ -5831,9 +6391,7 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn,
struct sk_buff *skb)
{
struct hci_conn *hcon = conn->hcon;
- u8 *data = skb->data;
- int len = skb->len;
- struct l2cap_cmd_hdr cmd;
+ struct l2cap_cmd_hdr *cmd;
int err;
l2cap_raw_recv(conn, skb);
@@ -5841,35 +6399,34 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn,
if (hcon->type != ACL_LINK)
goto drop;
- while (len >= L2CAP_CMD_HDR_SIZE) {
- u16 cmd_len;
- memcpy(&cmd, data, L2CAP_CMD_HDR_SIZE);
- data += L2CAP_CMD_HDR_SIZE;
- len -= L2CAP_CMD_HDR_SIZE;
+ while (skb->len >= L2CAP_CMD_HDR_SIZE) {
+ u16 len;
+
+ cmd = (void *) skb->data;
+ skb_pull(skb, L2CAP_CMD_HDR_SIZE);
- cmd_len = le16_to_cpu(cmd.len);
+ len = le16_to_cpu(cmd->len);
- BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd.code, cmd_len,
- cmd.ident);
+ BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd->code, len,
+ cmd->ident);
- if (cmd_len > len || !cmd.ident) {
+ if (len > skb->len || !cmd->ident) {
BT_DBG("corrupted command");
break;
}
- err = l2cap_bredr_sig_cmd(conn, &cmd, cmd_len, data);
+ err = l2cap_bredr_sig_cmd(conn, cmd, len, skb->data);
if (err) {
struct l2cap_cmd_rej_unk rej;
BT_ERR("Wrong link type (%d)", err);
rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD);
- l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ,
+ l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ,
sizeof(rej), &rej);
}
- data += cmd_len;
- len -= cmd_len;
+ skb_pull(skb, len);
}
drop:
@@ -6814,11 +7371,13 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
struct l2cap_le_credits pkt;
u16 return_credits;
- return_credits = ((chan->imtu / chan->mps) + 1) - chan->rx_credits;
+ return_credits = (chan->imtu / chan->mps) + 1;
- if (!return_credits)
+ if (chan->rx_credits >= return_credits)
return;
+ return_credits -= chan->rx_credits;
+
BT_DBG("chan %p returning %u credits to sender", chan, return_credits);
chan->rx_credits += return_credits;
@@ -6831,7 +7390,7 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt);
}
-static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb)
+static int l2cap_ecred_recv(struct l2cap_chan *chan, struct sk_buff *skb)
{
int err;
@@ -6846,7 +7405,7 @@ static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb)
return err;
}
-static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
+static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
{
int err;
@@ -6894,7 +7453,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
}
if (skb->len == sdu_len)
- return l2cap_le_recv(chan, skb);
+ return l2cap_ecred_recv(chan, skb);
chan->sdu = skb;
chan->sdu_len = sdu_len;
@@ -6926,7 +7485,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
skb = NULL;
if (chan->sdu->len == chan->sdu_len) {
- err = l2cap_le_recv(chan, chan->sdu);
+ err = l2cap_ecred_recv(chan, chan->sdu);
if (!err) {
chan->sdu = NULL;
chan->sdu_last_frag = NULL;
@@ -6987,7 +7546,8 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid,
switch (chan->mode) {
case L2CAP_MODE_LE_FLOWCTL:
- if (l2cap_le_data_rcv(chan, skb) < 0)
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (l2cap_ecred_data_rcv(chan, skb) < 0)
goto drop;
goto done;
@@ -7206,6 +7766,33 @@ static bool is_valid_psm(u16 psm, u8 dst_type) {
return ((psm & 0x0101) == 0x0001);
}
+/* Context handed to l2cap_chan_by_pid() through its void *data argument
+ * while counting channels.
+ */
+struct l2cap_chan_data {
+ /* Reference channel: skipped by the callback, its PSM is the one matched */
+ struct l2cap_chan *chan;
+ /* Peer PID that a channel must report through get_peer_pid() to count */
+ struct pid *pid;
+ /* Running number of matching channels, incremented by the callback */
+ int count;
+};
+
+/* Per-channel callback: bump the counter in @data (a struct
+ * l2cap_chan_data) for every other deferred-setup channel that shares the
+ * reference channel's peer PID and PSM and is still waiting to connect.
+ */
+static void l2cap_chan_by_pid(struct l2cap_chan *chan, void *data)
+{
+	struct l2cap_chan_data *match = data;
+	struct pid *peer;
+
+	/* The reference channel itself and non-deferred channels never count */
+	if (chan == match->chan || !test_bit(FLAG_DEFER_SETUP, &chan->flags))
+		return;
+
+	peer = chan->ops->get_peer_pid(chan);
+
+	/* Only count deferred channels with the same PID/PSM */
+	if (match->pid == peer && chan->psm == match->chan->psm &&
+	    !chan->ident && chan->mode == L2CAP_MODE_EXT_FLOWCTL &&
+	    chan->state == BT_CONNECT)
+		match->count++;
+}
+
int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
bdaddr_t *dst, u8 dst_type)
{
@@ -7214,8 +7801,8 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
struct hci_dev *hdev;
int err;
- BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst,
- dst_type, __le16_to_cpu(psm));
+ BT_DBG("%pMR -> %pMR (type %u) psm 0x%4.4x mode 0x%2.2x", &chan->src,
+ dst, dst_type, __le16_to_cpu(psm), chan->mode);
hdev = hci_get_route(dst, &chan->src, chan->src_type);
if (!hdev)
@@ -7244,6 +7831,12 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
break;
case L2CAP_MODE_LE_FLOWCTL:
break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!enable_ecred) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+ break;
case L2CAP_MODE_ERTM:
case L2CAP_MODE_STREAMING:
if (!disable_ertm)
@@ -7319,6 +7912,23 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
goto done;
}
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL) {
+ struct l2cap_chan_data data;
+
+ data.chan = chan;
+ data.pid = chan->ops->get_peer_pid(chan);
+ data.count = 1;
+
+ l2cap_chan_list(conn, l2cap_chan_by_pid, &data);
+
+ /* Check if there isn't too many channels being connected */
+ if (data.count > L2CAP_ECRED_CONN_SCID_MAX) {
+ hci_conn_drop(hcon);
+ err = -EPROTO;
+ goto done;
+ }
+ }
+
mutex_lock(&conn->chan_lock);
l2cap_chan_lock(chan);
@@ -7368,6 +7978,38 @@ done:
}
EXPORT_SYMBOL_GPL(l2cap_chan_connect);
+/* Send an L2CAP_ECRED_RECONF_REQ announcing the channel's current incoming
+ * MTU and MPS for its own SCID. The ident used for the request is stored on
+ * the channel.
+ */
+static void l2cap_ecred_reconfigure(struct l2cap_chan *chan)
+{
+	struct {
+		struct l2cap_ecred_reconf_req req;
+		__le16 scid;
+	} pdu;
+	struct l2cap_conn *conn = chan->conn;
+	struct l2cap_ecred_reconf_req *req = &pdu.req;
+
+	/* Single-channel request: fixed header followed by one SCID */
+	req->mtu = cpu_to_le16(chan->imtu);
+	req->mps = cpu_to_le16(chan->mps);
+	pdu.scid = cpu_to_le16(chan->scid);
+
+	chan->ident = l2cap_get_ident(conn);
+
+	l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_RECONF_REQ,
+		       sizeof(pdu), &pdu);
+}
+
+/* Raise the incoming MTU of @chan to @mtu and send a reconfigure request
+ * for it.
+ *
+ * Returns -EINVAL when @mtu is smaller than the current incoming MTU, 0
+ * once the request has been sent.
+ */
+int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu)
+{
+	/* The incoming MTU may only stay the same or grow */
+	if (mtu < chan->imtu)
+		return -EINVAL;
+
+	BT_DBG("chan %p mtu 0x%4.4x", chan, mtu);
+
+	chan->imtu = mtu;
+	l2cap_ecred_reconfigure(chan);
+
+	return 0;
+}
+
/* ---- L2CAP interface with lower layer (HCI) ---- */
int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr)
@@ -7579,7 +8221,8 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
else
__set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
} else if (chan->state == BT_CONNECT2 &&
- chan->mode != L2CAP_MODE_LE_FLOWCTL) {
+ !(chan->mode == L2CAP_MODE_EXT_FLOWCTL ||
+ chan->mode == L2CAP_MODE_LE_FLOWCTL)) {
struct l2cap_conn_rsp rsp;
__u16 res, stat;
@@ -7787,3 +8430,6 @@ void l2cap_exit(void)
module_param(disable_ertm, bool, 0644);
MODULE_PARM_DESC(disable_ertm, "Disable enhanced retransmission mode");
+
+module_param(enable_ecred, bool, 0644);
+MODULE_PARM_DESC(enable_ecred, "Enable enhanced credit flow control mode");
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a7be8b59b3c2..117ba20ea194 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -232,7 +232,7 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr,
return -EINVAL;
}
- if (chan->psm && bdaddr_type_is_le(chan->src_type))
+ if (chan->psm && bdaddr_type_is_le(chan->src_type) && !chan->mode)
chan->mode = L2CAP_MODE_LE_FLOWCTL;
err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid),
@@ -274,6 +274,12 @@ static int l2cap_sock_listen(struct socket *sock, int backlog)
case L2CAP_MODE_BASIC:
case L2CAP_MODE_LE_FLOWCTL:
break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!enable_ecred) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+ break;
case L2CAP_MODE_ERTM:
case L2CAP_MODE_STREAMING:
if (!disable_ertm)
@@ -427,6 +433,8 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
opts.max_tx = chan->max_tx;
opts.txwin_size = chan->tx_win;
+ BT_DBG("mode 0x%2.2x", chan->mode);
+
len = min_t(unsigned int, len, sizeof(opts));
if (copy_to_user(optval, (char *) &opts, len))
err = -EFAULT;
@@ -499,6 +507,7 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname,
struct l2cap_chan *chan = l2cap_pi(sk)->chan;
struct bt_security sec;
struct bt_power pwr;
+ u32 phys;
int len, err = 0;
BT_DBG("sk %p", sk);
@@ -603,6 +612,18 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname,
err = -EFAULT;
break;
+ case BT_PHY:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ phys = hci_conn_get_phy(chan->conn->hcon);
+
+ if (put_user(phys, (u32 __user *) optval))
+ err = -EFAULT;
+ break;
+
default:
err = -ENOPROTOOPT;
break;
@@ -694,6 +715,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
break;
}
+ BT_DBG("mode 0x%2.2x", chan->mode);
+
chan->imtu = opts.imtu;
chan->omtu = opts.omtu;
chan->fcs = opts.fcs;
@@ -926,7 +949,8 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
break;
}
- if (sk->sk_state == BT_CONNECTED) {
+ if (chan->mode == L2CAP_MODE_LE_FLOWCTL &&
+ sk->sk_state == BT_CONNECTED) {
err = -EISCONN;
break;
}
@@ -936,7 +960,12 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
break;
}
- chan->imtu = opt;
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL &&
+ sk->sk_state == BT_CONNECTED)
+ err = l2cap_chan_reconfigure(chan, opt);
+ else
+ chan->imtu = opt;
+
break;
default:
@@ -991,7 +1020,11 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP,
&bt_sk(sk)->flags)) {
- if (bdaddr_type_is_le(pi->chan->src_type)) {
+ if (pi->chan->mode == L2CAP_MODE_EXT_FLOWCTL) {
+ sk->sk_state = BT_CONNECTED;
+ pi->chan->state = BT_CONNECTED;
+ __l2cap_ecred_conn_rsp_defer(pi->chan);
+ } else if (bdaddr_type_is_le(pi->chan->src_type)) {
sk->sk_state = BT_CONNECTED;
pi->chan->state = BT_CONNECTED;
__l2cap_le_connect_rsp_defer(pi->chan);
@@ -1042,7 +1075,7 @@ done:
}
/* Kill socket (only if zapped and orphan)
- * Must be called on unlocked socket.
+ * Must be called on unlocked socket, with l2cap channel lock.
*/
static void l2cap_sock_kill(struct sock *sk)
{
@@ -1193,6 +1226,7 @@ static int l2cap_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
int err;
+ struct l2cap_chan *chan;
BT_DBG("sock %p, sk %p", sock, sk);
@@ -1202,9 +1236,17 @@ static int l2cap_sock_release(struct socket *sock)
bt_sock_unlink(&l2cap_sk_list, sk);
err = l2cap_sock_shutdown(sock, 2);
+ chan = l2cap_pi(sk)->chan;
+
+ l2cap_chan_hold(chan);
+ l2cap_chan_lock(chan);
sock_orphan(sk);
l2cap_sock_kill(sk);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
return err;
}
@@ -1222,12 +1264,15 @@ static void l2cap_sock_cleanup_listen(struct sock *parent)
BT_DBG("child chan %p state %s", chan,
state_to_string(chan->state));
+ l2cap_chan_hold(chan);
l2cap_chan_lock(chan);
+
__clear_chan_timer(chan);
l2cap_chan_close(chan, ECONNRESET);
- l2cap_chan_unlock(chan);
-
l2cap_sock_kill(sk);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
}
}
@@ -1459,6 +1504,13 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan)
return sk->sk_sndtimeo;
}
+/* get_peer_pid channel op: return sk_peer_pid of the socket stored in
+ * chan->data.
+ */
+static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan)
+{
+	return ((struct sock *) chan->data)->sk_peer_pid;
+}
+
static void l2cap_sock_suspend_cb(struct l2cap_chan *chan)
{
struct sock *sk = chan->data;
@@ -1480,6 +1532,7 @@ static const struct l2cap_ops l2cap_chan_ops = {
.suspend = l2cap_sock_suspend_cb,
.set_shutdown = l2cap_sock_set_shutdown_cb,
.get_sndtimeo = l2cap_sock_get_sndtimeo_cb,
+ .get_peer_pid = l2cap_sock_get_peer_pid_cb,
.alloc_skb = l2cap_sock_alloc_skb_cb,
};
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 3074363c68df..6552003a170e 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
#include "mgmt_util.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 15
+#define MGMT_REVISION 16
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -107,6 +107,7 @@ static const u16 mgmt_commands[] = {
MGMT_OP_READ_EXT_INFO,
MGMT_OP_SET_APPEARANCE,
MGMT_OP_SET_BLOCKED_KEYS,
+ MGMT_OP_SET_WIDEBAND_SPEECH,
};
static const u16 mgmt_events[] = {
@@ -762,6 +763,10 @@ static u32 get_supported_settings(struct hci_dev *hdev)
if (lmp_sc_capable(hdev))
settings |= MGMT_SETTING_SECURE_CONN;
+
+ if (test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
+ &hdev->quirks))
+ settings |= MGMT_SETTING_WIDEBAND_SPEECH;
}
if (lmp_le_capable(hdev)) {
@@ -846,6 +851,9 @@ static u32 get_current_settings(struct hci_dev *hdev)
settings |= MGMT_SETTING_STATIC_ADDRESS;
}
+ if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED))
+ settings |= MGMT_SETTING_WIDEBAND_SPEECH;
+
return settings;
}
@@ -1382,6 +1390,12 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
goto failed;
}
+ if (hdev->advertising_paused) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
if (!hdev_is_powered(hdev)) {
bool changed = false;
@@ -3589,6 +3603,62 @@ static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data,
err, NULL, 0);
}
+/* MGMT_OP_SET_WIDEBAND_SPEECH handler: set or clear the
+ * HCI_WIDEBAND_SPEECH_ENABLED flag on @hdev.
+ *
+ * The command is answered with a status when the controller lacks
+ * HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, when the value is neither 0x00 nor
+ * 0x01, when the same command is already pending, or when the device is
+ * powered and the request would change the current setting. Otherwise the
+ * flag is updated, the settings response is sent and, if the flag actually
+ * changed, new settings are announced.
+ */
+static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev,
+			       void *data, u16 len)
+{
+	struct mgmt_mode *cp = data;
+	bool enable, changed;
+	int err;
+
+	BT_DBG("request for %s", hdev->name);
+
+	if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks))
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_WIDEBAND_SPEECH,
+				       MGMT_STATUS_NOT_SUPPORTED);
+
+	if (cp->val != 0x00 && cp->val != 0x01)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_WIDEBAND_SPEECH,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	enable = cp->val;
+
+	hci_dev_lock(hdev);
+
+	if (pending_find(MGMT_OP_SET_WIDEBAND_SPEECH, hdev)) {
+		err = mgmt_cmd_status(sk, hdev->id,
+				      MGMT_OP_SET_WIDEBAND_SPEECH,
+				      MGMT_STATUS_BUSY);
+		goto unlock;
+	}
+
+	/* While powered, only a request matching the current state passes */
+	if (hdev_is_powered(hdev) &&
+	    enable != hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) {
+		err = mgmt_cmd_status(sk, hdev->id,
+				      MGMT_OP_SET_WIDEBAND_SPEECH,
+				      MGMT_STATUS_REJECTED);
+		goto unlock;
+	}
+
+	changed = enable ?
+		!hci_dev_test_and_set_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED) :
+		hci_dev_test_and_clear_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED);
+
+	err = send_settings_rsp(sk, MGMT_OP_SET_WIDEBAND_SPEECH, hdev);
+	if (err >= 0 && changed)
+		err = new_settings(hdev, sk);
+
+unlock:
+	hci_dev_unlock(hdev);
+	return err;
+}
+
static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -3865,6 +3935,13 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status)
}
hci_dev_unlock(hdev);
+
+ /* Handle suspend notifier */
+ if (test_and_clear_bit(SUSPEND_UNPAUSE_DISCOVERY,
+ hdev->suspend_tasks)) {
+ bt_dev_dbg(hdev, "Unpaused discovery");
+ wake_up(&hdev->suspend_wait_q);
+ }
}
static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type,
@@ -3926,6 +4003,13 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev,
goto failed;
}
+ /* Can't start discovery when it is paused */
+ if (hdev->discovery_paused) {
+ err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
/* Clear the discovery filter first to free any previously
* allocated memory for the UUID list.
*/
@@ -4093,6 +4177,12 @@ void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status)
}
hci_dev_unlock(hdev);
+
+ /* Handle suspend notifier */
+ if (test_and_clear_bit(SUSPEND_PAUSE_DISCOVERY, hdev->suspend_tasks)) {
+ bt_dev_dbg(hdev, "Paused discovery");
+ wake_up(&hdev->suspend_wait_q);
+ }
}
static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -4324,6 +4414,17 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status,
if (match.sk)
sock_put(match.sk);
+ /* Handle suspend notifier */
+ if (test_and_clear_bit(SUSPEND_PAUSE_ADVERTISING,
+ hdev->suspend_tasks)) {
+ bt_dev_dbg(hdev, "Paused advertising");
+ wake_up(&hdev->suspend_wait_q);
+ } else if (test_and_clear_bit(SUSPEND_UNPAUSE_ADVERTISING,
+ hdev->suspend_tasks)) {
+ bt_dev_dbg(hdev, "Unpaused advertising");
+ wake_up(&hdev->suspend_wait_q);
+ }
+
/* If "Set Advertising" was just disabled and instance advertising was
* set up earlier, then re-enable multi-instance advertising.
*/
@@ -4375,6 +4476,10 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
+ if (hdev->advertising_paused)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ MGMT_STATUS_BUSY);
+
hci_dev_lock(hdev);
val = !!cp->val;
@@ -6743,8 +6848,11 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
if (!err)
err = hci_req_run(&req, add_advertising_complete);
- if (err < 0)
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_FAILED);
mgmt_pending_remove(cmd);
+ }
unlock:
hci_dev_unlock(hdev);
@@ -6990,6 +7098,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE },
{ set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE,
HCI_MGMT_VAR_LEN },
+ { set_wideband_speech, MGMT_SETTING_SIZE },
};
void mgmt_index_added(struct hci_dev *hdev)
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 3a9e9d9670be..2e20af317cea 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -40,7 +40,6 @@
static bool disable_cfc;
static bool l2cap_ertm;
static int channel_mtu = -1;
-static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU;
static struct task_struct *rfcomm_thread;
@@ -73,8 +72,6 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s);
/* ---- RFCOMM frame parsing macros ---- */
#define __get_dlci(b) ((b & 0xfc) >> 2)
-#define __get_channel(b) ((b & 0xf8) >> 3)
-#define __get_dir(b) ((b & 0x04) >> 2)
#define __get_type(b) ((b & 0xef))
#define __test_ea(b) ((b & 0x01))
@@ -87,7 +84,6 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s);
#define __ctrl(type, pf) (((type & 0xef) | (pf << 4)))
#define __dlci(dir, chn) (((chn & 0x1f) << 1) | dir)
#define __srv_channel(dlci) (dlci >> 1)
-#define __dir(dlci) (dlci & 0x01)
#define __len8(len) (((len) << 1) | 1)
#define __len16(len) ((len) << 1)
@@ -752,7 +748,8 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
/* Set L2CAP options */
sk = sock->sk;
lock_sock(sk);
- l2cap_pi(sk)->chan->imtu = l2cap_mtu;
+ /* Set MTU to 0 so L2CAP can auto select the MTU */
+ l2cap_pi(sk)->chan->imtu = 0;
l2cap_pi(sk)->chan->sec_level = sec_level;
if (l2cap_ertm)
l2cap_pi(sk)->chan->mode = L2CAP_MODE_ERTM;
@@ -2039,7 +2036,8 @@ static int rfcomm_add_listener(bdaddr_t *ba)
/* Set L2CAP options */
sk = sock->sk;
lock_sock(sk);
- l2cap_pi(sk)->chan->imtu = l2cap_mtu;
+ /* Set MTU to 0 so L2CAP can auto select the MTU */
+ l2cap_pi(sk)->chan->imtu = 0;
release_sock(sk);
/* Start listening on the socket */
@@ -2237,9 +2235,6 @@ MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control");
module_param(channel_mtu, int, 0644);
MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel");
-module_param(l2cap_mtu, uint, 0644);
-MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection");
-
module_param(l2cap_ertm, bool, 0644);
MODULE_PARM_DESC(l2cap_ertm, "Use L2CAP ERTM mode for connection");
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 0c7d31c6c18c..a58584949a95 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -413,10 +413,8 @@ static int __rfcomm_create_dev(struct sock *sk, void __user *arg)
dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel);
if (IS_ERR(dlc))
return PTR_ERR(dlc);
- else if (dlc) {
- rfcomm_dlc_put(dlc);
+ if (dlc)
return -EBUSY;
- }
dlc = rfcomm_dlc_alloc(GFP_KERNEL);
if (!dlc)
return -ENOMEM;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index b91d6b440fdf..c8c3d38cdc7b 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -922,6 +922,7 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
struct sock *sk = sock->sk;
int len, err = 0;
struct bt_voice voice;
+ u32 phys;
BT_DBG("sk %p", sk);
@@ -956,6 +957,18 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
break;
+ case BT_PHY:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ phys = hci_conn_get_phy(sco_pi(sk)->conn->hcon);
+
+ if (put_user(phys, (u32 __user *) optval))
+ err = -EFAULT;
+ break;
+
default:
err = -ENOPROTOOPT;
break;
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 204f14f8b507..1476a91ce935 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -1145,7 +1145,7 @@ static void sc_generate_link_key(struct smp_chan *smp)
return;
if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
- /* SALT = 0x00000000000000000000000000000000746D7031 */
+ /* SALT = 0x000000000000000000000000746D7031 */
const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
@@ -1203,7 +1203,7 @@ static void sc_generate_ltk(struct smp_chan *smp)
set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
- /* SALT = 0x00000000000000000000000000000000746D7032 */
+ /* SALT = 0x000000000000000000000000746D7032 */
const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 };
if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk))
@@ -2115,7 +2115,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
struct l2cap_chan *chan = conn->smp;
struct smp_chan *smp = chan->data;
struct hci_conn *hcon = conn->hcon;
- u8 *pkax, *pkbx, *na, *nb;
+ u8 *pkax, *pkbx, *na, *nb, confirm_hint;
u32 passkey;
int err;
@@ -2168,6 +2168,24 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
smp->prnd);
SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+
+ /* Only Just-Works pairing requires extra checks */
+ if (smp->method != JUST_WORKS)
+ goto mackey_and_ltk;
+
+ /* If there already exists long term key in local host, leave
+ * the decision to user space since the remote device could
+ * be legitimate or malicious.
+ */
+ if (hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type,
+ hcon->role)) {
+ /* Set passkey to 0. The value can be any number since
+ * it'll be ignored anyway.
+ */
+ passkey = 0;
+ confirm_hint = 1;
+ goto confirm;
+ }
}
mackey_and_ltk:
@@ -2188,8 +2206,11 @@ mackey_and_ltk:
if (err)
return SMP_UNSPECIFIED;
+ confirm_hint = 0;
+
+confirm:
err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type,
- hcon->dst_type, passkey, 0);
+ hcon->dst_type, passkey, confirm_hint);
if (err)
return SMP_UNSPECIFIED;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index d555c0d8657d..29dbdd4c29f6 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -10,6 +10,7 @@
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
#include <net/tcp.h>
+#include <linux/error-injection.h>
#define CREATE_TRACE_POINTS
#include <trace/events/bpf_test_run.h>
@@ -37,7 +38,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
repeat = 1;
rcu_read_lock();
- preempt_disable();
+ migrate_disable();
time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) {
bpf_cgroup_storage_set(storage);
@@ -54,18 +55,18 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
if (need_resched()) {
time_spent += ktime_get_ns() - time_start;
- preempt_enable();
+ migrate_enable();
rcu_read_unlock();
cond_resched();
rcu_read_lock();
- preempt_disable();
+ migrate_disable();
time_start = ktime_get_ns();
}
}
time_spent += ktime_get_ns() - time_start;
- preempt_enable();
+ migrate_enable();
rcu_read_unlock();
do_div(time_spent, repeat);
@@ -113,6 +114,9 @@ out:
* architecture dependent calling conventions. 7+ can be supported in the
* future.
*/
+__diag_push();
+__diag_ignore(GCC, 8, "-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
int noinline bpf_fentry_test1(int a)
{
return a + 1;
@@ -143,6 +147,15 @@ int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f)
return a + (long)b + c + d + (long)e + f;
}
+/* Target function for the BPF_MODIFY_RETURN test run: increments *b by
+ * one (the observable side effect) and returns a plus the updated *b.
+ */
+int noinline bpf_modify_return_test(int a, int *b)
+{
+	int bumped = *b + 1;
+
+	*b = bumped;
+	return a + bumped;
+}
+__diag_pop();
+
+ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
+
static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
u32 headroom, u32 tailroom)
{
@@ -160,18 +173,48 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
kfree(data);
return ERR_PTR(-EFAULT);
}
- if (bpf_fentry_test1(1) != 2 ||
- bpf_fentry_test2(2, 3) != 5 ||
- bpf_fentry_test3(4, 5, 6) != 15 ||
- bpf_fentry_test4((void *)7, 8, 9, 10) != 34 ||
- bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 ||
- bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111) {
- kfree(data);
- return ERR_PTR(-EFAULT);
- }
+
return data;
}
+/* Test-run entry point for tracing programs.
+ *
+ * BPF_TRACE_FENTRY / BPF_TRACE_FEXIT: call bpf_fentry_test1..6 with fixed
+ * arguments and verify their results.
+ * BPF_MODIFY_RETURN: call bpf_modify_return_test(); its return value is
+ * reported in the low 16 bits of retval and bit 16 is set when the int
+ * passed by pointer no longer holds its initial value.
+ *
+ * Returns 0 on success.  -EFAULT is returned for any other attach type,
+ * for a failed self-check, or when retval cannot be copied to user space.
+ * trace_bpf_test_finish() is invoked on every path.
+ */
+int bpf_prog_test_run_tracing(struct bpf_prog *prog,
+			      const union bpf_attr *kattr,
+			      union bpf_attr __user *uattr)
+{
+	int err = -EFAULT;
+	u16 side_effect = 0;
+	u16 ret = 0;
+	u32 retval;
+	int b = 2;
+
+	switch (prog->expected_attach_type) {
+	case BPF_TRACE_FENTRY:
+	case BPF_TRACE_FEXIT:
+		/* Checked left to right, stopping at the first mismatch */
+		if (!(bpf_fentry_test1(1) == 2 &&
+		      bpf_fentry_test2(2, 3) == 5 &&
+		      bpf_fentry_test3(4, 5, 6) == 15 &&
+		      bpf_fentry_test4((void *)7, 8, 9, 10) == 34 &&
+		      bpf_fentry_test5(11, (void *)12, 13, 14, 15) == 65 &&
+		      bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) == 111))
+			goto out;
+		break;
+	case BPF_MODIFY_RETURN:
+		ret = bpf_modify_return_test(1, &b);
+		side_effect = (b != 2);
+		break;
+	default:
+		goto out;
+	}
+
+	/* low half: return value, bit 16: side-effect flag */
+	retval = ret | ((u32)side_effect << 16);
+	if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
+		goto out;
+
+	err = 0;
+out:
+	trace_bpf_test_finish(&err);
+	return err;
+}
+
static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size)
{
void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in);
@@ -277,6 +320,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
/* gso_segs is allowed */
if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs),
+ offsetof(struct __sk_buff, gso_size)))
+ return -EINVAL;
+
+ /* gso_size is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size),
sizeof(struct __sk_buff)))
return -EINVAL;
@@ -297,6 +346,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
if (__skb->gso_segs > GSO_MAX_SEGS)
return -EINVAL;
skb_shinfo(skb)->gso_segs = __skb->gso_segs;
+ skb_shinfo(skb)->gso_size = __skb->gso_size;
return 0;
}
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
index 77396a098fbe..05e1cfc1e5cd 100644
--- a/net/bpfilter/main.c
+++ b/net/bpfilter/main.c
@@ -10,7 +10,7 @@
#include <asm/unistd.h>
#include "msgfmt.h"
-int debug_fd;
+FILE *debug_f;
static int handle_get_cmd(struct mbox_request *cmd)
{
@@ -37,7 +37,7 @@ static void loop(void)
n = read(0, &req, sizeof(req));
if (n != sizeof(req)) {
- dprintf(debug_fd, "invalid request %d\n", n);
+ fprintf(debug_f, "invalid request %d\n", n);
return;
}
@@ -47,7 +47,7 @@ static void loop(void)
n = write(1, &reply, sizeof(reply));
if (n != sizeof(reply)) {
- dprintf(debug_fd, "reply failed %d\n", n);
+ fprintf(debug_f, "reply failed %d\n", n);
return;
}
}
@@ -55,9 +55,10 @@ static void loop(void)
int main(void)
{
- debug_fd = open("/dev/kmsg", 00000002);
- dprintf(debug_fd, "Started bpfilter\n");
+ debug_f = fopen("/dev/kmsg", "w");
+ /* fopen() returns NULL on failure and passing a NULL stream to
+  * setvbuf()/fprintf() is undefined behaviour, so exit cleanly
+  * instead of crashing when /dev/kmsg cannot be opened.
+  */
+ if (!debug_f)
+	return 1;
+ setvbuf(debug_f, 0, _IOLBF, 0);
+ fprintf(debug_f, "Started bpfilter\n");
loop();
- close(debug_fd);
+ fclose(debug_f);
return 0;
}
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index dc3d2c1dd9d5..0e3dbc5f3c34 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -34,7 +34,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
const struct nf_br_ops *nf_ops;
u8 state = BR_STATE_FORWARDING;
const unsigned char *dest;
- struct ethhdr *eth;
u16 vid = 0;
rcu_read_lock();
@@ -54,15 +53,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
skb_reset_mac_header(skb);
- eth = eth_hdr(skb);
skb_pull(skb, ETH_HLEN);
if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state))
goto out;
if (IS_ENABLED(CONFIG_INET) &&
- (eth->h_proto == htons(ETH_P_ARP) ||
- eth->h_proto == htons(ETH_P_RARP)) &&
+ (eth_hdr(skb)->h_proto == htons(ETH_P_ARP) ||
+ eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) &&
br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
br_do_proxy_suppress_arp(skb, br, vid, NULL);
} else if (IS_ENABLED(CONFIG_IPV6) &&
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
index afee292fb004..162998e2f039 100644
--- a/net/bridge/br_netlink_tunnel.c
+++ b/net/bridge/br_netlink_tunnel.c
@@ -26,8 +26,8 @@ static size_t __get_vlan_tinfo_size(void)
nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
}
-static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr,
- struct net_bridge_vlan *v_last)
+bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *v_last)
{
__be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
__be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
@@ -193,8 +193,8 @@ static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX +
[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
};
-static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd,
- u16 vid, u32 tun_id, bool *changed)
+int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id, bool *changed)
{
int err = 0;
@@ -250,8 +250,8 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr,
return 0;
}
-int br_process_vlan_tunnel_info(struct net_bridge *br,
- struct net_bridge_port *p, int cmd,
+int br_process_vlan_tunnel_info(const struct net_bridge *br,
+ const struct net_bridge_port *p, int cmd,
struct vtunnel_info *tinfo_curr,
struct vtunnel_info *tinfo_last,
bool *changed)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 5153ffe79a01..1f97703a52ff 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1199,8 +1199,8 @@ static inline void br_vlan_notify(const struct net_bridge *br,
/* br_vlan_options.c */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-bool br_vlan_opts_eq(const struct net_bridge_vlan *v1,
- const struct net_bridge_vlan *v2);
+bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end);
bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v);
size_t br_vlan_opts_nl_size(void);
int br_vlan_process_options(const struct net_bridge *br,
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
index 2bdef2ea3420..c54cc26211d7 100644
--- a/net/bridge/br_private_tunnel.h
+++ b/net/bridge/br_private_tunnel.h
@@ -18,8 +18,8 @@ struct vtunnel_info {
/* br_netlink_tunnel.c */
int br_parse_vlan_tunnel_info(struct nlattr *attr,
struct vtunnel_info *tinfo);
-int br_process_vlan_tunnel_info(struct net_bridge *br,
- struct net_bridge_port *p,
+int br_process_vlan_tunnel_info(const struct net_bridge *br,
+ const struct net_bridge_port *p,
int cmd,
struct vtunnel_info *tinfo_curr,
struct vtunnel_info *tinfo_last,
@@ -32,8 +32,9 @@ int br_fill_vlan_tunnel_info(struct sk_buff *skb,
/* br_vlan_tunnel.c */
int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
-int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid);
-int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
+int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid);
+int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid,
+ u32 tun_id);
void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
struct net_bridge_vlan *vlan);
@@ -42,19 +43,23 @@ int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
struct net_bridge_vlan_group *vg);
int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
struct net_bridge_vlan *vlan);
+bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *v_last);
+int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id, bool *changed);
#else
static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
{
return 0;
}
-static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port,
+static inline int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port,
u16 vid)
{
return 0;
}
-static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port,
+static inline int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port,
u16 vid, u32 tun_id)
{
return 0;
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 6856a6d9282b..1f14b8455345 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -63,7 +63,8 @@ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
{
struct net_bridge_port *p;
- list_for_each_entry_rcu(p, &br->port_list, list) {
+ list_for_each_entry_rcu(p, &br->port_list, list,
+ lockdep_is_held(&br->lock)) {
if (p->port_no == port_no)
return p;
}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 6b5deca08b89..f9092c71225f 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1569,10 +1569,41 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event)
}
}
+/* Add a BRIDGE_VLANDB_ENTRY_STATS nest to @skb carrying the rx/tx byte
+ * and packet counters obtained for @v via br_vlan_get_stats().
+ * Returns true on success; on failure the nest is cancelled and false
+ * is returned.
+ */
+static bool br_vlan_stats_fill(struct sk_buff *skb,
+			       const struct net_bridge_vlan *v)
+{
+	struct br_vlan_stats stats;
+	struct nlattr *stats_nest;
+
+	stats_nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS);
+	if (!stats_nest)
+		return false;
+
+	br_vlan_get_stats(v, &stats);
+
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES,
+			      stats.rx_bytes, BRIDGE_VLANDB_STATS_PAD))
+		goto cancel;
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS,
+			      stats.rx_packets, BRIDGE_VLANDB_STATS_PAD))
+		goto cancel;
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES,
+			      stats.tx_bytes, BRIDGE_VLANDB_STATS_PAD))
+		goto cancel;
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS,
+			      stats.tx_packets, BRIDGE_VLANDB_STATS_PAD))
+		goto cancel;
+
+	nla_nest_end(skb, stats_nest);
+	return true;
+
+cancel:
+	nla_nest_cancel(skb, stats_nest);
+	return false;
+}
+
/* v_opts is used to dump the options which must be equal in the whole range */
static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range,
const struct net_bridge_vlan *v_opts,
- u16 flags)
+ u16 flags,
+ bool dump_stats)
{
struct bridge_vlan_info info;
struct nlattr *nest;
@@ -1596,8 +1627,13 @@ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range,
nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range))
goto out_err;
- if (v_opts && !br_vlan_opts_fill(skb, v_opts))
- goto out_err;
+ if (v_opts) {
+ if (!br_vlan_opts_fill(skb, v_opts))
+ goto out_err;
+
+ if (dump_stats && !br_vlan_stats_fill(skb, v_opts))
+ goto out_err;
+ }
nla_nest_end(skb, nest);
@@ -1675,7 +1711,7 @@ void br_vlan_notify(const struct net_bridge *br,
goto out_kfree;
}
- if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags))
+ if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false))
goto out_err;
nlmsg_end(skb, nlh);
@@ -1694,14 +1730,16 @@ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
{
return v_curr->vid - range_end->vid == 1 &&
range_end->flags == v_curr->flags &&
- br_vlan_opts_eq(v_curr, range_end);
+ br_vlan_opts_eq_range(v_curr, range_end);
}
static int br_vlan_dump_dev(const struct net_device *dev,
struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ u32 dump_flags)
{
struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL;
+ bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS);
struct net_bridge_vlan_group *vg;
int idx = 0, s_idx = cb->args[1];
struct nlmsghdr *nlh = NULL;
@@ -1754,12 +1792,13 @@ static int br_vlan_dump_dev(const struct net_device *dev,
continue;
}
- if (v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) {
- u16 flags = br_vlan_flags(range_start, pvid);
+ if (dump_stats || v->vid == pvid ||
+ !br_vlan_can_enter_range(v, range_end)) {
+ u16 vlan_flags = br_vlan_flags(range_start, pvid);
if (!br_vlan_fill_vids(skb, range_start->vid,
range_end->vid, range_start,
- flags)) {
+ vlan_flags, dump_stats)) {
err = -EMSGSIZE;
break;
}
@@ -1778,7 +1817,8 @@ static int br_vlan_dump_dev(const struct net_device *dev,
*/
if (!err && range_start &&
!br_vlan_fill_vids(skb, range_start->vid, range_end->vid,
- range_start, br_vlan_flags(range_start, pvid)))
+ range_start, br_vlan_flags(range_start, pvid),
+ dump_stats))
err = -EMSGSIZE;
cb->args[1] = err ? idx : 0;
@@ -1788,18 +1828,27 @@ static int br_vlan_dump_dev(const struct net_device *dev,
return err;
}
+static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = {
+ [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 },
+};
+
static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1];
int idx = 0, err = 0, s_idx = cb->args[0];
struct net *net = sock_net(skb->sk);
struct br_vlan_msg *bvm;
struct net_device *dev;
+ u32 dump_flags = 0;
- err = nlmsg_parse(cb->nlh, sizeof(*bvm), NULL, 0, NULL, cb->extack);
+ err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX,
+ br_vlan_db_dump_pol, cb->extack);
if (err < 0)
return err;
bvm = nlmsg_data(cb->nlh);
+ if (dtb[BRIDGE_VLANDB_DUMP_FLAGS])
+ dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]);
rcu_read_lock();
if (bvm->ifindex) {
@@ -1808,7 +1857,7 @@ static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb)
err = -ENODEV;
goto out_err;
}
- err = br_vlan_dump_dev(dev, skb, cb);
+ err = br_vlan_dump_dev(dev, skb, cb, dump_flags);
if (err && err != -EMSGSIZE)
goto out_err;
} else {
@@ -1816,7 +1865,7 @@ static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (idx < s_idx)
goto skip;
- err = br_vlan_dump_dev(dev, skb, cb);
+ err = br_vlan_dump_dev(dev, skb, cb, dump_flags);
if (err == -EMSGSIZE)
break;
skip:
@@ -1839,6 +1888,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] =
.len = sizeof(struct bridge_vlan_info) },
[BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 },
[BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED },
};
static int br_vlan_rtm_process_one(struct net_device *dev,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index cd2eb194eb98..b4add9ea8964 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -4,25 +4,58 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <net/ip_tunnels.h>
#include "br_private.h"
+#include "br_private_tunnel.h"
-/* check if the options between two vlans are equal */
-bool br_vlan_opts_eq(const struct net_bridge_vlan *v1,
- const struct net_bridge_vlan *v2)
+static bool __vlan_tun_put(struct sk_buff *skb, const struct net_bridge_vlan *v)
{
- return v1->state == v2->state;
+ __be32 tid = tunnel_id_to_key32(v->tinfo.tunnel_id);
+ struct nlattr *nest;
+
+ if (!v->tinfo.tunnel_dst)
+ return true;
+
+ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_TUNNEL_INFO);
+ if (!nest)
+ return false;
+ if (nla_put_u32(skb, BRIDGE_VLANDB_TINFO_ID, be32_to_cpu(tid))) {
+ nla_nest_cancel(skb, nest);
+ return false;
+ }
+ nla_nest_end(skb, nest);
+
+ return true;
+}
+
+/* A vlan may join the range when neither it nor the range end has a
+ * tunnel destination, or when vlan_tunid_inrange() accepts the pair.
+ */
+static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr,
+				       const struct net_bridge_vlan *range_end)
+{
+	if (!v_curr->tinfo.tunnel_dst && !range_end->tinfo.tunnel_dst)
+		return true;
+
+	return vlan_tunid_inrange(v_curr, range_end);
+}
+
+/* check if the options' state of v_curr allow it to enter the range */
+bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return v_curr->state == range_end->state &&
+ __vlan_tun_can_enter_range(v_curr, range_end);
}
bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v)
{
return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE,
- br_vlan_get_state(v));
+ br_vlan_get_state(v)) &&
+ __vlan_tun_put(skb, v);
}
size_t br_vlan_opts_nl_size(void)
{
- return nla_total_size(sizeof(u8)); /* BRIDGE_VLANDB_ENTRY_STATE */
+ return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */
+ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */
+ + nla_total_size(sizeof(u32)); /* BRIDGE_VLANDB_TINFO_ID */
}
static int br_vlan_modify_state(struct net_bridge_vlan_group *vg,
@@ -62,6 +95,68 @@ static int br_vlan_modify_state(struct net_bridge_vlan_group *vg,
return 0;
}
+static const struct nla_policy br_vlandb_tinfo_pol[BRIDGE_VLANDB_TINFO_MAX + 1] = {
+ [BRIDGE_VLANDB_TINFO_ID] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_TINFO_CMD] = { .type = NLA_U32 },
+};
+
+/* Apply the BRIDGE_VLANDB_ENTRY_TUNNEL_INFO option from @tb to vlan @v
+ * of port @p.
+ *
+ * The nested attribute must carry BRIDGE_VLANDB_TINFO_CMD (RTM_SETLINK
+ * or RTM_DELLINK); RTM_SETLINK additionally needs BRIDGE_VLANDB_TINFO_ID.
+ * Returns -EINVAL for a missing port, a port without BR_VLAN_TUNNEL or
+ * an unsupported command, -ENOENT for a missing attribute, a parse
+ * error from nla_parse_nested(), or the result of br_vlan_tunnel_info().
+ */
+static int br_vlan_modify_tunnel(const struct net_bridge_port *p,
+				 struct net_bridge_vlan *v,
+				 struct nlattr **tb,
+				 bool *changed,
+				 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tun_tb[BRIDGE_VLANDB_TINFO_MAX + 1];
+	u32 tun_id = 0;
+	int cmd, err;
+
+	if (!p) {
+		NL_SET_ERR_MSG_MOD(extack, "Can't modify tunnel mapping of non-port vlans");
+		return -EINVAL;
+	}
+	if (!(p->flags & BR_VLAN_TUNNEL)) {
+		NL_SET_ERR_MSG_MOD(extack, "Port doesn't have tunnel flag set");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tun_tb, BRIDGE_VLANDB_TINFO_MAX,
+			       tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO],
+			       br_vlandb_tinfo_pol, extack);
+	if (err)
+		return err;
+
+	if (!tun_tb[BRIDGE_VLANDB_TINFO_CMD]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing tunnel command attribute");
+		return -ENOENT;
+	}
+
+	cmd = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_CMD]);
+	if (cmd == RTM_SETLINK) {
+		struct bridge_vlan_info *vinfo;
+
+		if (!tun_tb[BRIDGE_VLANDB_TINFO_ID]) {
+			NL_SET_ERR_MSG_MOD(extack, "Missing tunnel id attribute");
+			return -ENOENT;
+		}
+		/* vlan info attr is guaranteed by br_vlan_rtm_process_one */
+		vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]);
+		/* The attribute holds the tunnel id of the first vlan in
+		 * the range (BRIDGE_VLANDB_ENTRY_INFO); ids grow together
+		 * with the vid, so offset it by v's distance from the start.
+		 */
+		tun_id = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_ID]) +
+			 (v->vid - vinfo->vid);
+	} else if (cmd != RTM_DELLINK) {
+		NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel command");
+		return -EINVAL;
+	}
+
+	return br_vlan_tunnel_info(p, cmd, v->vid, tun_id, changed);
+}
+
static int br_vlan_process_one_opts(const struct net_bridge *br,
const struct net_bridge_port *p,
struct net_bridge_vlan_group *vg,
@@ -80,6 +175,11 @@ static int br_vlan_process_one_opts(const struct net_bridge *br,
if (err)
return err;
}
+ if (tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]) {
+ err = br_vlan_modify_tunnel(p, v, tb, changed, extack);
+ if (err)
+ return err;
+ }
return 0;
}
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index d13d2080f527..169e005fbda2 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -89,7 +89,8 @@ out:
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
-int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
+int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid,
+ u32 tun_id)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *vlan;
@@ -107,7 +108,7 @@ int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
-int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid)
+int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *v;
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index e1256e03a9a8..78db58c7aec2 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1561,7 +1561,7 @@ struct compat_ebt_entry_mwt {
compat_uptr_t ptr;
} u;
compat_uint_t match_size;
- compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace))));
+ compat_uint_t data[] __aligned(__alignof__(struct compat_ebt_replace));
};
/* account for possible padding between match_size and ->data */
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 03c7cdd8e4cb..195d2d67be8a 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -112,7 +112,8 @@ static struct caif_device_entry *caif_get(struct net_device *dev)
caif_device_list(dev_net(dev));
struct caif_device_entry *caifd;
- list_for_each_entry_rcu(caifd, &caifdevs->list, list) {
+ list_for_each_entry_rcu(caifd, &caifdevs->list, list,
+ lockdep_rtnl_is_held()) {
if (caifd->netdev == dev)
return caifd;
}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5b4bd8261002..f8ca5edc5f2c 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3248,12 +3248,16 @@ static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg)
static void ceph_msg_data_destroy(struct ceph_msg_data *data)
{
- if (data->type == CEPH_MSG_DATA_PAGELIST)
+ if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) {
+ int num_pages = calc_pages_for(data->alignment, data->length);
+ ceph_release_page_vector(data->pages, num_pages);
+ } else if (data->type == CEPH_MSG_DATA_PAGELIST) {
ceph_pagelist_release(data->pagelist);
+ }
}
void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
- size_t length, size_t alignment)
+ size_t length, size_t alignment, bool own_pages)
{
struct ceph_msg_data *data;
@@ -3265,6 +3269,7 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
data->pages = pages;
data->length = length;
data->alignment = alignment & ~PAGE_MASK;
+ data->own_pages = own_pages;
msg->data_length += length;
}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b68b376d8c2f..af868d3923b9 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -962,7 +962,7 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
BUG_ON(length > (u64) SIZE_MAX);
if (length)
ceph_msg_data_add_pages(msg, osd_data->pages,
- length, osd_data->alignment);
+ length, osd_data->alignment, false);
} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
BUG_ON(!length);
ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
@@ -4436,9 +4436,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
CEPH_MSG_DATA_PAGES);
*lreq->preply_pages = data->pages;
*lreq->preply_len = data->length;
- } else {
- ceph_release_page_vector(data->pages,
- calc_pages_for(0, data->length));
+ data->own_pages = false;
}
}
lreq->notify_finish_error = return_code;
@@ -5506,9 +5504,6 @@ out_unlock_osdc:
return m;
}
-/*
- * TODO: switch to a msg-owned pagelist
- */
static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
{
struct ceph_msg *m;
@@ -5522,7 +5517,6 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
if (data_len) {
struct page **pages;
- struct ceph_osd_data osd_data;
pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
GFP_NOIO);
@@ -5531,9 +5525,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
return NULL;
}
- ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
- false);
- ceph_osdc_msg_data_add(m, &osd_data);
+ ceph_msg_data_add_pages(m, pages, data_len, 0, true);
}
return m;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4e0de14f80bb..2a6e63a8edbe 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -710,6 +710,15 @@ int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+/* Return the flags of the pool with the given @id in @map, or 0 when
+ * __lookup_pg_pool() finds no such pool.
+ */
+u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
+{
+	struct ceph_pg_pool_info *pool = __lookup_pg_pool(&map->pg_pools, id);
+
+	if (!pool)
+		return 0;
+
+	return pool->flags;
+}
+EXPORT_SYMBOL(ceph_pg_pool_flags);
+
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
rb_erase(&pi->node, root);
diff --git a/net/compat.c b/net/compat.c
index 47d99c784947..4bed96e84d9a 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -33,10 +33,10 @@
#include <linux/uaccess.h>
#include <net/compat.h>
-int get_compat_msghdr(struct msghdr *kmsg,
- struct compat_msghdr __user *umsg,
- struct sockaddr __user **save_addr,
- struct iovec **iov)
+int __get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ compat_uptr_t *ptr, compat_size_t *len)
{
struct compat_msghdr msg;
ssize_t err;
@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
+ *ptr = msg.msg_iov;
+ *len = msg.msg_iovlen;
+ return 0;
+}
+
+int get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec **iov)
+{
+ compat_uptr_t ptr;
+ compat_size_t len;
+ ssize_t err;
+
+ err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
+ if (err)
+ return err;
- err = compat_import_iovec(save_addr ? READ : WRITE,
- compat_ptr(msg.msg_iov), msg.msg_iovlen,
- UIO_FASTIOV, iov, &kmsg->msg_iter);
+ err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr),
+ len, UIO_FASTIOV, iov, &kmsg->msg_iter);
return err < 0 ? err : 0;
}
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 3ab23f698221..756b63b6f7b3 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -8,6 +8,7 @@
#include <linux/bpf.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
static atomic_t cache_idx;
@@ -60,7 +61,7 @@ struct bpf_sk_storage_data {
* the number of cachelines access during the cache hit case.
*/
struct bpf_sk_storage_map __rcu *smap;
- u8 data[0] __aligned(8);
+ u8 data[] __aligned(8);
};
/* Linked to bpf_sk_storage and bpf_sk_storage_map */
@@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
kfree(map);
}
+/* U16_MAX is much more than enough for sk local storage
+ * considering a tcp_sock is ~2k.
+ */
+#define MAX_VALUE_SIZE \
+ min_t(u32, \
+ (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
+ (U16_MAX - sizeof(struct bpf_sk_storage_elem)))
+
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
@@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (attr->value_size >= KMALLOC_MAX_SIZE -
- MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
- /* U16_MAX is much more than enough for sk local storage
- * considering a tcp_sock is ~2k.
- */
- attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
+ if (attr->value_size > MAX_VALUE_SIZE)
return -E2BIG;
return 0;
@@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_SOCKET,
};
+
+struct bpf_sk_storage_diag {
+ u32 nr_maps;
+ struct bpf_map *maps[];
+};
+
+/* The reply will be like:
+ * INET_DIAG_BPF_SK_STORAGES (nla_nest)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * ....
+ */
+/* Netlink space needed for one storage entry holding @value_size bytes */
+static int nla_value_size(u32 value_size)
+{
+	/* SK_DIAG_BPF_STORAGE (nla_nest) */
+	int nest_len = nla_total_size(0);
+	/* SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32) */
+	int map_id_len = nla_total_size(sizeof(u32));
+	/* SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit) */
+	int value_len = nla_total_size_64bit(value_size);
+
+	return nest_len + map_id_len + value_len;
+}
+
+/* Drop the reference held on each map recorded in @diag, then free
+ * @diag itself.  A NULL @diag is accepted and ignored.
+ */
+void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
+{
+	u32 idx = 0;
+
+	if (!diag)
+		return;
+
+	while (idx < diag->nr_maps) {
+		bpf_map_put(diag->maps[idx]);
+		idx++;
+	}
+
+	kfree(diag);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
+
+/* Return true when @map is already among the maps recorded in @diag */
+static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
+			   const struct bpf_map *map)
+{
+	u32 idx = 0;
+
+	while (idx < diag->nr_maps) {
+		if (diag->maps[idx] == map)
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+
+/* Build a diag request object from the nested attribute @nla_stgs.
+ *
+ * Each SK_DIAG_BPF_STORAGE_REQ_MAP_FD attribute is resolved with
+ * bpf_map_get() and the resulting map reference is stored in the returned
+ * object; attributes of any other type are skipped.
+ *
+ * Returns the new object (to be released with bpf_sk_storage_diag_free())
+ * or an ERR_PTR():
+ *   -EPERM   caller lacks CAP_SYS_ADMIN
+ *   -EINVAL  an fd attribute whose payload is not exactly a u32, or a map
+ *            that is not BPF_MAP_TYPE_SK_STORAGE
+ *   -ENOMEM  allocation failure
+ *   -EEXIST  the same map was given more than once
+ *   or the error reported by bpf_map_get().
+ */
+struct bpf_sk_storage_diag *
+bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
+{
+	struct bpf_sk_storage_diag *diag;
+	struct nlattr *nla;
+	u32 nr_maps = 0;
+	int rem, err;
+
+	/* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
+	 * the map_alloc_check() side also does.
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	nla_for_each_nested(nla, nla_stgs, rem) {
+		if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
+			continue;
+
+		/* nla_get_u32() below reads four bytes of payload and
+		 * nothing in this function has validated the attribute
+		 * length, so reject anything that is not exactly a u32
+		 * before any reference is taken.
+		 */
+		if (nla_len(nla) != sizeof(u32))
+			return ERR_PTR(-EINVAL);
+
+		nr_maps++;
+	}
+
+	diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps,
+		       GFP_KERNEL);
+	if (!diag)
+		return ERR_PTR(-ENOMEM);
+
+	nla_for_each_nested(nla, nla_stgs, rem) {
+		struct bpf_map *map;
+		int map_fd;
+
+		if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
+			continue;
+
+		map_fd = nla_get_u32(nla);
+		map = bpf_map_get(map_fd);
+		if (IS_ERR(map)) {
+			err = PTR_ERR(map);
+			goto err_free;
+		}
+		if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
+			bpf_map_put(map);
+			err = -EINVAL;
+			goto err_free;
+		}
+		if (diag_check_dup(diag, map)) {
+			bpf_map_put(map);
+			err = -EEXIST;
+			goto err_free;
+		}
+		/* diag->nr_maps only counts maps whose reference is held,
+		 * so the error path releases exactly those.
+		 */
+		diag->maps[diag->nr_maps++] = map;
+	}
+
+	return diag;
+
+err_free:
+	bpf_sk_storage_diag_free(diag);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
+
+/* Emit one SK_DIAG_BPF_STORAGE nest for @sdata into @skb: the id of the
+ * owning map followed by a copy of the stored value.
+ * Returns 0 on success, or -EMSGSIZE when the nest, the id or the value
+ * attribute cannot be added; a nest that was started is cancelled.
+ */
+static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
+{
+	struct bpf_sk_storage_map *smap;
+	struct nlattr *entry, *value;
+
+	/* It cannot exceed max nlattr's payload */
+	BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);
+
+	entry = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
+	if (!entry)
+		return -EMSGSIZE;
+
+	smap = rcu_dereference(sdata->smap);
+	if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
+		goto cancel;
+
+	value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
+				  smap->map.value_size,
+				  SK_DIAG_BPF_STORAGE_PAD);
+	if (!value)
+		goto cancel;
+
+	/* Use the locked copy only for maps whose value has a spin lock */
+	if (!map_value_has_spin_lock(&smap->map))
+		copy_map_value(&smap->map, nla_data(value), sdata->data);
+	else
+		copy_map_value_locked(&smap->map, nla_data(value),
+				      sdata->data, true);
+
+	nla_nest_end(skb, entry);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, entry);
+	return -EMSGSIZE;
+}
+
+/* Dump every storage element attached to @sk into a @stg_array_type nest
+ * in @skb.
+ *
+ * The space needed for all elements is accumulated even after @skb has
+ * run out of room and is stored in *@res_diag_size.  When @sk has no
+ * storage, 0 is returned and *@res_diag_size is left untouched.
+ * Returns 0 on success or -EMSGSIZE when something did not fit.
+ */
+static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
+				       int stg_array_type,
+				       unsigned int *res_diag_size)
+{
+	/* Start from the size of the stg_array_type nest header
+	 * (e.g. INET_DIAG_BPF_SK_STORAGES).
+	 */
+	unsigned int needed = nla_total_size(0);
+	struct bpf_sk_storage *storage;
+	struct bpf_sk_storage_elem *elem;
+	struct bpf_sk_storage_map *smap;
+	struct nlattr *array_nest;
+	unsigned int len_before;
+	int err = 0;
+
+	rcu_read_lock();
+
+	storage = rcu_dereference(sk->sk_bpf_storage);
+	if (!storage || hlist_empty(&storage->list)) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	array_nest = nla_nest_start(skb, stg_array_type);
+	if (!array_nest)
+		/* Continue to learn the needed size */
+		err = -EMSGSIZE;
+
+	len_before = skb->len;
+	hlist_for_each_entry_rcu(elem, &storage->list, snode) {
+		smap = rcu_dereference(SDATA(elem)->smap);
+		needed += nla_value_size(smap->map.value_size);
+
+		if (!array_nest)
+			continue;
+
+		if (diag_get(SDATA(elem), skb))
+			/* Continue to learn the needed size */
+			err = -EMSGSIZE;
+	}
+
+	rcu_read_unlock();
+
+	if (array_nest) {
+		/* Close the nest only if at least one entry was added */
+		if (len_before != skb->len)
+			nla_nest_end(skb, array_nest);
+		else
+			nla_nest_cancel(skb, array_nest);
+	}
+
+	if (needed == nla_total_size(0)) {
+		*res_diag_size = 0;
+		return 0;
+	}
+
+	*res_diag_size = needed;
+	return err;
+}
+
+/* Dump the storages of @sk that belong to the maps listed in @diag into
+ * a @stg_array_type nest in @skb.  With no maps in @diag, every storage
+ * of @sk is dumped via bpf_sk_storage_diag_put_all().
+ *
+ * *@res_diag_size is set to the space needed for all matching storages
+ * (0 when there is none), which keeps being accumulated after @skb has
+ * run out of room.
+ * Returns 0 on success or -EMSGSIZE when something did not fit.
+ */
+int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
+			    struct sock *sk, struct sk_buff *skb,
+			    int stg_array_type,
+			    unsigned int *res_diag_size)
+{
+	/* Start from the size of the stg_array_type nest header
+	 * (e.g. INET_DIAG_BPF_SK_STORAGES).
+	 */
+	unsigned int needed = nla_total_size(0);
+	struct bpf_sk_storage *storage;
+	struct nlattr *array_nest;
+	unsigned int len_before;
+	int err = 0;
+	u32 idx;
+
+	*res_diag_size = 0;
+
+	/* No map has been specified. Dump all. */
+	if (!diag->nr_maps)
+		return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
+						   res_diag_size);
+
+	rcu_read_lock();
+	storage = rcu_dereference(sk->sk_bpf_storage);
+	if (!storage || hlist_empty(&storage->list)) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	array_nest = nla_nest_start(skb, stg_array_type);
+	if (!array_nest)
+		/* Continue to learn the needed size */
+		err = -EMSGSIZE;
+
+	len_before = skb->len;
+	for (idx = 0; idx < diag->nr_maps; idx++) {
+		struct bpf_sk_storage_data *sdata;
+
+		sdata = __sk_storage_lookup(storage,
+				(struct bpf_sk_storage_map *)diag->maps[idx],
+				false);
+		if (!sdata)
+			continue;
+
+		needed += nla_value_size(diag->maps[idx]->value_size);
+
+		if (!array_nest)
+			continue;
+
+		if (diag_get(sdata, skb))
+			/* Continue to learn the needed size */
+			err = -EMSGSIZE;
+	}
+	rcu_read_unlock();
+
+	if (array_nest) {
+		/* Close the nest only if at least one entry was added */
+		if (len_before != skb->len)
+			nla_nest_end(skb, array_nest);
+		else
+			nla_nest_cancel(skb, array_nest);
+	}
+
+	if (needed == nla_total_size(0)) {
+		*res_diag_size = 0;
+		return 0;
+	}
+
+	*res_diag_size = needed;
+	return err;
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a78e7f864c1e..639745d4f3b9 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -166,8 +167,6 @@ done:
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
struct sk_buff_head *queue,
unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
int *off, int *err,
struct sk_buff **last)
{
@@ -198,8 +197,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
refcount_inc(&skb->users);
} else {
__skb_unlink(skb, queue);
- if (destructor)
- destructor(sk, skb);
}
*off = _off;
return skb;
@@ -212,7 +209,6 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
* @sk: socket
* @queue: socket queue from which to receive
* @flags: MSG\_ flags
- * @destructor: invoked under the receive lock on successful dequeue
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
@@ -245,10 +241,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
*/
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
struct sk_buff_head *queue,
- unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *off, int *err,
+ unsigned int flags, int *off, int *err,
struct sk_buff **last)
{
struct sk_buff *skb;
@@ -269,8 +262,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
* However, this function was correct in any case. 8)
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
- skb = __skb_try_recv_from_queue(sk, queue, flags, destructor,
- off, &error, last);
+ skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
+ last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
goto no_packet;
@@ -293,10 +286,7 @@ EXPORT_SYMBOL(__skb_try_recv_datagram);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
struct sk_buff_head *sk_queue,
- unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *off, int *err)
+ unsigned int flags, int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
@@ -304,8 +294,8 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, sk_queue, flags, destructor,
- off, err, &last);
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
+ &last);
if (skb)
return skb;
@@ -326,7 +316,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
return __skb_recv_datagram(sk, &sk->sk_receive_queue,
flags | (noblock ? MSG_DONTWAIT : 0),
- NULL, &off, err);
+ &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
@@ -414,6 +404,11 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
}
EXPORT_SYMBOL(skb_kill_datagram);
+INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
+ size_t bytes,
+ void *data __always_unused,
+ struct iov_iter *i));
+
static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len, bool fault_short,
size_t (*cb)(const void *, size_t, void *,
@@ -427,7 +422,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- n = cb(skb->data + offset, copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ skb->data + offset, copy, data, to);
offset += n;
if (n != copy)
goto short_copy;
@@ -449,8 +445,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
- n = cb(vaddr + skb_frag_off(frag) + offset - start,
- copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + skb_frag_off(frag) + offset - start,
+ copy, data, to);
kunmap(page);
offset += n;
if (n != copy)
diff --git a/net/core/dev.c b/net/core/dev.c
index a69e8bd7ed74..9c9e763bfe0e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,7 +146,6 @@
#include "net-sysfs.h"
#define MAX_GRO_SKBS 8
-#define MAX_NEST_DEV 8
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
@@ -331,6 +330,12 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
name_node = netdev_name_node_lookup(net, name);
if (!name_node)
return -ENOENT;
+ /* lookup might have found our primary name or a name belonging
+ * to another device.
+ */
+ if (name_node == dev->name_node || name_node->dev != dev)
+ return -EINVAL;
+
__netdev_name_node_alt_destroy(name_node);
return 0;
@@ -3071,6 +3076,8 @@ static u16 skb_tx_hash(const struct net_device *dev,
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
+ if (hash >= qoffset)
+ hash -= qoffset;
while (unlikely(hash >= qcount))
hash -= qcount;
return hash + qoffset;
@@ -3259,7 +3266,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*
- * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
@@ -3288,7 +3295,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
features &= ~NETIF_F_GSO_PARTIAL;
}
- BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
@@ -3657,26 +3664,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
qdisc_calculate_pkt_len(skb, q);
if (q->flags & TCQ_F_NOLOCK) {
- if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) &&
- qdisc_run_begin(q)) {
- if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
- &q->state))) {
- __qdisc_drop(skb, &to_free);
- rc = NET_XMIT_DROP;
- goto end_run;
- }
- qdisc_bstats_cpu_update(q, skb);
-
- rc = NET_XMIT_SUCCESS;
- if (sch_direct_xmit(skb, q, dev, txq, NULL, true))
- __qdisc_run(q);
-
-end_run:
- qdisc_run_end(q);
- } else {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- qdisc_run(q);
- }
+ rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+ qdisc_run(q);
if (unlikely(to_free))
kfree_skb_list(to_free);
@@ -4527,14 +4516,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
- if (skb_cloned(skb) || skb_is_tc_redirected(skb))
+ if (skb_is_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
* of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
* native XDP provides, thus we need to do it here as well.
*/
- if (skb_is_nonlinear(skb) ||
+ if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
skb_headroom(skb) < XDP_PACKET_HEADROOM) {
int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
int troom = skb->tail + skb->data_len - skb->end;
@@ -4649,7 +4638,6 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
kfree_skb(skb);
}
}
-EXPORT_SYMBOL_GPL(generic_xdp_tx);
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
@@ -4860,7 +4848,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
skb->tc_at_ingress = 1;
mini_qdisc_bstats_cpu_update(miniq, skb);
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
+ &cl_res, false)) {
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
skb->tc_index = TC_H_MIN(cl_res.classid);
@@ -5074,7 +5063,7 @@ skip_taps:
goto out;
}
#endif
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
@@ -5206,7 +5195,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
*
* More direct receive version of netif_receive_skb(). It should
* only be used by callers that have a need to skip RPS and Generic XDP.
- * Caller must also take care of handling if (page_is_)pfmemalloc.
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
@@ -7201,8 +7190,8 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev,
return 0;
}
-static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
@@ -7214,6 +7203,7 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
return lower->dev;
}
+EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
static u8 __netdev_upper_depth(struct net_device *dev)
{
@@ -8665,15 +8655,17 @@ static void dev_xdp_uninstall(struct net_device *dev)
* @dev: device
* @extack: netlink extended ack
* @fd: new program fd or negative value to clear
+ * @expected_fd: old program fd that userspace expects to replace or clear
* @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
- int fd, u32 flags)
+ int fd, int expected_fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
enum bpf_netdev_command query;
+ u32 prog_id, expected_id = 0;
struct bpf_prog *prog = NULL;
bpf_op_t bpf_op, bpf_chk;
bool offload;
@@ -8694,15 +8686,29 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
if (bpf_op == bpf_chk)
bpf_chk = generic_xdp_install;
- if (fd >= 0) {
- u32 prog_id;
+ prog_id = __dev_xdp_query(dev, bpf_op, query);
+ if (flags & XDP_FLAGS_REPLACE) {
+ if (expected_fd >= 0) {
+ prog = bpf_prog_get_type_dev(expected_fd,
+ BPF_PROG_TYPE_XDP,
+ bpf_op == ops->ndo_bpf);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ expected_id = prog->aux->id;
+ bpf_prog_put(prog);
+ }
+ if (prog_id != expected_id) {
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
+ return -EEXIST;
+ }
+ }
+ if (fd >= 0) {
if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
return -EEXIST;
}
- prog_id = __dev_xdp_query(dev, bpf_op, query);
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
NL_SET_ERR_MSG(extack, "XDP program already attached");
return -EBUSY;
@@ -8725,7 +8731,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
return 0;
}
} else {
- if (!__dev_xdp_query(dev, bpf_op, query))
+ if (!prog_id)
return 0;
}
@@ -9293,6 +9299,10 @@ int register_netdevice(struct net_device *dev)
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
+ ret = ethtool_check_ops(dev->ethtool_ops);
+ if (ret)
+ return ret;
+
spin_lock_init(&dev->addr_list_lock);
lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key);
@@ -10016,6 +10026,7 @@ EXPORT_SYMBOL(unregister_netdev);
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
+ struct net *net_old = dev_net(dev);
int err, new_nsid, new_ifindex;
ASSERT_RTNL();
@@ -10031,7 +10042,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
/* Get out if there is nothing todo */
err = 0;
- if (net_eq(dev_net(dev), net))
+ if (net_eq(net_old, net))
goto out;
/* Pick the destination device name, and ensure
@@ -10107,6 +10118,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
err = device_rename(&dev->dev, dev->name);
WARN_ON(err);
+ /* Adapt owner in case owning user namespace of target network
+ * namespace is different from the original one.
+ */
+ err = netdev_change_owner(dev, net_old, net);
+ WARN_ON(err);
+
/* Add the device back in the hashes */
list_netdevice(dev);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index dbaebbe573f0..547b587c1950 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -190,6 +190,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_TX_ONESTEP_P2P:
tx_type_valid = 1;
break;
+ case __HWTSTAMP_TX_CNT:
+ /* not a real value */
+ break;
}
switch (rx_filter) {
@@ -211,6 +214,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_FILTER_NTP_ALL:
rx_filter_valid = 1;
break;
+ case __HWTSTAMP_FILTER_CNT:
+ /* not a real value */
+ break;
}
if (!tx_type_valid || !rx_filter_valid)
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 549ee56b7a21..80f97722f31f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -344,7 +344,7 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
struct devlink_region {
struct devlink *devlink;
struct list_head list;
- const char *name;
+ const struct devlink_region_ops *ops;
struct list_head snapshot_list;
u32 max_snapshots;
u32 cur_snapshots;
@@ -354,7 +354,6 @@ struct devlink_region {
struct devlink_snapshot {
struct list_head list;
struct devlink_region *region;
- devlink_snapshot_data_dest_t *data_destructor;
u8 *data;
u32 id;
};
@@ -365,7 +364,7 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
struct devlink_region *region;
list_for_each_entry(region, &devlink->region_list, list)
- if (!strcmp(region->name, region_name))
+ if (!strcmp(region->ops->name, region_name))
return region;
return NULL;
@@ -545,6 +544,7 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
attrs->phys.port_number))
return -EMSGSIZE;
@@ -2103,11 +2103,11 @@ err_action_values_put:
static struct devlink_dpipe_table *
devlink_dpipe_table_find(struct list_head *dpipe_tables,
- const char *table_name)
+ const char *table_name, struct devlink *devlink)
{
struct devlink_dpipe_table *table;
-
- list_for_each_entry_rcu(table, dpipe_tables, list) {
+ list_for_each_entry_rcu(table, dpipe_tables, list,
+ lockdep_is_held(&devlink->lock)) {
if (!strcmp(table->name, table_name))
return table;
}
@@ -2226,7 +2226,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
if (!table)
return -EINVAL;
@@ -2382,7 +2382,7 @@ static int devlink_dpipe_table_counters_set(struct devlink *devlink,
struct devlink_dpipe_table *table;
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
if (!table)
return -EINVAL;
@@ -2709,7 +2709,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb,
struct net *net;
if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
- NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+ NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified");
return ERR_PTR(-EINVAL);
}
@@ -2727,7 +2727,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb,
net = ERR_PTR(-EINVAL);
}
if (IS_ERR(net)) {
- NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+ NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace");
return ERR_PTR(-EINVAL);
}
if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
@@ -3352,34 +3352,41 @@ devlink_param_value_get_from_info(const struct devlink_param *param,
struct genl_info *info,
union devlink_param_value *value)
{
+ struct nlattr *param_data;
int len;
- if (param->type != DEVLINK_PARAM_TYPE_BOOL &&
- !info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA])
+ param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA];
+
+ if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data)
return -EINVAL;
switch (param->type) {
case DEVLINK_PARAM_TYPE_U8:
- value->vu8 = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ if (nla_len(param_data) != sizeof(u8))
+ return -EINVAL;
+ value->vu8 = nla_get_u8(param_data);
break;
case DEVLINK_PARAM_TYPE_U16:
- value->vu16 = nla_get_u16(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ if (nla_len(param_data) != sizeof(u16))
+ return -EINVAL;
+ value->vu16 = nla_get_u16(param_data);
break;
case DEVLINK_PARAM_TYPE_U32:
- value->vu32 = nla_get_u32(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]);
+ if (nla_len(param_data) != sizeof(u32))
+ return -EINVAL;
+ value->vu32 = nla_get_u32(param_data);
break;
case DEVLINK_PARAM_TYPE_STRING:
- len = strnlen(nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]),
- nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
- if (len == nla_len(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]) ||
+ len = strnlen(nla_data(param_data), nla_len(param_data));
+ if (len == nla_len(param_data) ||
len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
return -EINVAL;
- strcpy(value->vstr,
- nla_data(info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]));
+ strcpy(value->vstr, nla_data(param_data));
break;
case DEVLINK_PARAM_TYPE_BOOL:
- value->vbool = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA] ?
- true : false;
+ if (param_data && nla_len(param_data))
+ return -EINVAL;
+ value->vbool = nla_get_flag(param_data);
break;
}
return 0;
@@ -3687,7 +3694,7 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
- err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name);
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
if (err)
goto nla_put_failure;
@@ -3733,7 +3740,7 @@ static void devlink_nl_region_notify(struct devlink_region *region,
goto out_cancel_msg;
err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
- region->name);
+ region->ops->name);
if (err)
goto out_cancel_msg;
@@ -3761,13 +3768,201 @@ out_free_msg:
nlmsg_free(msg);
}
+/**
+ * __devlink_snapshot_id_increment - Increment number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a new snapshot begins using an id. Load the count for the
+ * given id from the snapshot xarray, increment it, and store it back.
+ *
+ * Called when a new snapshot is created with the given id.
+ *
+ * The id *must* already be present in the snapshot xarray, i.e. it was
+ * previously allocated by __devlink_region_snapshot_id_get() or inserted
+ * by __devlink_snapshot_id_insert(). Otherwise this warns and fails.
+ *
+ * Must be called while holding the devlink instance lock.
+ *
+ * Return: 0 on success, -EINVAL if the id has no entry or its entry is not
+ * a value entry, or the error reported by xa_store().
+ */
+static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+
+ lockdep_assert_held(&devlink->lock);
+
+ p = xa_load(&devlink->snapshot_ids, id);
+ /* No entry: the id was never allocated or inserted. */
+ if (WARN_ON(!p))
+ return -EINVAL;
+
+ /* Counts are stored as xarray value entries, never as pointers. */
+ if (WARN_ON(!xa_is_value(p)))
+ return -EINVAL;
+
+ count = xa_to_value(p);
+ count++;
+
+ return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_KERNEL));
+}
+
+/**
+ * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a snapshot is deleted and stops using an id. Load the count
+ * for the given id from the snapshot xarray, decrement it, and store it
+ * back.
+ *
+ * If the count is one or zero, erase this id from the xarray instead,
+ * freeing it up for future re-use by __devlink_region_snapshot_id_get().
+ * A count of zero is what __devlink_snapshot_id_insert() stores, so this
+ * also undoes an insert that was never followed by an increment.
+ *
+ * Called when a snapshot using the given id is deleted, and when the
+ * initial allocator of the id is finished using it.
+ *
+ * Must be called while holding the devlink instance lock. Warns and does
+ * nothing if the id has no entry or its entry is not a value entry.
+ */
+static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+
+ lockdep_assert_held(&devlink->lock);
+
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p))
+ return;
+
+ if (WARN_ON(!xa_is_value(p)))
+ return;
+
+ count = xa_to_value(p);
+
+ if (count > 1) {
+ count--;
+ /* The result of xa_store() is not checked here. */
+ xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_KERNEL);
+ } else {
+ /* If this was the last user, we can erase this id */
+ xa_erase(&devlink->snapshot_ids, id);
+ }
+}
+
+/**
+ * __devlink_snapshot_id_insert - Insert a specific snapshot ID
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Mark the given snapshot id as used by inserting a zero value into the
+ * snapshot xarray.
+ *
+ * This must be called while holding the devlink instance lock. Unlike
+ * __devlink_region_snapshot_id_get(), the initial reference count is zero,
+ * not one. It is expected that the id will immediately be used before
+ * releasing the devlink instance lock.
+ *
+ * Return: zero on success, -EEXIST if the id is already tracked in the
+ * snapshot xarray, or the error reported by xa_store().
+ */
+static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id)
+{
+	lockdep_assert_held(&devlink->lock);
+
+	/* The id is taken verbatim from a userspace request, and the caller
+	 * only checks the snapshot list of the one region it targets. The id
+	 * may therefore legitimately be tracked already for another user on
+	 * this devlink instance. That is an ordinary request failure, not a
+	 * kernel bug, so report it without WARN_ON().
+	 */
+	if (xa_load(&devlink->snapshot_ids, id))
+		return -EEXIST;
+
+	return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(0),
+			       GFP_KERNEL));
+}
+
+/**
+ * __devlink_region_snapshot_id_get - get snapshot ID
+ * @devlink: devlink instance
+ * @id: storage to return snapshot id
+ *
+ * Allocates a new snapshot id. Must be called while holding the devlink
+ * instance lock.
+ *
+ * Snapshot IDs are tracked using an xarray which stores the number of
+ * users of the snapshot id.
+ *
+ * Note that the caller of this function counts as a 'user', in order to
+ * avoid race conditions. The caller must release its hold on the
+ * snapshot by using devlink_region_snapshot_id_put.
+ *
+ * Return: the result of xa_alloc(): zero on success, or a negative error
+ * on failure.
+ */
+static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ /* Initial count of 1: the caller itself is the first user of the id. */
+ return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
+ xa_limit_32b, GFP_KERNEL);
+}
+
+/**
+ * __devlink_region_snapshot_create - create a new snapshot
+ * @region: devlink region of the snapshot
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ *
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id must already be tracked in the devlink snapshot
+ * xarray; its use count is incremented here.
+ *
+ * Must be called only while holding the devlink instance lock.
+ *
+ * On success the snapshot stores the @data pointer. On failure @data is
+ * left untouched and the caller remains responsible for releasing it.
+ *
+ * Return: 0 on success, -ENOSPC if the region already holds max_snapshots
+ * snapshots, -EEXIST if the region already has a snapshot with this id,
+ * -ENOMEM if the snapshot cannot be allocated, or the error from
+ * __devlink_snapshot_id_increment().
+ */
+static int
+__devlink_region_snapshot_create(struct devlink_region *region,
+ u8 *data, u32 snapshot_id)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot;
+ int err;
+
+ lockdep_assert_held(&devlink->lock);
+
+ /* check if region can hold one more snapshot */
+ if (region->cur_snapshots == region->max_snapshots)
+ return -ENOSPC;
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id))
+ return -EEXIST;
+
+ snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
+ if (!snapshot)
+ return -ENOMEM;
+
+ /* Account for this snapshot as one more user of the id. */
+ err = __devlink_snapshot_id_increment(devlink, snapshot_id);
+ if (err)
+ goto err_snapshot_id_increment;
+
+ snapshot->id = snapshot_id;
+ snapshot->region = region;
+ snapshot->data = data;
+
+ list_add_tail(&snapshot->list, &region->snapshot_list);
+
+ region->cur_snapshots++;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ return 0;
+
+err_snapshot_id_increment:
+ kfree(snapshot);
+ return err;
+}
+
static void devlink_region_snapshot_del(struct devlink_region *region,
struct devlink_snapshot *snapshot)
{
+ struct devlink *devlink = region->devlink;
+
+ lockdep_assert_held(&devlink->lock);
+
devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
region->cur_snapshots--;
list_del(&snapshot->list);
- (*snapshot->data_destructor)(snapshot->data);
+ region->ops->destructor(snapshot->data);
+ __devlink_snapshot_id_decrement(devlink, snapshot->id);
kfree(snapshot);
}
@@ -3870,6 +4065,71 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
return 0;
}
+/* Take an immediate snapshot of a region on behalf of a netlink request
+ * (presumably the doit handler for DEVLINK_CMD_REGION_NEW - confirm against
+ * the ops table, which is outside this hunk).
+ *
+ * The request must carry DEVLINK_ATTR_REGION_NAME and
+ * DEVLINK_ATTR_REGION_SNAPSHOT_ID. The id is inserted into the devlink
+ * snapshot xarray, the region's ->snapshot() op captures the data, and the
+ * snapshot is then attached to the region.
+ *
+ * The helpers called here assert that devlink->lock is held and this
+ * function does not take it itself, so it is expected to be held on entry.
+ *
+ * Returns 0 on success, -EINVAL for a missing attribute or unknown region,
+ * -EOPNOTSUPP if the region has no ->snapshot() op, -ENOSPC if the region is
+ * full, -EEXIST if the region already has a snapshot with that id, or the
+ * error from the id insertion, the ->snapshot() op or the snapshot creation.
+ */
+static int
+devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_region *region;
+ const char *region_name;
+ u32 snapshot_id;
+ u8 *data;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) {
+ NL_SET_ERR_MSG_MOD(info->extack, "No region name provided");
+ return -EINVAL;
+ }
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) {
+ NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided");
+ return -EINVAL;
+ }
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist");
+ return -EINVAL;
+ }
+
+ if (!region->ops->snapshot) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot");
+ return -EOPNOTSUPP;
+ }
+
+ if (region->cur_snapshots == region->max_snapshots) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots");
+ return -ENOSPC;
+ }
+
+ snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+
+ /* Only this region's own snapshot list is checked here. */
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use");
+ return -EEXIST;
+ }
+
+ /* Reserve the id with a use count of zero. The count is raised by
+  * __devlink_region_snapshot_create() below.
+  */
+ err = __devlink_snapshot_id_insert(devlink, snapshot_id);
+ if (err)
+ return err;
+
+ err = region->ops->snapshot(devlink, info->extack, &data);
+ if (err)
+ goto err_snapshot_capture;
+
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
+ if (err)
+ goto err_snapshot_create;
+
+ return 0;
+
+ /* Unwind in reverse order: free the captured data, then drop the id. */
+err_snapshot_create:
+ region->ops->destructor(data);
+err_snapshot_capture:
+ __devlink_snapshot_id_decrement(devlink, snapshot_id);
+ return err;
+}
+
static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
struct devlink *devlink,
u8 *chunk, u32 chunk_size,
@@ -4232,11 +4492,17 @@ struct devlink_fmsg_item {
int attrtype;
u8 nla_type;
u16 len;
- int value[0];
+ int value[];
};
struct devlink_fmsg {
struct list_head item_list;
+ bool putting_binary; /* Set while a binary pair nest is open, so
+ * that binary data is always enclosed in
+ * array brackets. It is toggled only by:
+ * devlink_fmsg_binary_pair_nest_start()
+ * devlink_fmsg_binary_pair_nest_end()
+ */
};
static struct devlink_fmsg *devlink_fmsg_alloc(void)
@@ -4280,17 +4546,26 @@ static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg,
int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);
static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
}
int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end);
@@ -4301,6 +4576,9 @@ static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
{
struct devlink_fmsg_item *item;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE)
return -EMSGSIZE;
@@ -4321,6 +4599,9 @@ int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START);
if (err)
return err;
@@ -4335,6 +4616,9 @@ EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start);
int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end);
@@ -4344,6 +4628,9 @@ int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
@@ -4360,6 +4647,9 @@ int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
{
int err;
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
err = devlink_fmsg_nest_end(fmsg);
if (err)
return err;
@@ -4372,6 +4662,30 @@ int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
}
EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end);
+/* Open a named array pair nest for binary data and set
+ * fmsg->putting_binary. The flag is set only when the array pair nest was
+ * opened successfully.
+ *
+ * Returns 0 on success, or the error from
+ * devlink_fmsg_arr_pair_nest_start(), which itself returns -EINVAL when a
+ * binary nest is already open.
+ */
+int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name)
+{
+ int err;
+
+ err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
+ if (err)
+ return err;
+
+ fmsg->putting_binary = true;
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start);
+
+/* Close a binary pair nest opened by devlink_fmsg_binary_pair_nest_start().
+ *
+ * Returns -EINVAL if no binary nest is open, otherwise the result of
+ * devlink_fmsg_arr_pair_nest_end().
+ */
+int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ if (!fmsg->putting_binary)
+ return -EINVAL;
+
+ /* Clear the flag first: devlink_fmsg_arr_pair_nest_end() returns
+  * -EINVAL while putting_binary is set.
+  */
+ fmsg->putting_binary = false;
+ return devlink_fmsg_arr_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end);
+
static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
const void *value, u16 value_len,
u8 value_nla_type)
@@ -4396,40 +4710,59 @@ static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put);
int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put);
int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put);
int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
{
+ if (fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1,
NLA_NUL_STRING);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
-static int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
- u16 value_len)
+int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+ u16 value_len)
{
+ if (!fmsg->putting_binary)
+ return -EINVAL;
+
return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
bool value)
@@ -4540,10 +4873,11 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
const void *value, u32 value_len)
{
u32 data_size;
+ int end_err;
u32 offset;
int err;
- err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
+ err = devlink_fmsg_binary_pair_nest_start(fmsg, name);
if (err)
return err;
@@ -4553,14 +4887,18 @@ int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
data_size = DEVLINK_FMSG_MAX_SIZE;
err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
if (err)
- return err;
+ break;
+ /* Exit from loop with a break (instead of
+ * return) to make sure putting_binary is turned off in
+ * devlink_fmsg_binary_pair_nest_end
+ */
}
- err = devlink_fmsg_arr_pair_nest_end(fmsg);
- if (err)
- return err;
+ end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
+ if (end_err)
+ err = end_err;
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
@@ -4751,6 +5089,7 @@ struct devlink_health_reporter {
struct mutex dump_lock; /* lock parallel read/write from dump buffers */
u64 graceful_period;
bool auto_recover;
+ bool auto_dump;
u8 health_state;
u64 dump_ts;
u64 dump_real_ts;
@@ -4786,14 +5125,12 @@ devlink_health_reporter_find_by_name(struct devlink *devlink,
* @devlink: devlink
* @ops: ops
* @graceful_period: to avoid recovery loops, in msecs
- * @auto_recover: auto recover when error occurs
* @priv: priv
*/
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, bool auto_recover,
- void *priv)
+ u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
@@ -4803,8 +5140,7 @@ devlink_health_reporter_create(struct devlink *devlink,
goto unlock;
}
- if (WARN_ON(auto_recover && !ops->recover) ||
- WARN_ON(graceful_period && !ops->recover)) {
+ if (WARN_ON(graceful_period && !ops->recover)) {
reporter = ERR_PTR(-EINVAL);
goto unlock;
}
@@ -4819,7 +5155,8 @@ devlink_health_reporter_create(struct devlink *devlink,
reporter->ops = ops;
reporter->devlink = devlink;
reporter->graceful_period = graceful_period;
- reporter->auto_recover = auto_recover;
+ reporter->auto_recover = !!ops->recover;
+ reporter->auto_dump = !!ops->dump;
mutex_init(&reporter->dump_lock);
refcount_set(&reporter->refcount, 1);
list_add_tail(&reporter->list, &devlink->reporter_list);
@@ -4900,6 +5237,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
reporter->dump_real_ts, DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
+ if (reporter->ops->dump &&
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ reporter->auto_dump))
+ goto reporter_nest_cancel;
nla_nest_end(msg, reporter_attr);
genlmsg_end(msg, hdr);
@@ -5046,10 +5387,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
- mutex_lock(&reporter->dump_lock);
- /* store current dump of current error, for later analysis */
- devlink_health_do_dump(reporter, priv_ctx, NULL);
- mutex_unlock(&reporter->dump_lock);
+ if (reporter->auto_dump) {
+ mutex_lock(&reporter->dump_lock);
+ /* store current dump of current error, for later analysis */
+ devlink_health_do_dump(reporter, priv_ctx, NULL);
+ mutex_unlock(&reporter->dump_lock);
+ }
if (reporter->auto_recover)
return devlink_health_reporter_recover(reporter,
@@ -5223,6 +5566,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
err = -EOPNOTSUPP;
goto out;
}
+ if (!reporter->ops->dump &&
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
reporter->graceful_period =
@@ -5232,6 +5580,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
reporter->auto_recover =
nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
+ reporter->auto_dump =
+ nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]);
+
devlink_health_reporter_put(reporter);
return 0;
out:
@@ -5368,18 +5720,35 @@ struct devlink_stats {
};
/**
+ * struct devlink_trap_policer_item - Packet trap policer attributes.
+ * @policer: Immutable packet trap policer attributes.
+ * @rate: Rate in packets / sec.
+ * @burst: Burst size in packets.
+ * @list: trap_policer_list member.
+ *
+ * Describes packet trap policer attributes. Created by devlink during trap
+ * policer registration.
+ *
+ * Items are looked up on the devlink trap_policer_list by @policer->id.
+ * @rate and @burst are the values reported to user space; the netlink set
+ * path stores new values here only after the driver's trap_policer_set()
+ * callback returned success.
+ */
+struct devlink_trap_policer_item {
+ const struct devlink_trap_policer *policer;
+ u64 rate;
+ u64 burst;
+ struct list_head list;
+};
+
+/**
* struct devlink_trap_group_item - Packet trap group attributes.
* @group: Immutable packet trap group attributes.
- * @refcount: Number of trap items using the group.
+ * @policer_item: Associated policer item. Can be NULL.
* @list: trap_group_list member.
* @stats: Trap group statistics.
*
* Describes packet trap group attributes. Created by devlink during trap
- * registration.
+ * group registration.
*/
struct devlink_trap_group_item {
const struct devlink_trap_group *group;
- refcount_t refcount;
+ struct devlink_trap_policer_item *policer_item;
struct list_head list;
struct devlink_stats __percpu *stats;
};
@@ -5405,6 +5774,19 @@ struct devlink_trap_item {
void *priv;
};
+/* Search devlink->trap_policer_list for the item whose policer ID equals @id.
+ * Returns the first matching item, or NULL when there is none.
+ */
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id)
+{
+	struct devlink_trap_policer_item *cur, *found = NULL;
+
+	list_for_each_entry(cur, &devlink->trap_policer_list, list) {
+		if (cur->policer->id != id)
+			continue;
+		found = cur;
+		break;
+	}
+
+	return found;
+}
+
static struct devlink_trap_item *
devlink_trap_item_lookup(struct devlink *devlink, const char *name)
{
@@ -5462,6 +5844,9 @@ static int devlink_trap_metadata_put(struct sk_buff *msg,
if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) &&
nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT))
goto nla_put_failure;
+ if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) &&
+ nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE))
+ goto nla_put_failure;
nla_nest_end(msg, attr);
@@ -5729,6 +6114,19 @@ devlink_trap_group_item_lookup(struct devlink *devlink, const char *name)
}
static struct devlink_trap_group_item *
+devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id)
+{
+	struct devlink_trap_group_item *cur, *found = NULL;
+
+	/* Walk devlink->trap_group_list and stop at the first group whose
+	 * ID equals @id; NULL is returned when no group matches.
+	 */
+	list_for_each_entry(cur, &devlink->trap_group_list, list) {
+		if (cur->group->id != id)
+			continue;
+		found = cur;
+		break;
+	}
+
+	return found;
+}
+
+static struct devlink_trap_group_item *
devlink_trap_group_item_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
@@ -5765,6 +6163,11 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
goto nla_put_failure;
+ if (group_item->policer_item &&
+ nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ group_item->policer_item->policer->id))
+ goto nla_put_failure;
+
err = devlink_trap_stats_put(msg, group_item->stats);
if (err)
goto nla_put_failure;
@@ -5866,7 +6269,7 @@ __devlink_trap_group_action_set(struct devlink *devlink,
int err;
list_for_each_entry(trap_item, &devlink->trap_list, list) {
- if (strcmp(trap_item->trap->group.name, group_name))
+ if (strcmp(trap_item->group_item->group->name, group_name))
continue;
err = __devlink_trap_action_set(devlink, trap_item,
trap_action, extack);
@@ -5880,7 +6283,7 @@ __devlink_trap_group_action_set(struct devlink *devlink,
static int
devlink_trap_group_action_set(struct devlink *devlink,
struct devlink_trap_group_item *group_item,
- struct genl_info *info)
+ struct genl_info *info, bool *p_modified)
{
enum devlink_trap_action trap_action;
int err;
@@ -5899,6 +6302,47 @@ devlink_trap_group_action_set(struct devlink *devlink,
if (err)
return err;
+ *p_modified = true;
+
+ return 0;
+}
+
+/* Bind a trap group to the policer named by DEVLINK_ATTR_TRAP_POLICER_ID.
+ *
+ * Returns 0 without doing anything when the attribute is absent. Otherwise
+ * returns -EOPNOTSUPP when the driver has no trap_group_set callback,
+ * -ENOENT when a non-zero policer ID matches no registered policer, or the
+ * error returned by the driver callback. When the lookup finds no item (only
+ * possible here for policer ID zero) the driver is called with a NULL
+ * policer. group_item->policer_item is updated only after the driver
+ * callback succeeded.
+ */
+static int devlink_trap_group_set(struct devlink *devlink,
+				  struct devlink_trap_group_item *group_item,
+				  struct genl_info *info)
+{
+	struct devlink_trap_policer_item *policer_item;
+	struct netlink_ext_ack *extack = info->extack;
+	const struct devlink_trap_policer *policer;
+	struct nlattr **attrs = info->attrs;
+	u32 policer_id;
+	int err;
+
+	if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+		return 0;
+
+	if (!devlink->ops->trap_group_set)
+		return -EOPNOTSUPP;
+
+	/* The attribute is known to be present from here on, so the policer
+	 * always comes from the request and never from the current binding.
+	 */
+	policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+	policer_item = devlink_trap_policer_item_lookup(devlink, policer_id);
+	if (policer_id && !policer_item) {
+		NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+		return -ENOENT;
+	}
+	policer = policer_item ? policer_item->policer : NULL;
+
+	err = devlink->ops->trap_group_set(devlink, group_item->group, policer);
+	if (err)
+		return err;
+
+	group_item->policer_item = policer_item;
+
	return 0;
}
@@ -5908,6 +6352,7 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_group_item *group_item;
+ bool modified = false;
int err;
if (list_empty(&devlink->trap_group_list))
@@ -5919,14 +6364,262 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
return -ENOENT;
}
- err = devlink_trap_group_action_set(devlink, group_item, info);
+ err = devlink_trap_group_action_set(devlink, group_item, info,
+ &modified);
+ if (err)
+ return err;
+
+ err = devlink_trap_group_set(devlink, group_item, info);
+ if (err)
+ goto err_trap_group_set;
+
+ return 0;
+
+err_trap_group_set:
+ if (modified)
+ NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already");
+ return err;
+}
+
+/* Resolve the policer item addressed by a netlink request. Returns NULL when
+ * DEVLINK_ATTR_TRAP_POLICER_ID is missing or when the lookup finds no item
+ * with that ID.
+ */
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_get_from_info(struct devlink *devlink,
+					struct genl_info *info)
+{
+	struct nlattr *id_attr = info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID];
+
+	if (id_attr)
+		return devlink_trap_policer_item_lookup(devlink,
+							nla_get_u32(id_attr));
+
+	return NULL;
+}
+
+/* Append a DEVLINK_ATTR_STATS nest holding the policer's drop counter.
+ *
+ * Returns 0 without adding anything when the driver has no
+ * trap_policer_counter_get callback, the callback's error when it fails, and
+ * -EMSGSIZE when the nest or the counter attribute cannot be added to @msg.
+ */
+static int
+devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink,
+			       const struct devlink_trap_policer *policer)
+{
+	struct nlattr *stats_nest;
+	u64 dropped;
+	int err;
+
+	if (!devlink->ops->trap_policer_counter_get)
+		return 0;
+
+	err = devlink->ops->trap_policer_counter_get(devlink, policer,
+						     &dropped);
+	if (err)
+		return err;
+
+	stats_nest = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+	if (!stats_nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, dropped,
+			      DEVLINK_ATTR_PAD)) {
+		/* Drop the partially built nest before reporting failure. */
+		nla_nest_cancel(msg, stats_nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(msg, stats_nest);
+
+	return 0;
+}
+
+/* Build one trap policer netlink message in @msg: devlink handle, policer ID,
+ * rate, burst and, when available, the drop counter.
+ *
+ * Returns 0 on success. Returns -EMSGSIZE when the header or one of the
+ * attributes cannot be added, and otherwise the error reported by
+ * devlink_trap_policer_stats_put(), so that a failing counter read is not
+ * reported as a message size problem. On any failure the partially built
+ * message is cancelled.
+ */
+static int
+devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink,
+			     const struct devlink_trap_policer_item *policer_item,
+			     enum devlink_command cmd, u32 portid, u32 seq,
+			     int flags)
+{
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	/* Every attribute put below fails only for lack of room. */
+	err = -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+			policer_item->policer->id))
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE,
+			      policer_item->rate, DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST,
+			      policer_item->burst, DEVLINK_ATTR_PAD))
+		goto nla_put_failure;
+
+	err = devlink_trap_policer_stats_put(msg, devlink,
+					     policer_item->policer);
+	if (err)
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/* Handle DEVLINK_CMD_TRAP_POLICER_GET for a single policer.
+ *
+ * Returns -EOPNOTSUPP when no policer is registered, -ENOENT when the
+ * requested policer is unknown, -ENOMEM when the reply cannot be allocated,
+ * the fill error when the reply cannot be built, and otherwise the result of
+ * genlmsg_reply().
+ */
+static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb,
+						struct genl_info *info)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_trap_policer_item *item;
+	struct sk_buff *reply;
+	int err;
+
+	if (list_empty(&devlink->trap_policer_list))
+		return -EOPNOTSUPP;
+
+	item = devlink_trap_policer_item_get_from_info(devlink, info);
+	if (!item) {
+		NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+		return -ENOENT;
+	}
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	err = devlink_nl_trap_policer_fill(reply, devlink, item,
+					   DEVLINK_CMD_TRAP_POLICER_NEW,
+					   info->snd_portid, info->snd_seq, 0);
+	if (!err)
+		return genlmsg_reply(reply, info);
+
+	/* The reply was never sent, so it has to be released here. */
+	nlmsg_free(reply);
+	return err;
+}
+
+/* Dump callback for trap policers: emits one DEVLINK_CMD_TRAP_POLICER_NEW
+ * message per policer of every devlink instance that lives in the network
+ * namespace of the requesting socket. cb->args[0] holds the number of
+ * entries already handled, so a dump that stopped on a fill error resumes at
+ * the entry that could not be written. Always returns msg->len.
+ */
+static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
+						  struct netlink_callback *cb)
+{
+	u32 portid = NETLINK_CB(cb->skb).portid;
+	struct devlink_trap_policer_item *item;
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		mutex_lock(&devlink->lock);
+		list_for_each_entry(item, &devlink->trap_policer_list, list) {
+			/* Entries below 'start' were sent by an earlier call. */
+			if (idx >= start) {
+				err = devlink_nl_trap_policer_fill(msg, devlink,
+						item,
+						DEVLINK_CMD_TRAP_POLICER_NEW,
+						portid, cb->nlh->nlmsg_seq,
+						NLM_F_MULTI);
+				if (err) {
+					mutex_unlock(&devlink->lock);
+					goto out;
+				}
+			}
+			idx++;
+		}
+		mutex_unlock(&devlink->lock);
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+/* Apply a new rate and/or burst size to a trap policer.
+ *
+ * Values missing from the request default to the ones currently stored in
+ * @policer_item. Both values are checked against the policer's min/max
+ * limits (-EINVAL with an extack message on violation) before the driver's
+ * trap_policer_set callback is invoked. The stored values are replaced only
+ * when the driver returned success.
+ */
+static int
+devlink_trap_policer_set(struct devlink *devlink,
+			 struct devlink_trap_policer_item *policer_item,
+			 struct genl_info *info)
+{
+	const struct devlink_trap_policer *policer = policer_item->policer;
+	struct netlink_ext_ack *extack = info->extack;
+	struct nlattr *rate_attr, *burst_attr;
+	u64 burst = policer_item->burst;
+	u64 rate = policer_item->rate;
+	int err;
+
+	rate_attr = info->attrs[DEVLINK_ATTR_TRAP_POLICER_RATE];
+	if (rate_attr)
+		rate = nla_get_u64(rate_attr);
+
+	burst_attr = info->attrs[DEVLINK_ATTR_TRAP_POLICER_BURST];
+	if (burst_attr)
+		burst = nla_get_u64(burst_attr);
+
+	if (rate < policer->min_rate) {
+		NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit");
+		return -EINVAL;
+	}
+
+	if (rate > policer->max_rate) {
+		NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit");
+		return -EINVAL;
+	}
+
+	if (burst < policer->min_burst) {
+		NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit");
+		return -EINVAL;
+	}
+
+	if (burst > policer->max_burst) {
+		NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit");
+		return -EINVAL;
+	}
+
+	err = devlink->ops->trap_policer_set(devlink, policer, rate, burst,
+					     extack);
	if (err)
		return err;
+	policer_item->rate = rate;
+	policer_item->burst = burst;
+
	return 0;
}
+/* Handle DEVLINK_CMD_TRAP_POLICER_SET.
+ *
+ * Returns -EOPNOTSUPP when no policer is registered or the driver has no
+ * trap_policer_set callback, -ENOENT when the requested policer is unknown,
+ * and otherwise the result of devlink_trap_policer_set().
+ */
+static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb,
+						struct genl_info *info)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_trap_policer_item *item;
+
+	if (list_empty(&devlink->trap_policer_list) ||
+	    !devlink->ops->trap_policer_set)
+		return -EOPNOTSUPP;
+
+	item = devlink_trap_policer_item_get_from_info(devlink, info);
+	if (item)
+		return devlink_trap_policer_set(devlink, item, info);
+
+	NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+	return -ENOENT;
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+ [DEVLINK_ATTR_UNSPEC] = { .strict_start_type =
+ DEVLINK_ATTR_TRAP_POLICER_ID },
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
@@ -5951,6 +6644,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64 },
[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 },
@@ -5962,6 +6657,10 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 },
[DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 },
[DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -6085,7 +6784,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_eswitch_get_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_ESWITCH_SET,
@@ -6184,6 +6884,13 @@ static const struct genl_ops devlink_nl_ops[] = {
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
+ .cmd = DEVLINK_CMD_REGION_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = devlink_nl_cmd_region_new,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
.cmd = DEVLINK_CMD_REGION_DEL,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_region_del,
@@ -6289,6 +6996,19 @@ static const struct genl_ops devlink_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_GET,
+ .doit = devlink_nl_cmd_trap_policer_get_doit,
+ .dumpit = devlink_nl_cmd_trap_policer_get_dumpit,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_SET,
+ .doit = devlink_nl_cmd_trap_policer_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
static struct genl_family devlink_nl_family __ro_after_init = {
@@ -6326,6 +7046,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
if (!devlink)
return NULL;
devlink->ops = ops;
+ xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
__devlink_net_set(devlink, &init_net);
INIT_LIST_HEAD(&devlink->port_list);
INIT_LIST_HEAD(&devlink->sb_list);
@@ -6336,6 +7057,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
INIT_LIST_HEAD(&devlink->reporter_list);
INIT_LIST_HEAD(&devlink->trap_list);
INIT_LIST_HEAD(&devlink->trap_group_list);
+ INIT_LIST_HEAD(&devlink->trap_policer_list);
mutex_init(&devlink->lock);
mutex_init(&devlink->reporters_lock);
return devlink;
@@ -6420,6 +7142,7 @@ void devlink_free(struct devlink *devlink)
{
mutex_destroy(&devlink->reporters_lock);
mutex_destroy(&devlink->lock);
+ WARN_ON(!list_empty(&devlink->trap_policer_list));
WARN_ON(!list_empty(&devlink->trap_group_list));
WARN_ON(!list_empty(&devlink->trap_list));
WARN_ON(!list_empty(&devlink->reporter_list));
@@ -6430,6 +7153,8 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->sb_list));
WARN_ON(!list_empty(&devlink->port_list));
+ xa_destroy(&devlink->snapshot_ids);
+
kfree(devlink);
}
EXPORT_SYMBOL_GPL(devlink_free);
@@ -6725,6 +7450,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
if (!attrs->split)
n = snprintf(name, len, "p%u", attrs->phys.port_number);
else
@@ -6854,7 +7580,7 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
rcu_read_lock();
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
enabled = false;
if (table)
enabled = table->counters_enabled;
@@ -6878,26 +7604,34 @@ int devlink_dpipe_table_register(struct devlink *devlink,
void *priv, bool counter_control_extern)
{
struct devlink_dpipe_table *table;
-
- if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name))
- return -EEXIST;
+ int err = 0;
if (WARN_ON(!table_ops->size_get))
return -EINVAL;
+ mutex_lock(&devlink->lock);
+
+ if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name,
+ devlink)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
table = kzalloc(sizeof(*table), GFP_KERNEL);
- if (!table)
- return -ENOMEM;
+ if (!table) {
+ err = -ENOMEM;
+ goto unlock;
+ }
table->name = table_name;
table->table_ops = table_ops;
table->priv = priv;
table->counter_control_extern = counter_control_extern;
- mutex_lock(&devlink->lock);
list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
+unlock:
mutex_unlock(&devlink->lock);
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
@@ -6914,7 +7648,7 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
mutex_lock(&devlink->lock);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
if (!table)
goto unlock;
list_del_rcu(&table->list);
@@ -7071,7 +7805,7 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink,
mutex_lock(&devlink->lock);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
+ table_name, devlink);
if (!table) {
err = -EINVAL;
goto out;
@@ -7536,21 +8270,24 @@ EXPORT_SYMBOL_GPL(devlink_param_value_str_fill);
* devlink_region_create - create a new address region
*
* @devlink: devlink
- * @region_name: region name
+ * @ops: region operations and name
* @region_max_snapshots: Maximum supported number of snapshots for region
* @region_size: size of region
*/
-struct devlink_region *devlink_region_create(struct devlink *devlink,
- const char *region_name,
- u32 region_max_snapshots,
- u64 region_size)
+struct devlink_region *
+devlink_region_create(struct devlink *devlink,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
{
struct devlink_region *region;
int err = 0;
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
mutex_lock(&devlink->lock);
- if (devlink_region_get_by_name(devlink, region_name)) {
+ if (devlink_region_get_by_name(devlink, ops->name)) {
err = -EEXIST;
goto unlock;
}
@@ -7563,7 +8300,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink,
region->devlink = devlink;
region->max_snapshots = region_max_snapshots;
- region->name = region_name;
+ region->ops = ops;
region->size = region_size;
INIT_LIST_HEAD(&region->snapshot_list);
list_add_tail(&region->list, &devlink->region_list);
@@ -7609,75 +8346,66 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy);
* Driver should use the same id for multiple snapshots taken
* on multiple regions at the same time/by the same trigger.
*
+ * The caller of this function must use devlink_region_snapshot_id_put
+ * when finished creating snapshots using this id.
+ *
+ * Returns zero on success, or a negative error code on failure.
+ *
* @devlink: devlink
+ * @id: storage to return id
*/
-u32 devlink_region_snapshot_id_get(struct devlink *devlink)
+int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
{
- u32 id;
+ int err;
mutex_lock(&devlink->lock);
- id = ++devlink->snapshot_id;
+ err = __devlink_region_snapshot_id_get(devlink, id);
mutex_unlock(&devlink->lock);
- return id;
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
/**
+ * devlink_region_snapshot_id_put - put snapshot ID reference
+ *
+ * This should be called by a driver after finishing creating snapshots
+ * with an id. Doing so ensures that the ID can later be released in the
+ * event that all snapshots using it have been destroyed.
+ *
+ * @devlink: devlink
+ * @id: id to release reference on
+ */
+void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id)
+{
+	struct mutex *lock = &devlink->lock;
+
+	/* The decrement helper is called with devlink->lock held. */
+	mutex_lock(lock);
+	__devlink_snapshot_id_decrement(devlink, id);
+	mutex_unlock(lock);
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put);
+
+/**
* devlink_region_snapshot_create - create a new snapshot
* This will add a new snapshot of a region. The snapshot
* will be stored on the region struct and can be accessed
- * from devlink. This is useful for future analyses of snapshots.
+ * from devlink. This is useful for future analyses of snapshots.
* Multiple snapshots can be created on a region.
* The @snapshot_id should be obtained using the getter function.
*
* @region: devlink region of the snapshot
* @data: snapshot data
* @snapshot_id: snapshot id to be created
- * @data_destructor: pointer to destructor function to free data
*/
int devlink_region_snapshot_create(struct devlink_region *region,
- u8 *data, u32 snapshot_id,
- devlink_snapshot_data_dest_t *data_destructor)
+ u8 *data, u32 snapshot_id)
{
struct devlink *devlink = region->devlink;
- struct devlink_snapshot *snapshot;
int err;
mutex_lock(&devlink->lock);
-
- /* check if region can hold one more snapshot */
- if (region->cur_snapshots == region->max_snapshots) {
- err = -ENOMEM;
- goto unlock;
- }
-
- if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
- err = -EEXIST;
- goto unlock;
- }
-
- snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
- if (!snapshot) {
- err = -ENOMEM;
- goto unlock;
- }
-
- snapshot->id = snapshot_id;
- snapshot->region = region;
- snapshot->data = data;
- snapshot->data_destructor = data_destructor;
-
- list_add_tail(&snapshot->list, &region->snapshot_list);
-
- region->cur_snapshots++;
-
- devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
mutex_unlock(&devlink->lock);
- return 0;
-unlock:
- mutex_unlock(&devlink->lock);
return err;
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
@@ -7717,6 +8445,8 @@ static const struct devlink_trap devlink_trap_generic[] = {
DEVLINK_TRAP(NON_ROUTABLE, DROP),
DEVLINK_TRAP(DECAP_ERROR, EXCEPTION),
DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP),
+ DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP),
};
#define DEVLINK_TRAP_GROUP(_id) \
@@ -7730,6 +8460,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = {
DEVLINK_TRAP_GROUP(L3_DROPS),
DEVLINK_TRAP_GROUP(BUFFER_DROPS),
DEVLINK_TRAP_GROUP(TUNNEL_DROPS),
+ DEVLINK_TRAP_GROUP(ACL_DROPS),
};
static int devlink_trap_generic_verify(const struct devlink_trap *trap)
@@ -7763,7 +8494,7 @@ static int devlink_trap_driver_verify(const struct devlink_trap *trap)
static int devlink_trap_verify(const struct devlink_trap *trap)
{
- if (!trap || !trap->name || !trap->group.name)
+ if (!trap || !trap->name)
return -EINVAL;
if (trap->generic)
@@ -7834,108 +8565,22 @@ devlink_trap_group_notify(struct devlink *devlink,
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
-static struct devlink_trap_group_item *
-devlink_trap_group_item_create(struct devlink *devlink,
- const struct devlink_trap_group *group)
-{
- struct devlink_trap_group_item *group_item;
- int err;
-
- err = devlink_trap_group_verify(group);
- if (err)
- return ERR_PTR(err);
-
- group_item = kzalloc(sizeof(*group_item), GFP_KERNEL);
- if (!group_item)
- return ERR_PTR(-ENOMEM);
-
- group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
- if (!group_item->stats) {
- err = -ENOMEM;
- goto err_stats_alloc;
- }
-
- group_item->group = group;
- refcount_set(&group_item->refcount, 1);
-
- if (devlink->ops->trap_group_init) {
- err = devlink->ops->trap_group_init(devlink, group);
- if (err)
- goto err_group_init;
- }
-
- list_add_tail(&group_item->list, &devlink->trap_group_list);
- devlink_trap_group_notify(devlink, group_item,
- DEVLINK_CMD_TRAP_GROUP_NEW);
-
- return group_item;
-
-err_group_init:
- free_percpu(group_item->stats);
-err_stats_alloc:
- kfree(group_item);
- return ERR_PTR(err);
-}
-
-static void
-devlink_trap_group_item_destroy(struct devlink *devlink,
- struct devlink_trap_group_item *group_item)
-{
- devlink_trap_group_notify(devlink, group_item,
- DEVLINK_CMD_TRAP_GROUP_DEL);
- list_del(&group_item->list);
- free_percpu(group_item->stats);
- kfree(group_item);
-}
-
-static struct devlink_trap_group_item *
-devlink_trap_group_item_get(struct devlink *devlink,
- const struct devlink_trap_group *group)
-{
- struct devlink_trap_group_item *group_item;
-
- group_item = devlink_trap_group_item_lookup(devlink, group->name);
- if (group_item) {
- refcount_inc(&group_item->refcount);
- return group_item;
- }
-
- return devlink_trap_group_item_create(devlink, group);
-}
-
-static void
-devlink_trap_group_item_put(struct devlink *devlink,
- struct devlink_trap_group_item *group_item)
-{
- if (!refcount_dec_and_test(&group_item->refcount))
- return;
-
- devlink_trap_group_item_destroy(devlink, group_item);
-}
-
static int
devlink_trap_item_group_link(struct devlink *devlink,
struct devlink_trap_item *trap_item)
{
+ u16 group_id = trap_item->trap->init_group_id;
struct devlink_trap_group_item *group_item;
- group_item = devlink_trap_group_item_get(devlink,
- &trap_item->trap->group);
- if (IS_ERR(group_item))
- return PTR_ERR(group_item);
+ group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id);
+ if (WARN_ON_ONCE(!group_item))
+ return -EINVAL;
trap_item->group_item = group_item;
return 0;
}
-static void
-devlink_trap_item_group_unlink(struct devlink *devlink,
- struct devlink_trap_item *trap_item)
-{
- devlink_trap_group_item_put(devlink, trap_item->group_item);
-}
-
static void devlink_trap_notify(struct devlink *devlink,
const struct devlink_trap_item *trap_item,
enum devlink_command cmd)
@@ -7998,7 +8643,6 @@ devlink_trap_register(struct devlink *devlink,
return 0;
err_trap_init:
- devlink_trap_item_group_unlink(devlink, trap_item);
err_group_link:
free_percpu(trap_item->stats);
err_stats_alloc:
@@ -8019,7 +8663,6 @@ static void devlink_trap_unregister(struct devlink *devlink,
list_del(&trap_item->list);
if (devlink->ops->trap_fini)
devlink->ops->trap_fini(devlink, trap, trap_item);
- devlink_trap_item_group_unlink(devlink, trap_item);
free_percpu(trap_item->stats);
kfree(trap_item);
}
@@ -8121,12 +8764,14 @@ devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats,
static void
devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata,
const struct devlink_trap_item *trap_item,
- struct devlink_port *in_devlink_port)
+ struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
{
struct devlink_trap_group_item *group_item = trap_item->group_item;
hw_metadata->trap_group_name = group_item->group->name;
hw_metadata->trap_name = trap_item->trap->name;
+ hw_metadata->fa_cookie = fa_cookie;
spin_lock(&in_devlink_port->type_lock);
if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
@@ -8140,9 +8785,12 @@ devlink_trap_report_metadata_fill(struct net_dm_hw_metadata *hw_metadata,
* @skb: Trapped packet.
* @trap_ctx: Trap context.
* @in_devlink_port: Input devlink port.
+ * @fa_cookie: Flow action cookie. Could be NULL.
*/
void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
- void *trap_ctx, struct devlink_port *in_devlink_port)
+ void *trap_ctx, struct devlink_port *in_devlink_port,
+ const struct flow_action_cookie *fa_cookie)
+
{
struct devlink_trap_item *trap_item = trap_ctx;
struct net_dm_hw_metadata hw_metadata = {};
@@ -8151,7 +8799,7 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
devlink_trap_stats_update(trap_item->group_item->stats, skb->len);
devlink_trap_report_metadata_fill(&hw_metadata, trap_item,
- in_devlink_port);
+ in_devlink_port, fa_cookie);
net_dm_hw_report(skb, &hw_metadata);
}
EXPORT_SYMBOL_GPL(devlink_trap_report);
@@ -8170,6 +8818,288 @@ void *devlink_trap_ctx_priv(void *trap_ctx)
}
EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv);
+/* Attach the policer selected by the group's init_policer_id to @group_item.
+ * A zero ID leaves the item untouched. Returns 0 on success, or -EINVAL
+ * (after a one-time warning) when no policer with that ID is found.
+ */
+static int
+devlink_trap_group_item_policer_link(struct devlink *devlink,
+				     struct devlink_trap_group_item *group_item)
+{
+	struct devlink_trap_policer_item *item;
+	u32 id = group_item->group->init_policer_id;
+
+	if (id) {
+		item = devlink_trap_policer_item_lookup(devlink, id);
+		if (WARN_ON_ONCE(!item))
+			return -EINVAL;
+		group_item->policer_item = item;
+	}
+
+	return 0;
+}
+
+/* Allocate and publish the bookkeeping item for @group: per-CPU stats,
+ * policer binding and the driver's optional trap_group_init() callback.
+ * Fails with -EEXIST when a group with the same name is already known.
+ */
+static int
+devlink_trap_group_register(struct devlink *devlink,
+			    const struct devlink_trap_group *group)
+{
+	struct devlink_trap_group_item *item;
+	int err;
+
+	if (devlink_trap_group_item_lookup(devlink, group->name))
+		return -EEXIST;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+	if (!item->stats) {
+		err = -ENOMEM;
+		goto free_item;
+	}
+
+	item->group = group;
+
+	/* The driver callback only runs once the policer link succeeded. */
+	err = devlink_trap_group_item_policer_link(devlink, item);
+	if (!err && devlink->ops->trap_group_init)
+		err = devlink->ops->trap_group_init(devlink, group);
+	if (err)
+		goto free_stats;
+
+	list_add_tail(&item->list, &devlink->trap_group_list);
+	devlink_trap_group_notify(devlink, item, DEVLINK_CMD_TRAP_GROUP_NEW);
+
+	return 0;
+
+free_stats:
+	free_percpu(item->stats);
+free_item:
+	kfree(item);
+	return err;
+}
+
+/* Tear down the item registered for @group: announce the deletion, drop it
+ * from the devlink's group list and free its memory. Warns once and bails
+ * out if no item is found under the group's name.
+ */
+static void
+devlink_trap_group_unregister(struct devlink *devlink,
+			      const struct devlink_trap_group *group)
+{
+	struct devlink_trap_group_item *item =
+		devlink_trap_group_item_lookup(devlink, group->name);
+
+	if (WARN_ON_ONCE(!item))
+		return;
+
+	devlink_trap_group_notify(devlink, item, DEVLINK_CMD_TRAP_GROUP_DEL);
+	list_del(&item->list);
+	free_percpu(item->stats);
+	kfree(item);
+}
+
+/**
+ * devlink_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Array of packet trap groups.
+ * @groups_count: Number of entries in @groups.
+ *
+ * Groups are verified and registered in array order while holding the
+ * devlink lock. If one of them fails, the groups registered before it are
+ * unregistered again in reverse order.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_trap_groups_register(struct devlink *devlink,
+				 const struct devlink_trap_group *groups,
+				 size_t groups_count)
+{
+	int err = 0;
+	int i;
+
+	mutex_lock(&devlink->lock);
+	for (i = 0; i < groups_count; i++) {
+		err = devlink_trap_group_verify(&groups[i]);
+		if (!err)
+			err = devlink_trap_group_register(devlink, &groups[i]);
+		if (err)
+			break;
+	}
+
+	/* Roll back everything registered before the failing entry. */
+	if (err) {
+		while (--i >= 0)
+			devlink_trap_group_unregister(devlink, &groups[i]);
+	}
+	mutex_unlock(&devlink->lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_register);
+
+/**
+ * devlink_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Array of packet trap groups.
+ * @groups_count: Number of entries in @groups.
+ *
+ * The array is walked from its last entry to its first while holding the
+ * devlink lock.
+ */
+void devlink_trap_groups_unregister(struct devlink *devlink,
+				    const struct devlink_trap_group *groups,
+				    size_t groups_count)
+{
+	int i = groups_count - 1;
+
+	mutex_lock(&devlink->lock);
+	while (i >= 0) {
+		devlink_trap_group_unregister(devlink, &groups[i]);
+		i--;
+	}
+	mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister);
+
+/* Multicast a trap policer notification for @policer_item to the devlink
+ * config group. @cmd is expected to be DEVLINK_CMD_TRAP_POLICER_NEW or
+ * DEVLINK_CMD_TRAP_POLICER_DEL; anything else triggers a one-time warning.
+ * If the message cannot be allocated or filled, nothing is sent.
+ */
+static void
+devlink_trap_policer_notify(struct devlink *devlink,
+			    const struct devlink_trap_policer_item *policer_item,
+			    enum devlink_command cmd)
+{
+	struct sk_buff *msg;
+
+	WARN_ON_ONCE(!(cmd == DEVLINK_CMD_TRAP_POLICER_NEW ||
+		       cmd == DEVLINK_CMD_TRAP_POLICER_DEL));
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	if (devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd,
+					 0, 0, 0)) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+/* Create the bookkeeping item for @policer, seed its rate and burst from
+ * the policer's initial values, run the driver's optional
+ * trap_policer_init() callback and publish the item. Fails with -EEXIST
+ * when a policer with the same ID is already registered.
+ */
+static int
+devlink_trap_policer_register(struct devlink *devlink,
+			      const struct devlink_trap_policer *policer)
+{
+	struct devlink_trap_policer_item *item;
+	int err = 0;
+
+	if (devlink_trap_policer_item_lookup(devlink, policer->id))
+		return -EEXIST;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->policer = policer;
+	item->rate = policer->init_rate;
+	item->burst = policer->init_burst;
+
+	if (devlink->ops->trap_policer_init)
+		err = devlink->ops->trap_policer_init(devlink, policer);
+	if (err) {
+		kfree(item);
+		return err;
+	}
+
+	list_add_tail(&item->list, &devlink->trap_policer_list);
+	devlink_trap_policer_notify(devlink, item,
+				    DEVLINK_CMD_TRAP_POLICER_NEW);
+
+	return 0;
+}
+
+/* Undo devlink_trap_policer_register() for @policer: notify the deletion,
+ * unlink the item, call the driver's optional trap_policer_fini() callback
+ * and free the item. Warns once and returns if the ID is not registered.
+ */
+static void
+devlink_trap_policer_unregister(struct devlink *devlink,
+				const struct devlink_trap_policer *policer)
+{
+	struct devlink_trap_policer_item *item =
+		devlink_trap_policer_item_lookup(devlink, policer->id);
+
+	if (WARN_ON_ONCE(!item))
+		return;
+
+	devlink_trap_policer_notify(devlink, item,
+				    DEVLINK_CMD_TRAP_POLICER_DEL);
+	list_del(&item->list);
+	if (devlink->ops->trap_policer_fini)
+		devlink->ops->trap_policer_fini(devlink, policer);
+	kfree(item);
+}
+
+/**
+ * devlink_trap_policers_register - Register packet trap policers with devlink.
+ * @devlink: devlink.
+ * @policers: Array of packet trap policers.
+ * @policers_count: Number of entries in @policers.
+ *
+ * Each policer must have a non-zero ID and maximum rate/burst values that
+ * are not below the corresponding minimums; otherwise a warning is emitted
+ * and -EINVAL is returned. On any failure the policers registered so far
+ * are unregistered again in reverse order.
+ *
+ * Return: Non-zero value on failure.
+ */
+int
+devlink_trap_policers_register(struct devlink *devlink,
+			       const struct devlink_trap_policer *policers,
+			       size_t policers_count)
+{
+	int err = 0;
+	int i;
+
+	mutex_lock(&devlink->lock);
+	for (i = 0; i < policers_count; i++) {
+		const struct devlink_trap_policer *p = &policers[i];
+
+		if (WARN_ON(p->id == 0 ||
+			    p->max_rate < p->min_rate ||
+			    p->max_burst < p->min_burst))
+			err = -EINVAL;
+		else
+			err = devlink_trap_policer_register(devlink, p);
+		if (err)
+			break;
+	}
+
+	/* Roll back everything registered before the failing entry. */
+	if (err) {
+		while (--i >= 0)
+			devlink_trap_policer_unregister(devlink, &policers[i]);
+	}
+	mutex_unlock(&devlink->lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_policers_register);
+
+/**
+ * devlink_trap_policers_unregister - Unregister packet trap policers from devlink.
+ * @devlink: devlink.
+ * @policers: Array of packet trap policers.
+ * @policers_count: Number of entries in @policers.
+ *
+ * The array is walked from its last entry to its first while holding the
+ * devlink lock.
+ */
+void
+devlink_trap_policers_unregister(struct devlink *devlink,
+				 const struct devlink_trap_policer *policers,
+				 size_t policers_count)
+{
+	int i = policers_count - 1;
+
+	mutex_lock(&devlink->lock);
+	while (i >= 0) {
+		devlink_trap_policer_unregister(devlink, &policers[i]);
+		i--;
+	}
+	mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_policers_unregister);
+
static void __devlink_compat_running_version(struct devlink *devlink,
char *buf, size_t len)
{
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 31700e0c3928..8e33cec9fc4e 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -29,6 +29,7 @@
#include <net/drop_monitor.h>
#include <net/genetlink.h>
#include <net/netevent.h>
+#include <net/flow_offload.h>
#include <trace/events/skb.h>
#include <trace/events/napi.h>
@@ -67,7 +68,7 @@ struct net_dm_hw_entry {
struct net_dm_hw_entries {
u32 num_entries;
- struct net_dm_hw_entry entries[0];
+ struct net_dm_hw_entry entries[];
};
struct per_cpu_dm_data {
@@ -701,6 +702,13 @@ static void net_dm_packet_work(struct work_struct *work)
}
static size_t
+net_dm_flow_action_cookie_size(const struct net_dm_hw_metadata *hw_metadata)
+{
+	const struct flow_action_cookie *fa_cookie = hw_metadata->fa_cookie;
+
+	/* No cookie attached: the attribute contributes no bytes. */
+	if (!fa_cookie)
+		return 0;
+
+	return nla_total_size(fa_cookie->cookie_len);
+}
+
+static size_t
net_dm_hw_packet_report_size(size_t payload_len,
const struct net_dm_hw_metadata *hw_metadata)
{
@@ -717,6 +725,8 @@ net_dm_hw_packet_report_size(size_t payload_len,
nla_total_size(strlen(hw_metadata->trap_name) + 1) +
/* NET_DM_ATTR_IN_PORT */
net_dm_in_port_size() +
+ /* NET_DM_ATTR_FLOW_ACTION_COOKIE */
+ net_dm_flow_action_cookie_size(hw_metadata) +
/* NET_DM_ATTR_TIMESTAMP */
nla_total_size(sizeof(u64)) +
/* NET_DM_ATTR_ORIG_LEN */
@@ -762,6 +772,12 @@ static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
goto nla_put_failure;
}
+ if (hw_metadata->fa_cookie &&
+ nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE,
+ hw_metadata->fa_cookie->cookie_len,
+ hw_metadata->fa_cookie->cookie))
+ goto nla_put_failure;
+
if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
goto nla_put_failure;
@@ -794,11 +810,12 @@ nla_put_failure:
static struct net_dm_hw_metadata *
net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata)
{
+ const struct flow_action_cookie *fa_cookie;
struct net_dm_hw_metadata *n_hw_metadata;
const char *trap_group_name;
const char *trap_name;
- n_hw_metadata = kmalloc(sizeof(*hw_metadata), GFP_ATOMIC);
+ n_hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC);
if (!n_hw_metadata)
return NULL;
@@ -812,12 +829,25 @@ net_dm_hw_metadata_clone(const struct net_dm_hw_metadata *hw_metadata)
goto free_trap_group;
n_hw_metadata->trap_name = trap_name;
+ if (hw_metadata->fa_cookie) {
+ size_t cookie_size = sizeof(*fa_cookie) +
+ hw_metadata->fa_cookie->cookie_len;
+
+ fa_cookie = kmemdup(hw_metadata->fa_cookie, cookie_size,
+ GFP_ATOMIC);
+ if (!fa_cookie)
+ goto free_trap_name;
+ n_hw_metadata->fa_cookie = fa_cookie;
+ }
+
n_hw_metadata->input_dev = hw_metadata->input_dev;
if (n_hw_metadata->input_dev)
dev_hold(n_hw_metadata->input_dev);
return n_hw_metadata;
+free_trap_name:
+ kfree(trap_name);
free_trap_group:
kfree(trap_group_name);
free_hw_metadata:
@@ -830,6 +860,7 @@ net_dm_hw_metadata_free(const struct net_dm_hw_metadata *hw_metadata)
{
if (hw_metadata->input_dev)
dev_put(hw_metadata->input_dev);
+ kfree(hw_metadata->fa_cookie);
kfree(hw_metadata->trap_name);
kfree(hw_metadata->trap_group_name);
kfree(hw_metadata);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 3e7e15278c46..bd7eba9066f8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -974,7 +974,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
frh = nlmsg_data(nlh);
frh->family = ops->family;
- frh->table = rule->table;
+ frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT;
if (nla_put_u32(skb, FRA_TABLE, rule->table))
goto nla_put_failure;
if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
diff --git a/net/core/filter.c b/net/core/filter.c
index c180871e606d..7628b947dbc3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2642,6 +2642,19 @@ static const struct bpf_func_proto bpf_msg_pop_data_proto = {
.arg4_type = ARG_ANYTHING,
};
+#ifdef CONFIG_CGROUP_NET_CLASSID
+/* Argument-less helper: returns __task_get_classid() of the current task. */
+BPF_CALL_0(bpf_get_cgroup_classid_curr)
+{
+ return __task_get_classid(current);
+}
+
+/* Helper description: no arguments, integer return, not GPL-only. */
+static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+ .func = bpf_get_cgroup_classid_curr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+#endif
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -3626,7 +3639,6 @@ err:
_trace_xdp_redirect_err(dev, xdp_prog, index, err);
return err;
}
-EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
@@ -4062,7 +4074,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
- if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+ if (unlikely(!xdp ||
+ xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
return -EFAULT;
return bpf_event_output(map, flags, meta, meta_size, xdp->data,
@@ -4080,6 +4093,19 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+/* BTF ID slots referenced by bpf_xdp_output_proto.btf_id (one per argument). */
+static int bpf_xdp_output_btf_ids[5];
+/* Non-static variant of the XDP event output helper: it reuses
+ * bpf_xdp_event_output() as its implementation but takes its first
+ * argument as a BTF ID pointer (ARG_PTR_TO_BTF_ID).
+ */
+const struct bpf_func_proto bpf_xdp_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+ .btf_id = bpf_xdp_output_btf_ids,
+};
+
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
return skb->sk ? sock_gen_cookie(skb->sk) : 0;
@@ -4104,6 +4130,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+/* Socket cookie helper whose context argument is the struct sock itself:
+ * returns sock_gen_cookie() of @ctx.
+ */
+BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
+{
+ return sock_gen_cookie(ctx);
+}
+
+/* Helper description: one context pointer argument, integer return,
+ * not GPL-only.
+ */
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
+ .func = bpf_get_socket_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
return sock_gen_cookie(ctx->sk);
@@ -4116,6 +4154,39 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+/* Return the network namespace cookie for @sk, falling back to init_net
+ * when @sk is NULL. Without CONFIG_NET_NS the result is always 0.
+ */
+static u64 __bpf_get_netns_cookie(struct sock *sk)
+{
+#ifdef CONFIG_NET_NS
+	struct net *net = sk ? sk->sk_net.net : &init_net;
+
+	return net_gen_cookie(net);
+#else
+	return 0;
+#endif
+}
+
+/* Netns cookie helper whose context argument is the struct sock itself.
+ * @ctx is passed straight to __bpf_get_netns_cookie(), which accepts NULL.
+ */
+BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx);
+}
+
+/* Helper description: the context argument may be NULL
+ * (ARG_PTR_TO_CTX_OR_NULL); integer return; not GPL-only.
+ */
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
+ .func = bpf_get_netns_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+/* Netns cookie helper for a struct bpf_sock_addr_kern context: uses
+ * ctx->sk when @ctx is non-NULL and passes NULL on otherwise.
+ */
+BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+/* Helper description: the context argument may be NULL
+ * (ARG_PTR_TO_CTX_OR_NULL); integer return; not GPL-only.
+ */
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
+ .func = bpf_get_netns_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
struct sock *sk = sk_to_full_sk(skb->sk);
@@ -4134,8 +4205,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
- struct bpf_map *, map, u64, flags, void *, data, u64, size)
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags,
+ void *, data, u64, size)
{
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -4143,8 +4214,8 @@ BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
}
-static const struct bpf_func_proto bpf_sockopt_event_output_proto = {
- .func = bpf_sockopt_event_output,
+static const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -5330,8 +5401,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- /* Only full sockets have sk->sk_flags. */
- if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
+ if (sk_is_refcounted(sk))
sock_gen_put(sk);
return 0;
}
@@ -5847,6 +5917,36 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+/* Associate @sk with @skb: the skb is orphaned, skb->sk is pointed at @sk
+ * and sock_pfree is installed as the destructor.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL          @flags is non-zero
+ *   -EOPNOTSUPP      skb_at_tc_ingress() is false for @skb
+ *   -ENETUNREACH     the skb's device and @sk are in different netns
+ *   -ESOCKTNOSUPPORT @sk has sk_reuseport set
+ *   -ENOENT          @sk is refcounted and its refcount already hit zero
+ */
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+ if (flags != 0)
+ return -EINVAL;
+ if (!skb_at_tc_ingress(skb))
+ return -EOPNOTSUPP;
+ if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ return -ENETUNREACH;
+ if (unlikely(sk->sk_reuseport))
+ return -ESOCKTNOSUPPORT;
+ /* Only refcounted sockets get a reference taken on behalf of the skb. */
+ if (sk_is_refcounted(sk) &&
+ unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ return -ENOENT;
+
+ /* Drop any previous owner before installing the new socket. */
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_pfree;
+
+ return 0;
+}
+
+/* Helper description: (ctx, socket-common pointer, flags); integer return;
+ * not GPL-only.
+ */
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+ .func = bpf_sk_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -5941,6 +6041,26 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -5965,8 +6085,26 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_addr_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_addr_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sock_addr_sk_lookup_tcp_proto;
@@ -6140,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_ecn_set_ce_proto;
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_assign_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -6209,7 +6349,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
- return &bpf_sockopt_event_output_proto;
+ return &bpf_event_output_data_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
@@ -7140,6 +7280,27 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+/* Emit the instructions that leave skb_shinfo(skb) in si->dst_reg, where
+ * the skb pointer is in si->src_reg.
+ *
+ * With NET_SKBUFF_DATA_USES_OFFSET, skb->end is loaded into BPF_REG_AX and
+ * added to skb->head (three instructions). Otherwise skb->end is loaded
+ * directly into si->dst_reg (one instruction).
+ *
+ * Returns @insn advanced past the emitted instructions.
+ */
+static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb_shinfo(SKB); */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, end));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, head));
+ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+#else
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, end));
+#endif
+
+ return insn;
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -7462,26 +7623,21 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, gso_segs):
- /* si->dst_reg = skb_shinfo(SKB); */
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
- BPF_REG_AX, si->src_reg,
- offsetof(struct sk_buff, end));
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
- si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, head));
- *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
-#else
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
- si->dst_reg, si->src_reg,
- offsetof(struct sk_buff, end));
-#endif
+ insn = bpf_convert_shinfo_access(si, insn);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
si->dst_reg, si->dst_reg,
bpf_target_off(struct skb_shared_info,
gso_segs, 2,
target_size));
break;
+ case offsetof(struct __sk_buff, gso_size):
+ insn = bpf_convert_shinfo_access(si, insn);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ gso_size, 2,
+ target_size));
+ break;
case offsetof(struct __sk_buff, wire_len):
BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
@@ -8620,6 +8776,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
struct bpf_map *, map, void *, key, u32, flags)
{
+ bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
struct sock_reuseport *reuse;
struct sock *selected_sk;
@@ -8628,26 +8785,20 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
return -ENOENT;
reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
- if (!reuse)
- /* selected_sk is unhashed (e.g. by close()) after the
- * above map_lookup_elem(). Treat selected_sk has already
- * been removed from the map.
+ if (!reuse) {
+ /* reuseport_array has only sk with non NULL sk_reuseport_cb.
+ * The only (!reuse) case here is - the sk has already been
+ * unhashed (e.g. by close()), so treat it as -ENOENT.
+ *
+ * Other maps (e.g. sock_map) do not provide this guarantee and
+ * the sk may never be in the reuseport group to begin with.
*/
- return -ENOENT;
+ return is_sockarray ? -ENOENT : -EINVAL;
+ }
if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
- struct sock *sk;
-
- if (unlikely(!reuse_kern->reuseport_id))
- /* There is a small race between adding the
- * sk to the map and setting the
- * reuse_kern->reuseport_id.
- * Treat it as the sk has not been added to
- * the bpf map yet.
- */
- return -ENOENT;
+ struct sock *sk = reuse_kern->sk;
- sk = reuse_kern->sk;
if (sk->sk_protocol != selected_sk->sk_protocol)
return -EPROTOTYPE;
else if (sk->sk_family != selected_sk->sk_family)
@@ -8835,10 +8986,9 @@ const struct bpf_prog_ops sk_reuseport_prog_ops = {
};
#endif /* CONFIG_INET */
-DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp)
+DEFINE_BPF_DISPATCHER(xdp)
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
- bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp),
- prev_prog, prog);
+ bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index a1670dff0629..3eff84824c8b 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -920,9 +920,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
(int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
flow_keys->flags = flags;
- preempt_disable();
- result = BPF_PROG_RUN(prog, ctx);
- preempt_enable();
+ result = bpf_prog_run_pin_on_cpu(prog, ctx);
flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 45b6a59ac124..e951b743bed3 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -167,6 +167,34 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
}
EXPORT_SYMBOL(flow_rule_match_enc_opts);
+/* Allocate a flow action cookie holding a copy of the @len bytes at @data.
+ * The allocation uses @gfp. Returns NULL if the allocation fails; release
+ * the result with flow_action_cookie_destroy().
+ */
+struct flow_action_cookie *flow_action_cookie_create(void *data,
+						     unsigned int len,
+						     gfp_t gfp)
+{
+	struct flow_action_cookie *fa_cookie;
+
+	fa_cookie = kmalloc(sizeof(*fa_cookie) + len, gfp);
+	if (fa_cookie) {
+		fa_cookie->cookie_len = len;
+		memcpy(fa_cookie->cookie, data, len);
+	}
+
+	return fa_cookie;
+}
+EXPORT_SYMBOL(flow_action_cookie_create);
+
+/* Free @cookie with kfree(). */
+void flow_action_cookie_destroy(struct flow_action_cookie *cookie)
+{
+ kfree(cookie);
+}
+EXPORT_SYMBOL(flow_action_cookie_destroy);
+
+/* Fill @out with @rule's match for FLOW_DISSECTOR_KEY_CT, via the
+ * FLOW_DISSECTOR_MATCH() macro.
+ */
+void flow_rule_match_ct(const struct flow_rule *rule,
+ struct flow_match_ct *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ct);
+
struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb,
void *cb_ident, void *cb_priv,
void (*release)(void *cb_priv))
@@ -483,7 +511,8 @@ EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
void flow_indr_block_call(struct net_device *dev,
struct flow_block_offload *bo,
- enum flow_block_command command)
+ enum flow_block_command command,
+ enum tc_setup_type type)
{
struct flow_indr_block_cb *indr_block_cb;
struct flow_indr_block_dev *indr_dev;
@@ -493,8 +522,7 @@ void flow_indr_block_call(struct net_device *dev,
return;
list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- bo);
+ indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo);
}
EXPORT_SYMBOL_GPL(flow_indr_block_call);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 99a6de52b21d..7d3438215f32 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -367,7 +367,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
};
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 2f9c0de533c7..8ec7d13d2860 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "BPF";
case LWTUNNEL_ENCAP_SEG6_LOCAL:
return "SEG6LOCAL";
+ case LWTUNNEL_ENCAP_RPL:
+ return "RPL";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -98,7 +100,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
}
EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
struct nlattr *encap, unsigned int family,
const void *cfg, struct lwtunnel_state **lws,
struct netlink_ext_ack *extack)
@@ -122,7 +124,7 @@ int lwtunnel_build_state(u16 encap_type,
rcu_read_unlock();
if (found) {
- ret = ops->build_state(encap, family, cfg, lws, extack);
+ ret = ops->build_state(net, encap, family, cfg, lws, extack);
if (ret)
module_put(ops->owner);
} else {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 789a73aa7bd8..5bf8d22a47ec 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3553,9 +3553,6 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
-#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
- NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
-
#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 4c826b8bf9b1..cf0215734ceb 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -944,6 +944,24 @@ err:
kobject_put(kobj);
return error;
}
+
+/* Change the sysfs owner of RX queue @index of @dev to @kuid/@kgid. This
+ * covers the queue kobject and, when the device has one, its
+ * sysfs_rx_queue_group.
+ */
+static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid,
+				 kgid_t kgid)
+{
+	struct kobject *kobj = &dev->_rx[index].kobj;
+	int error = sysfs_change_owner(kobj, kuid, kgid);
+
+	if (error || !dev->sysfs_rx_queue_group)
+		return error;
+
+	return sysfs_group_change_owner(kobj, dev->sysfs_rx_queue_group,
+					kuid, kgid);
+}
#endif /* CONFIG_SYSFS */
int
@@ -981,6 +999,29 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
#endif
}
+/* Change the sysfs owner of the first @num RX queues of @dev, stopping at
+ * the first error. Does nothing without CONFIG_SYSFS. Without CONFIG_RPS
+ * it also does nothing when the device has no sysfs_rx_queue_group.
+ */
+static int net_rx_queue_change_owner(struct net_device *dev, int num,
+				     kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+	int i;
+
+#ifndef CONFIG_RPS
+	if (!dev->sysfs_rx_queue_group)
+		return 0;
+#endif
+	for (i = 0; i < num; i++) {
+		int error = rx_queue_change_owner(dev, i, kuid, kgid);
+
+		if (error)
+			return error;
+	}
+#endif
+	return 0;
+}
+
#ifdef CONFIG_SYSFS
/*
* netdev_queue sysfs structures and functions.
@@ -1486,6 +1527,23 @@ err:
kobject_put(kobj);
return error;
}
+
+/* Change the sysfs owner of TX queue @index of @ndev to @kuid/@kgid. With
+ * CONFIG_BQL the queue's dql_group is updated as well.
+ */
+static int tx_queue_change_owner(struct net_device *ndev, int index,
+				 kuid_t kuid, kgid_t kgid)
+{
+	struct kobject *kobj = &ndev->_tx[index].kobj;
+	int error = sysfs_change_owner(kobj, kuid, kgid);
+
+#ifdef CONFIG_BQL
+	if (!error)
+		error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
+#endif
+	return error;
+}
#endif /* CONFIG_SYSFS */
int
@@ -1520,6 +1578,25 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
#endif /* CONFIG_SYSFS */
}
+/* Change the sysfs owner of the first @num TX queues of @dev, stopping at
+ * the first error. Does nothing without CONFIG_SYSFS.
+ */
+static int net_tx_queue_change_owner(struct net_device *dev, int num,
+				     kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+	int i;
+
+	for (i = 0; i < num; i++) {
+		int error = tx_queue_change_owner(dev, i, kuid, kgid);
+
+		if (error)
+			return error;
+	}
+#endif /* CONFIG_SYSFS */
+	return 0;
+}
+
static int register_queue_kobjects(struct net_device *dev)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
@@ -1554,6 +1631,31 @@ error:
return error;
}
+/* Change the sysfs owner of @ndev's queue objects: the queues kset (if
+ * present), then the real RX queues, then the real TX queues. Returns the
+ * first error encountered, or 0.
+ */
+static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid)
+{
+	int real_rx = 0;
+	int real_tx;
+	int error;
+
+#ifdef CONFIG_SYSFS
+	if (ndev->queues_kset) {
+		error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid);
+		if (error)
+			return error;
+	}
+	real_rx = ndev->real_num_rx_queues;
+#endif
+	real_tx = ndev->real_num_tx_queues;
+
+	error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid);
+	if (error)
+		return error;
+
+	return net_tx_queue_change_owner(ndev, real_tx, kuid, kgid);
+}
+
static void remove_queue_kobjects(struct net_device *dev)
{
int real_rx = 0, real_tx = 0;
@@ -1767,6 +1869,37 @@ int netdev_register_kobject(struct net_device *ndev)
return error;
}
+/* Re-own the sysfs entries of @ndev when it moves from @net_old to @net_new
+ * and the two network namespaces resolve to different owners.
+ */
+int netdev_change_owner(struct net_device *ndev, const struct net *net_old,
+			const struct net *net_new)
+{
+	kuid_t old_uid, new_uid;
+	kgid_t old_gid, new_gid;
+	int error;
+
+	net_ns_get_ownership(net_old, &old_uid, &old_gid);
+	net_ns_get_ownership(net_new, &new_uid, &new_gid);
+
+	/* Same uid and gid on both sides: the sysfs entries already have the
+	 * right owner, so there is nothing to do.
+	 */
+	if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid))
+		return 0;
+
+	/* Device object first, then its queue objects. */
+	error = device_change_owner(&ndev->dev, new_uid, new_gid);
+	if (!error)
+		error = queue_change_owner(ndev, new_uid, new_gid);
+
+	return error;
+}
+
int netdev_class_create_file_ns(const struct class_attribute *class_attr,
const void *ns)
{
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 006876c7b78d..8a5b04c2699a 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -8,5 +8,7 @@ void netdev_unregister_kobject(struct net_device *);
int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
int netdev_queue_update_kobjects(struct net_device *net,
int old_num, int new_num);
+int netdev_change_owner(struct net_device *, const struct net *net_old,
+ const struct net *net_new);
#endif
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 757cc1d084e7..190ca66a383b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+static atomic64_t cookie_gen;
+
+/* Return the non-zero cookie stored in @net->net_cookie, generating one
+ * from the global cookie_gen counter if the field still reads as 0.
+ */
+u64 net_gen_cookie(struct net *net)
+{
+	u64 cookie;
+
+	for (;;) {
+		cookie = atomic64_read(&net->net_cookie);
+		if (cookie)
+			return cookie;
+
+		/* Nothing stored yet: draw a candidate and try to swap it in
+		 * for the 0. The cmpxchg result is deliberately ignored; the
+		 * next pass re-reads the field and returns whatever is there.
+		 */
+		cookie = atomic64_inc_return(&cookie_gen);
+		atomic64_cmpxchg(&net->net_cookie, 0, cookie);
+	}
+}
+
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
@@ -1087,6 +1101,7 @@ static int __init net_ns_init(void)
panic("Could not allocate generic netns");
rcu_assign_pointer(init_net.gen, ng);
+ net_gen_cookie(&init_net);
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 0642f91c4038..b4c87fe31be2 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -53,30 +53,60 @@ static void cgrp_css_free(struct cgroup_subsys_state *css)
kfree(css_cls_state(css));
}
+/*
+ * To avoid freezing socket creation for tasks with a large number of threads
+ * and open sockets, let's release file_lock every 1000 iterated descriptors.
+ * New sockets will already have been created with the new classid.
+ */
+
+struct update_classid_context {
+ u32 classid; /* classid written to each socket by update_classid_sock() */
+ unsigned int batch; /* countdown of descriptors; reset to UPDATE_CLASSID_BATCH at 0 */
+};
+
+#define UPDATE_CLASSID_BATCH 1000
+
static int update_classid_sock(const void *v, struct file *file, unsigned n)
{
int err;
+ struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file, &err);
if (sock) {
spin_lock(&cgroup_sk_update_lock);
- sock_cgroup_set_classid(&sock->sk->sk_cgrp_data,
- (unsigned long)v);
+ sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
spin_unlock(&cgroup_sk_update_lock);
}
+ if (--ctx->batch == 0) {
+ ctx->batch = UPDATE_CLASSID_BATCH;
+ return n + 1;
+ }
return 0;
}
+/* Apply @classid to the sockets found among the open files of task @p.
+ *
+ * The descriptor table is walked in passes under task_lock(). A pass ends
+ * when iterate_fd() returns; its return value becomes the starting
+ * descriptor of the next pass, and 0 ends the walk. cond_resched() runs
+ * between passes, with the task lock dropped.
+ */
+static void update_classid_task(struct task_struct *p, u32 classid)
+{
+	struct update_classid_context ctx = {
+		.classid = classid,
+		.batch = UPDATE_CLASSID_BATCH
+	};
+	unsigned int next_fd = 0;
+
+	for (;;) {
+		task_lock(p);
+		next_fd = iterate_fd(p->files, next_fd, update_classid_sock,
+				     &ctx);
+		task_unlock(p);
+		cond_resched();
+
+		if (!next_fd)
+			break;
+	}
+}
+
static void cgrp_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct task_struct *p;
cgroup_taskset_for_each(p, css, tset) {
- task_lock(p);
- iterate_fd(p->files, 0, update_classid_sock,
- (void *)(unsigned long)css_cls_state(css)->classid);
- task_unlock(p);
+ update_classid_task(p, css_cls_state(css)->classid);
}
}
@@ -98,10 +128,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
css_task_iter_start(css, 0, &it);
while ((p = css_task_iter_next(&it))) {
- task_lock(p);
- iterate_fd(p->files, 0, update_classid_sock,
- (void *)(unsigned long)cs->classid);
- task_unlock(p);
+ update_classid_task(p, cs->classid);
cond_resched();
}
css_task_iter_end(&it);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 9b7cbe35df37..ef98372facf6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -43,9 +43,11 @@ static int page_pool_init(struct page_pool *pool,
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
* which is the XDP_TX use-case.
*/
- if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
- (pool->p.dma_dir != DMA_BIDIRECTIONAL))
- return -EINVAL;
+ if (pool->p.flags & PP_FLAG_DMA_MAP) {
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+ }
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
/* In order to request DMA-sync-for-device the page
@@ -96,11 +98,10 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
}
EXPORT_SYMBOL(page_pool_create);
-static void __page_pool_return_page(struct page_pool *pool, struct page *page);
+static void page_pool_return_page(struct page_pool *pool, struct page *page);
noinline
-static struct page *page_pool_refill_alloc_cache(struct page_pool *pool,
- bool refill)
+static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
struct ptr_ring *r = &pool->ring;
struct page *page;
@@ -137,12 +138,11 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool,
* (2) break out to fallthrough to alloc_pages_node.
* This limit stress on page buddy alloactor.
*/
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
page = NULL;
break;
}
- } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL &&
- refill);
+ } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
/* Return last page */
if (likely(pool->alloc.count > 0))
@@ -155,20 +155,16 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool,
/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
- bool refill = false;
struct page *page;
- /* Test for safe-context, caller should provide this guarantee */
- if (likely(in_serving_softirq())) {
- if (likely(pool->alloc.count)) {
- /* Fast-path */
- page = pool->alloc.cache[--pool->alloc.count];
- return page;
- }
- refill = true;
+ /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
+ if (likely(pool->alloc.count)) {
+ /* Fast-path */
+ page = pool->alloc.cache[--pool->alloc.count];
+ } else {
+ page = page_pool_refill_alloc_cache(pool);
}
- page = page_pool_refill_alloc_cache(pool, refill);
return page;
}
@@ -280,18 +276,25 @@ static s32 page_pool_inflight(struct page_pool *pool)
return inflight;
}
-/* Cleanup page_pool state from page */
-static void __page_pool_clean_page(struct page_pool *pool,
- struct page *page)
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_release_page(struct page_pool *pool, struct page *page)
{
dma_addr_t dma;
int count;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ /* Always account for inflight pages, even if we didn't
+ * map them
+ */
goto skip_dma_unmap;
dma = page->dma_addr;
- /* DMA unmap */
+
+ /* When page is unmapped, it cannot be returned to our pool */
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
@@ -303,21 +306,12 @@ skip_dma_unmap:
count = atomic_inc_return(&pool->pages_state_release_cnt);
trace_page_pool_state_release(pool, page, count);
}
-
-/* unmap the page and clean our state */
-void page_pool_unmap_page(struct page_pool *pool, struct page *page)
-{
- /* When page is unmapped, this implies page will not be
- * returned to page_pool.
- */
- __page_pool_clean_page(pool, page);
-}
-EXPORT_SYMBOL(page_pool_unmap_page);
+EXPORT_SYMBOL(page_pool_release_page);
/* Return a page to the page allocator, cleaning up our state */
-static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
- __page_pool_clean_page(pool, page);
+ page_pool_release_page(pool, page);
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
@@ -326,8 +320,7 @@ static void __page_pool_return_page(struct page_pool *pool, struct page *page)
*/
}
-static bool __page_pool_recycle_into_ring(struct page_pool *pool,
- struct page *page)
+static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
int ret;
/* BH protection not needed if current is serving softirq */
@@ -344,7 +337,7 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
*
* Caller must provide appropriate safe context.
*/
-static bool __page_pool_recycle_direct(struct page *page,
+static bool page_pool_recycle_in_cache(struct page *page,
struct page_pool *pool)
{
if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
@@ -363,8 +356,14 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page)
return !page_is_pfmemalloc(page);
}
-void __page_pool_put_page(struct page_pool *pool, struct page *page,
- unsigned int dma_sync_size, bool allow_direct)
+/* If the page refcnt == 1, this will try to recycle the page.
+ * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
+ * the configured size min(dma_sync_size, pool->max_len).
+ * If the page refcnt != 1, then the page will be returned to memory
+ * subsystem.
+ */
+void page_pool_put_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
{
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
@@ -381,12 +380,12 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page,
dma_sync_size);
if (allow_direct && in_serving_softirq())
- if (__page_pool_recycle_direct(page, pool))
+ if (page_pool_recycle_in_cache(page, pool))
return;
- if (!__page_pool_recycle_into_ring(pool, page)) {
+ if (!page_pool_recycle_in_ring(pool, page)) {
/* Cache full, fallback to free pages */
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
return;
}
@@ -403,12 +402,13 @@ void __page_pool_put_page(struct page_pool *pool, struct page *page,
* doing refcnt based recycle tricks, meaning another process
* will be invoking put_page.
*/
- __page_pool_clean_page(pool, page);
+ /* Do not replace this with page_pool_return_page() */
+ page_pool_release_page(pool, page);
put_page(page);
}
-EXPORT_SYMBOL(__page_pool_put_page);
+EXPORT_SYMBOL(page_pool_put_page);
-static void __page_pool_empty_ring(struct page_pool *pool)
+static void page_pool_empty_ring(struct page_pool *pool)
{
struct page *page;
@@ -419,7 +419,7 @@ static void __page_pool_empty_ring(struct page_pool *pool)
pr_crit("%s() page_pool refcnt %d violation\n",
__func__, page_ref_count(page));
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
@@ -449,7 +449,7 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
*/
while (pool->alloc.count) {
page = pool->alloc.cache[--pool->alloc.count];
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
@@ -461,7 +461,7 @@ static void page_pool_scrub(struct page_pool *pool)
/* No more consumers should exist, but producers could still
* be in-flight.
*/
- __page_pool_empty_ring(pool);
+ page_pool_empty_ring(pool);
}
static int page_pool_release(struct page_pool *pool)
@@ -535,7 +535,7 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
/* Flush pool alloc cache, as refill will check NUMA node */
while (pool->alloc.count) {
page = pool->alloc.cache[--pool->alloc.count];
- __page_pool_return_page(pool, page);
+ page_pool_return_page(pool, page);
}
}
EXPORT_SYMBOL(page_pool_update_nid);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index acc849df60b5..08e2811b5274 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2003,8 +2003,8 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
return -ENODEV;
}
- if (odev->type != ARPHRD_ETHER) {
- pr_err("not an ethernet device: \"%s\"\n", ifname);
+ if (odev->type != ARPHRD_ETHER && odev->type != ARPHRD_LOOPBACK) {
+ pr_err("not an ethernet or loopback device: \"%s\"\n", ifname);
err = -EINVAL;
} else if (!netif_running(odev)) {
pr_err("device is down: \"%s\"\n", ifname);
@@ -3362,7 +3362,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* skb was 'freed' by stack, so clean few
* bits and reuse it
*/
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 09c44bf2e1d2..709ebbf8ab5b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1872,7 +1872,9 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
};
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD },
[IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
[IFLA_XDP_FLAGS] = { .type = NLA_U32 },
[IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
@@ -2799,8 +2801,20 @@ static int do_setlink(const struct sk_buff *skb,
}
if (xdp[IFLA_XDP_FD]) {
+ int expected_fd = -1;
+
+ if (xdp_flags & XDP_FLAGS_REPLACE) {
+ if (!xdp[IFLA_XDP_EXPECTED_FD]) {
+ err = -EINVAL;
+ goto errout;
+ }
+ expected_fd =
+ nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]);
+ }
+
err = dev_change_xdp_fd(dev, extack,
nla_get_s32(xdp[IFLA_XDP_FD]),
+ expected_fd,
xdp_flags);
if (err)
goto errout;
@@ -3504,27 +3518,25 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
if (err)
return err;
- alt_ifname = nla_data(attr);
+ alt_ifname = nla_strdup(attr, GFP_KERNEL);
+ if (!alt_ifname)
+ return -ENOMEM;
+
if (cmd == RTM_NEWLINKPROP) {
- alt_ifname = kstrdup(alt_ifname, GFP_KERNEL);
- if (!alt_ifname)
- return -ENOMEM;
err = netdev_name_node_alt_create(dev, alt_ifname);
- if (err) {
- kfree(alt_ifname);
- return err;
- }
+ if (!err)
+ alt_ifname = NULL;
} else if (cmd == RTM_DELLINKPROP) {
err = netdev_name_node_alt_destroy(dev, alt_ifname);
- if (err)
- return err;
} else {
- WARN_ON(1);
- return 0;
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
}
- *changed = true;
- return 0;
+ kfree(alt_ifname);
+ if (!err)
+ *changed = true;
+ return err;
}
static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -3911,7 +3923,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
@@ -4022,7 +4034,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
@@ -4248,13 +4260,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (!br_idx) { /* user did not specify a specific bridge */
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (netif_is_bridge_port(dev)) {
br_dev = netdev_master_upper_dev_get(dev);
cops = br_dev->netdev_ops;
}
} else {
if (dev != br_dev &&
- !(dev->priv_flags & IFF_BRIDGE_PORT))
+ !netif_is_bridge_port(dev))
continue;
if (br_dev != netdev_master_upper_dev_get(dev) &&
@@ -4266,7 +4278,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (idx < s_idx)
goto cont;
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (netif_is_bridge_port(dev)) {
if (cops && cops->ndo_fdb_dump) {
err = cops->ndo_fdb_dump(skb, cb,
br_dev, dev,
@@ -4416,7 +4428,7 @@ static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (dev) {
if (!ndm_flags || (ndm_flags & NTF_MASTER)) {
- if (!(dev->priv_flags & IFF_BRIDGE_PORT)) {
+ if (!netif_is_bridge_port(dev)) {
NL_SET_ERR_MSG(extack, "Device is not a bridge port");
return -EINVAL;
}
@@ -4555,7 +4567,11 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
brport_nla_put_flag(skb, flags, mask,
IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
brport_nla_put_flag(skb, flags, mask,
- IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) {
nla_nest_cancel(skb, protinfo);
goto nla_put_failure;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 864cb9e9622f..7e29590482ce 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -467,7 +467,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
return NULL;
}
- /* use OR instead of assignment to avoid clearing of bits in mask */
if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -527,7 +526,6 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
return NULL;
}
- /* use OR instead of assignment to avoid clearing of bits in mask */
if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -3670,6 +3668,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
skb_push(nskb, -skb_network_offset(nskb) + offset);
+ skb_release_head_state(nskb);
__copy_skb_header(nskb, skb);
skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
@@ -3928,14 +3927,21 @@ normal:
goto perform_csum_check;
if (!sg) {
- if (!nskb->remcsum_offload)
- nskb->ip_summed = CHECKSUM_NONE;
- SKB_GSO_CB(nskb)->csum =
- skb_copy_and_csum_bits(head_skb, offset,
- skb_put(nskb, len),
- len, 0);
- SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ if (!csum) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb,
+ len),
+ len, 0);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ } else {
+ skb_copy_bits(head_skb, offset,
+ skb_put(nskb, len),
+ len);
+ }
continue;
}
@@ -4805,9 +4811,9 @@ static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
typeof(IPPROTO_IP) proto,
unsigned int off)
{
- switch (proto) {
- int err;
+ int err;
+ switch (proto) {
case IPPROTO_TCP:
err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
off + MAX_TCP_HDR_LEN);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index ded2d5227678..c479372f2cd2 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -512,7 +512,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
refcount_set(&psock->refcnt, 1);
- rcu_assign_sk_user_data(sk, psock);
+ rcu_assign_sk_user_data_nocopy(sk, psock);
sock_hold(sk);
return psock;
@@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct bpf_prog *prog;
int ret;
- preempt_disable();
rcu_read_lock();
prog = READ_ONCE(psock->progs.msg_parser);
if (unlikely(!prog)) {
@@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
sk_msg_compute_data_pointers(msg);
msg->sk = sk;
- ret = BPF_PROG_RUN(prog, msg);
+ ret = bpf_prog_run_pin_on_cpu(prog, msg);
ret = sk_psock_map_verd(ret, msg->sk_redir);
psock->apply_bytes = msg->apply_bytes;
if (ret == __SK_REDIRECT) {
@@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
}
out:
rcu_read_unlock();
- preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
@@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
skb->sk = psock->sk;
bpf_compute_data_end_sk_skb(skb);
- preempt_disable();
- ret = BPF_PROG_RUN(prog, skb);
- preempt_enable();
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
/* strparser clones the skb before handing it to a upper layer,
* meaning skb_orphan has been called. We NULL sk on the way out
* to ensure we don't trigger a BUG_ON() in skb/sk operations
diff --git a/net/core/sock.c b/net/core/sock.c
index a4c8fac781ff..da32d9b6d09f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1572,13 +1572,14 @@ static inline void sock_lock_init(struct sock *sk)
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
+ const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
- osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+ prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
@@ -1792,16 +1793,17 @@ static void sk_init_common(struct sock *sk)
*/
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
+ struct proto *prot = READ_ONCE(sk->sk_prot);
struct sock *newsk;
bool is_charged = true;
- newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
+ newsk = sk_prot_alloc(prot, priority, sk->sk_family);
if (newsk != NULL) {
struct sk_filter *filter;
sock_copy(newsk, sk);
- newsk->sk_prot_creator = sk->sk_prot;
+ newsk->sk_prot_creator = prot;
/* SANITY */
if (likely(newsk->sk_net_refcnt))
@@ -1830,7 +1832,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE);
- mem_cgroup_sk_alloc(newsk);
+
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
+
cgroup_sk_alloc(&newsk->sk_cgrp_data);
rcu_read_lock();
@@ -1863,6 +1868,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
goto out;
}
+ /* Clear sk_user_data if parent had the pointer tagged
+ * as not suitable for copying when cloning.
+ */
+ if (sk_user_data_is_nocopy(newsk))
+ RCU_INIT_POINTER(newsk->sk_user_data, NULL);
+
newsk->sk_err = 0;
newsk->sk_err_soft = 0;
newsk->sk_priority = 0;
@@ -2060,6 +2071,18 @@ void sock_efree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_efree);
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	/* Only sockets reported as refcounted have a reference to drop. */
+	if (!sk_is_refcounted(sk))
+		return;
+
+	sock_gen_put(sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
+
kuid_t sock_i_uid(struct sock *sk)
{
kuid_t uid;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 085cef5857bb..b08dfae10f88 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -10,6 +10,8 @@
#include <linux/skmsg.h>
#include <linux/list.h>
#include <linux/jhash.h>
+#include <linux/sock_diag.h>
+#include <net/udp.h>
struct bpf_stab {
struct bpf_map map;
@@ -31,7 +33,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
return ERR_PTR(-EPERM);
if (attr->max_entries == 0 ||
attr->key_size != 4 ||
- attr->value_size != 4 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
@@ -139,12 +142,58 @@ static void sock_map_unref(struct sock *sk, void *link_raw)
}
}
+/* Pick the BPF-aware proto for @sk based on its socket type and install it
+ * via sk_psock_update_proto().
+ *
+ * Returns 0 on success, -EINVAL for a socket type other than SOCK_STREAM or
+ * SOCK_DGRAM, or the error encoded in the pointer returned by the
+ * tcp/udp *_bpf_get_proto() helper.
+ */
+static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
+{
+	struct proto *ops;
+
+	sock_owned_by_me(sk);
+
+	switch (sk->sk_type) {
+	case SOCK_STREAM:
+		ops = tcp_bpf_get_proto(sk, psock);
+		break;
+	case SOCK_DGRAM:
+		ops = udp_bpf_get_proto(sk, psock);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (!IS_ERR(ops)) {
+		sk_psock_update_proto(sk, psock, ops);
+		return 0;
+	}
+
+	return PTR_ERR(ops);
+}
+
+/* Look up the psock attached to @sk and take a reference on it.
+ *
+ * Returns NULL when @sk has no psock, ERR_PTR(-EBUSY) when a psock exists
+ * but @sk's close handler is not sock_map_close or the refcount could not
+ * be raised from zero, and the referenced psock otherwise.
+ */
+static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (psock) {
+		/* The refcount is only touched once the close handler
+		 * matches; either failure yields -EBUSY.
+		 */
+		if (sk->sk_prot->close != sock_map_close ||
+		    !refcount_inc_not_zero(&psock->refcnt))
+			psock = ERR_PTR(-EBUSY);
+	}
+	rcu_read_unlock();
+
+	return psock;
+}
+
static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
struct sock *sk)
{
struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
- bool skb_progs, sk_psock_is_new = false;
struct sk_psock *psock;
+ bool skb_progs;
int ret;
skb_verdict = READ_ONCE(progs->skb_verdict);
@@ -170,7 +219,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
}
}
- psock = sk_psock_get_checked(sk);
+ psock = sock_map_psock_get_checked(sk);
if (IS_ERR(psock)) {
ret = PTR_ERR(psock);
goto out_progs;
@@ -189,18 +238,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
ret = -ENOMEM;
goto out_progs;
}
- sk_psock_is_new = true;
}
if (msg_parser)
psock_set_prog(&psock->progs.msg_parser, msg_parser);
- if (sk_psock_is_new) {
- ret = tcp_bpf_init(sk);
- if (ret < 0)
- goto out_drop;
- } else {
- tcp_bpf_reinit(sk);
- }
+
+ ret = sock_map_init_proto(sk, psock);
+ if (ret < 0)
+ goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
if (skb_progs && !psock->parser.enabled) {
@@ -228,13 +273,37 @@ out:
return ret;
}
+/* Attach @sk to @map without touching any parser/verdict programs: obtain
+ * (or create) the socket's psock and install the proto callbacks.
+ *
+ * Returns the result of sock_map_init_proto(), the error from
+ * sock_map_psock_get_checked(), or -ENOMEM if no psock could be created.
+ */
+static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk)
+{
+	struct sk_psock *psock = sock_map_psock_get_checked(sk);
+	int err;
+
+	if (IS_ERR(psock))
+		return PTR_ERR(psock);
+
+	if (!psock)
+		psock = sk_psock_init(sk, map->numa_node);
+	if (!psock)
+		return -ENOMEM;
+
+	err = sock_map_init_proto(sk, psock);
+	if (err >= 0)
+		return err;
+
+	/* Proto setup failed: release the psock reference held above. */
+	sk_psock_put(sk, psock);
+	return err;
+}
+
static void sock_map_free(struct bpf_map *map)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
int i;
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
synchronize_rcu();
- raw_spin_lock_bh(&stab->lock);
for (i = 0; i < stab->map.max_entries; i++) {
struct sock **psk = &stab->sks[i];
struct sock *sk;
@@ -248,7 +317,6 @@ static void sock_map_free(struct bpf_map *map)
release_sock(sk);
}
}
- raw_spin_unlock_bh(&stab->lock);
/* wait for psock readers accessing its map link */
synchronize_rcu();
@@ -275,7 +343,22 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
- return ERR_PTR(-EOPNOTSUPP);
+ return __sock_map_lookup_elem(map, *(u32 *)key);
+}
+
+/* Lookup handler wired up as .map_lookup_elem_sys_only: returns a pointer
+ * to the socket cookie of the entry at index *@key.
+ *
+ * Returns ERR_PTR(-ENOSPC) unless the map's value size is sizeof(u64), and
+ * ERR_PTR(-ENOENT) when the slot holds no socket.
+ */
+static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
+{
+	struct sock *sk;
+
+	if (map->value_size != sizeof(u64))
+		return ERR_PTR(-ENOSPC);
+
+	sk = __sock_map_lookup_elem(map, *(u32 *)key);
+	if (sk) {
+		/* presumably populates sk_cookie on first use - confirm */
+		sock_gen_cookie(sk);
+		return &sk->sk_cookie;
+	}
+
+	return ERR_PTR(-ENOENT);
}
static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
@@ -334,11 +417,15 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
return 0;
}
+/* Report whether @sk may be used for redirection: true for every socket
+ * whose sk_state is not TCP_LISTEN.
+ */
+static bool sock_map_redirect_allowed(const struct sock *sk)
+{
+	if (sk->sk_state == TCP_LISTEN)
+		return false;
+
+	return true;
+}
+
static int sock_map_update_common(struct bpf_map *map, u32 idx,
struct sock *sk, u64 flags)
{
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
- struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_psock_link *link;
struct sk_psock *psock;
struct sock *osk;
@@ -349,14 +436,21 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
return -EINVAL;
if (unlikely(idx >= map->max_entries))
return -E2BIG;
- if (unlikely(rcu_access_pointer(icsk->icsk_ulp_data)))
+ if (inet_csk_has_ulp(sk))
return -EINVAL;
link = sk_psock_init_link();
if (!link)
return -ENOMEM;
- ret = sock_map_link(map, &stab->progs, sk);
+ /* Only sockets we can redirect into/from in BPF need to hold
+ * refs to parser/verdict progs and have their sk_data_ready
+ * and sk_write_space callbacks overridden.
+ */
+ if (sock_map_redirect_allowed(sk))
+ ret = sock_map_link(map, &stab->progs, sk);
+ else
+ ret = sock_map_link_no_progs(map, sk);
if (ret < 0)
goto out_free;
@@ -391,23 +485,52 @@ out_free:
static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
{
return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
- ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
+ ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
+ ops->op == BPF_SOCK_OPS_TCP_LISTEN_CB;
}
-static bool sock_map_sk_is_suitable(const struct sock *sk)
+static bool sk_is_tcp(const struct sock *sk)
{
return sk->sk_type == SOCK_STREAM &&
sk->sk_protocol == IPPROTO_TCP;
}
+/* True when @sk is a SOCK_DGRAM socket using IPPROTO_UDP. */
+static bool sk_is_udp(const struct sock *sk)
+{
+	if (sk->sk_type != SOCK_DGRAM)
+		return false;
+
+	return sk->sk_protocol == IPPROTO_UDP;
+}
+
+/* A socket is suitable when it is either TCP or UDP. */
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+	if (sk_is_tcp(sk))
+		return true;
+
+	return sk_is_udp(sk);
+}
+
+/* Check whether @sk is in a state in which it may be added to a map:
+ * TCP sockets must be ESTABLISHED or LISTEN, UDP sockets must be hashed.
+ * Any other socket is rejected.
+ */
+static bool sock_map_sk_state_allowed(const struct sock *sk)
+{
+	if (sk_is_tcp(sk)) {
+		const int allowed = TCPF_ESTABLISHED | TCPF_LISTEN;
+
+		return (1 << sk->sk_state) & allowed;
+	}
+
+	if (sk_is_udp(sk))
+		return sk_hashed(sk);
+
+	return false;
+}
+
static int sock_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
- u32 ufd = *(u32 *)value;
u32 idx = *(u32 *)key;
struct socket *sock;
struct sock *sk;
int ret;
+ u64 ufd;
+
+ if (map->value_size == sizeof(u64))
+ ufd = *(u64 *)value;
+ else
+ ufd = *(u32 *)value;
+ if (ufd > S32_MAX)
+ return -EINVAL;
sock = sockfd_lookup(ufd, &ret);
if (!sock)
@@ -423,7 +546,7 @@ static int sock_map_update_elem(struct bpf_map *map, void *key,
}
sock_map_sk_acquire(sk);
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
else
ret = sock_map_update_common(map, idx, sk, flags);
@@ -460,13 +583,17 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct bpf_map *, map, u32, key, u64, flags)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = sk;
return SK_PASS;
}
@@ -483,12 +610,17 @@ const struct bpf_func_proto bpf_sk_redirect_map_proto = {
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
struct bpf_map *, map, u32, key, u64, flags)
{
+ struct sock *sk;
+
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->flags = flags;
- msg->sk_redir = __sock_map_lookup_elem(map, key);
- if (!msg->sk_redir)
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
return SK_PASS;
}
@@ -506,6 +638,7 @@ const struct bpf_map_ops sock_map_ops = {
.map_alloc = sock_map_alloc,
.map_free = sock_map_free,
.map_get_next_key = sock_map_get_next_key,
+ .map_lookup_elem_sys_only = sock_map_lookup_sys,
.map_update_elem = sock_map_update_elem,
.map_delete_elem = sock_map_delete_elem,
.map_lookup_elem = sock_map_lookup,
@@ -518,7 +651,7 @@ struct bpf_htab_elem {
u32 hash;
struct sock *sk;
struct hlist_node node;
- u8 key[0];
+ u8 key[];
};
struct bpf_htab_bucket {
@@ -662,7 +795,6 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
struct sock *sk, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct inet_connection_sock *icsk = inet_csk(sk);
u32 key_size = map->key_size, hash;
struct bpf_htab_elem *elem, *elem_new;
struct bpf_htab_bucket *bucket;
@@ -673,14 +805,21 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
WARN_ON_ONCE(!rcu_read_lock_held());
if (unlikely(flags > BPF_EXIST))
return -EINVAL;
- if (unlikely(icsk->icsk_ulp_data))
+ if (inet_csk_has_ulp(sk))
return -EINVAL;
link = sk_psock_init_link();
if (!link)
return -ENOMEM;
- ret = sock_map_link(map, &htab->progs, sk);
+ /* Only sockets we can redirect into/from in BPF need to hold
+ * refs to parser/verdict progs and have their sk_data_ready
+ * and sk_write_space callbacks overridden.
+ */
+ if (sock_map_redirect_allowed(sk))
+ ret = sock_map_link(map, &htab->progs, sk);
+ else
+ ret = sock_map_link_no_progs(map, sk);
if (ret < 0)
goto out_free;
@@ -729,10 +868,17 @@ out_free:
static int sock_hash_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
- u32 ufd = *(u32 *)value;
struct socket *sock;
struct sock *sk;
int ret;
+ u64 ufd;
+
+ if (map->value_size == sizeof(u64))
+ ufd = *(u64 *)value;
+ else
+ ufd = *(u32 *)value;
+ if (ufd > S32_MAX)
+ return -EINVAL;
sock = sockfd_lookup(ufd, &ret);
if (!sock)
@@ -748,7 +894,7 @@ static int sock_hash_update_elem(struct bpf_map *map, void *key,
}
sock_map_sk_acquire(sk);
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (!sock_map_sk_state_allowed(sk))
ret = -EOPNOTSUPP;
else
ret = sock_hash_update_common(map, key, sk, flags);
@@ -808,7 +954,8 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
return ERR_PTR(-EPERM);
if (attr->max_entries == 0 ||
attr->key_size == 0 ||
- attr->value_size != 4 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->key_size > MAX_BPF_STACK)
@@ -863,10 +1010,13 @@ static void sock_hash_free(struct bpf_map *map)
struct hlist_node *node;
int i;
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
synchronize_rcu();
for (i = 0; i < htab->buckets_num; i++) {
bucket = sock_hash_select_bucket(htab, i);
- raw_spin_lock_bh(&bucket->lock);
hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
hlist_del_rcu(&elem->node);
lock_sock(elem->sk);
@@ -875,7 +1025,6 @@ static void sock_hash_free(struct bpf_map *map)
rcu_read_unlock();
release_sock(elem->sk);
}
- raw_spin_unlock_bh(&bucket->lock);
}
/* wait for psock readers accessing its map link */
@@ -885,6 +1034,26 @@ static void sock_hash_free(struct bpf_map *map)
kfree(htab);
}
+static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ sock_gen_cookie(sk);
+ return &sk->sk_cookie;
+}
+
+static void *sock_hash_lookup(struct bpf_map *map, void *key)
+{
+ return __sock_hash_lookup_elem(map, key);
+}
+
static void sock_hash_release_progs(struct bpf_map *map)
{
psock_progs_drop(&container_of(map, struct bpf_htab, map)->progs);
@@ -916,13 +1085,17 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
struct bpf_map *, map, void *, key, u64, flags)
{
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct sock *sk;
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ tcb->bpf.flags = flags;
+ tcb->bpf.sk_redir = sk;
return SK_PASS;
}
@@ -939,12 +1112,17 @@ const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
struct bpf_map *, map, void *, key, u64, flags)
{
+ struct sock *sk;
+
if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
- msg->flags = flags;
- msg->sk_redir = __sock_hash_lookup_elem(map, key);
- if (!msg->sk_redir)
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
return SK_PASS;
}
@@ -964,7 +1142,8 @@ const struct bpf_map_ops sock_hash_ops = {
.map_get_next_key = sock_hash_get_next_key,
.map_update_elem = sock_hash_update_elem,
.map_delete_elem = sock_hash_delete_elem,
- .map_lookup_elem = sock_map_lookup,
+ .map_lookup_elem = sock_hash_lookup,
+ .map_lookup_elem_sys_only = sock_hash_lookup_sys,
.map_release_uref = sock_hash_release_progs,
.map_check_btf = map_check_no_btf,
};
@@ -1008,7 +1187,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
return 0;
}
-void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
+static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link)
{
switch (link->map->map_type) {
case BPF_MAP_TYPE_SOCKMAP:
@@ -1021,3 +1200,54 @@ void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
break;
}
}
+
+static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ while ((link = sk_psock_link_pop(psock))) {
+ sock_map_unlink(sk, link);
+ sk_psock_free_link(link);
+ }
+}
+
+void sock_map_unhash(struct sock *sk)
+{
+ void (*saved_unhash)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ if (sk->sk_prot->unhash)
+ sk->sk_prot->unhash(sk);
+ return;
+ }
+
+ saved_unhash = psock->saved_unhash;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ saved_unhash(sk);
+}
+
+void sock_map_close(struct sock *sk, long timeout)
+{
+ void (*saved_close)(struct sock *sk, long timeout);
+ struct sk_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ release_sock(sk);
+ return sk->sk_prot->close(sk, timeout);
+ }
+
+ saved_close = psock->saved_close;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ release_sock(sk);
+ saved_close(sk, timeout);
+}
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 91e9f2223c39..adcb3aea576d 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -16,27 +16,8 @@
DEFINE_SPINLOCK(reuseport_lock);
-#define REUSEPORT_MIN_ID 1
static DEFINE_IDA(reuseport_ida);
-int reuseport_get_id(struct sock_reuseport *reuse)
-{
- int id;
-
- if (reuse->reuseport_id)
- return reuse->reuseport_id;
-
- id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
- /* Called under reuseport_lock */
- GFP_ATOMIC);
- if (id < 0)
- return id;
-
- reuse->reuseport_id = id;
-
- return reuse->reuseport_id;
-}
-
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
unsigned int size = sizeof(struct sock_reuseport) +
@@ -55,6 +36,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
+ int id, ret = 0;
/* bh lock used since this function call may precede hlist lock in
* soft irq of receive path or setsockopt from process context
@@ -78,10 +60,18 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
- spin_unlock_bh(&reuseport_lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ ret = id;
+ goto out;
+ }
+
+ reuse->reuseport_id = id;
reuse->socks[0] = sk;
reuse->num_socks = 1;
reuse->bind_inany = bind_inany;
@@ -90,7 +80,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
out:
spin_unlock_bh(&reuseport_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(reuseport_alloc);
@@ -134,8 +124,7 @@ static void reuseport_free_rcu(struct rcu_head *head)
reuse = container_of(head, struct sock_reuseport, rcu);
sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
- if (reuse->reuseport_id)
- ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
+ ida_free(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
}
@@ -199,12 +188,15 @@ void reuseport_detach_sock(struct sock *sk)
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
- /* At least one of the sk in this reuseport group is added to
- * a bpf map. Notify the bpf side. The bpf map logic will
- * remove the sk if it is indeed added to a bpf map.
+ /* Notify the bpf side. The sk may be added to a sockarray
+ * map. If so, sockarray logic will remove it from the map.
+ *
+ * Other bpf map types that work with reuseport, like sockmap,
+ * don't need an explicit callback from here. They override sk
+ * unhash/close ops to remove the sk from the map before we
+ * get to this point.
*/
- if (reuse->reuseport_id)
- bpf_sk_reuseport_detach(sk);
+ bpf_sk_reuseport_detach(sk);
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8310714c47fd..4c7ea85486af 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -372,7 +372,7 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
napi_direct &= !xdp_return_frame_no_direct();
- page_pool_put_page(xa->page_pool, page, napi_direct);
+ page_pool_put_full_page(xa->page_pool, page, napi_direct);
rcu_read_unlock();
break;
case MEM_TYPE_PAGE_SHARED:
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 70f88f2b4456..105f3734dadb 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -95,7 +95,7 @@ void ccid_cleanup_builtins(void);
struct ccid {
struct ccid_operations *ccid_ops;
- char ccid_priv[0];
+ char ccid_priv[];
};
static inline void *ccid_priv(const struct ccid *ccid)
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index 73ef73a218ff..8a82c5a2c5a8 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -46,16 +46,15 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
}
static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc);
+ inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r);
}
-static int dccp_diag_dump_one(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+static int dccp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req);
+ return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req);
}
static const struct inet_diag_handler dccp_diag_handler = {
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 25187528c308..c5c74a34d139 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -216,6 +216,7 @@ EXPORT_SYMBOL_GPL(dccp_check_req);
*/
int dccp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb)
+ __releases(child)
{
int ret = 0;
const int state = child->sk_state;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 08c3dc45f1a4..06b9983325cc 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1173,7 +1173,7 @@ make_route:
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
- rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, 0);
if (rt == NULL)
goto e_nobufs;
@@ -1439,7 +1439,7 @@ static int dn_route_input_slow(struct sk_buff *skb)
}
make_route:
- rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST);
+ rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, 0);
if (rt == NULL)
goto e_nobufs;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 17281fec710c..ee2610c4d46a 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -88,13 +88,9 @@ const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol)
{
struct dsa_tag_driver *dsa_tag_driver;
const struct dsa_device_ops *ops;
- char module_name[128];
bool found = false;
- snprintf(module_name, 127, "%s%d", DSA_TAG_DRIVER_ALIAS,
- tag_protocol);
-
- request_module(module_name);
+ request_module("%s%d", DSA_TAG_DRIVER_ALIAS, tag_protocol);
mutex_lock(&dsa_tag_drivers_lock);
list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) {
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index e7c30b472034..9a271a58a41d 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -18,8 +18,8 @@
#include "dsa_priv.h"
-static LIST_HEAD(dsa_tree_list);
static DEFINE_MUTEX(dsa2_mutex);
+LIST_HEAD(dsa_tree_list);
static const struct devlink_ops dsa_devlink_ops = {
};
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index a7662e7a691d..904cc7c9b882 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -22,6 +22,7 @@ enum {
DSA_NOTIFIER_MDB_DEL,
DSA_NOTIFIER_VLAN_ADD,
DSA_NOTIFIER_VLAN_DEL,
+ DSA_NOTIFIER_MTU,
};
/* DSA_NOTIFIER_AGEING_TIME */
@@ -61,6 +62,14 @@ struct dsa_notifier_vlan_info {
int port;
};
+/* DSA_NOTIFIER_MTU */
+struct dsa_notifier_mtu_info {
+ bool propagate_upstream;
+ int sw_index;
+ int port;
+ int mtu;
+};
+
struct dsa_slave_priv {
/* Copy of CPU port xmit for faster access in slave transmit hot path */
struct sk_buff * (*xmit)(struct sk_buff *skb,
@@ -117,7 +126,9 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
/* port.c */
int dsa_port_set_state(struct dsa_port *dp, u8 state,
struct switchdev_trans *trans);
+int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy);
int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy);
+void dsa_port_disable_rt(struct dsa_port *dp);
void dsa_port_disable(struct dsa_port *dp);
int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br);
void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
@@ -125,6 +136,8 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
struct switchdev_trans *trans);
int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
struct switchdev_trans *trans);
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu,
+ bool propagate_upstream);
int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
u16 vid);
int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
@@ -181,4 +194,8 @@ dsa_slave_to_master(const struct net_device *dev)
/* switch.c */
int dsa_switch_register_notifier(struct dsa_switch *ds);
void dsa_switch_unregister_notifier(struct dsa_switch *ds);
+
+/* dsa2.c */
+extern struct list_head dsa_tree_list;
+
#endif
diff --git a/net/dsa/master.c b/net/dsa/master.c
index bd44bde272f4..b5c535af63a3 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -314,20 +314,6 @@ static const struct attribute_group dsa_group = {
.attrs = dsa_slave_attrs,
};
-static void dsa_master_set_mtu(struct net_device *dev, struct dsa_port *cpu_dp)
-{
- unsigned int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead;
- int err;
-
- rtnl_lock();
- if (mtu <= dev->max_mtu) {
- err = dev_set_mtu(dev, mtu);
- if (err)
- netdev_dbg(dev, "Unable to set MTU to include for DSA overheads\n");
- }
- rtnl_unlock();
-}
-
static void dsa_master_reset_mtu(struct net_device *dev)
{
int err;
@@ -344,7 +330,12 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
{
int ret;
- dsa_master_set_mtu(dev, cpu_dp);
+ rtnl_lock();
+ ret = dev_set_mtu(dev, ETH_DATA_LEN + cpu_dp->tag_ops->overhead);
+ rtnl_unlock();
+ if (ret)
+ netdev_warn(dev, "error %d setting MTU to include DSA overhead\n",
+ ret);
/* If we use a tagging format that doesn't have an ethertype
* field, make sure that all packets from this point on get
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 774facb8d547..231b2d494f1c 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -63,7 +63,7 @@ static void dsa_port_set_state_now(struct dsa_port *dp, u8 state)
pr_err("DSA: failed to set STP state %u (%d)\n", state, err);
}
-int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy)
+int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy)
{
struct dsa_switch *ds = dp->ds;
int port = dp->index;
@@ -78,14 +78,31 @@ int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy)
if (!dp->bridge_dev)
dsa_port_set_state_now(dp, BR_STATE_FORWARDING);
+ if (dp->pl)
+ phylink_start(dp->pl);
+
return 0;
}
-void dsa_port_disable(struct dsa_port *dp)
+int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy)
+{
+ int err;
+
+ rtnl_lock();
+ err = dsa_port_enable_rt(dp, phy);
+ rtnl_unlock();
+
+ return err;
+}
+
+void dsa_port_disable_rt(struct dsa_port *dp)
{
struct dsa_switch *ds = dp->ds;
int port = dp->index;
+ if (dp->pl)
+ phylink_stop(dp->pl);
+
if (!dp->bridge_dev)
dsa_port_set_state_now(dp, BR_STATE_DISABLED);
@@ -93,6 +110,13 @@ void dsa_port_disable(struct dsa_port *dp)
ds->ops->port_disable(ds, port);
}
+void dsa_port_disable(struct dsa_port *dp)
+{
+ rtnl_lock();
+ dsa_port_disable_rt(dp);
+ rtnl_unlock();
+}
+
int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br)
{
struct dsa_notifier_bridge_info info = {
@@ -273,6 +297,19 @@ int dsa_port_mrouter(struct dsa_port *dp, bool mrouter,
return ds->ops->port_egress_floods(ds, port, true, mrouter);
}
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu,
+ bool propagate_upstream)
+{
+ struct dsa_notifier_mtu_info info = {
+ .sw_index = dp->ds->index,
+ .propagate_upstream = propagate_upstream,
+ .port = dp->index,
+ .mtu = new_mtu,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_MTU, &info);
+}
+
int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
u16 vid)
{
@@ -433,6 +470,7 @@ static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config,
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct dsa_switch *ds = dp->ds;
+ int err;
/* Only called for inband modes */
if (!ds->ops->phylink_mac_link_state) {
@@ -440,8 +478,12 @@ static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config,
return;
}
- if (ds->ops->phylink_mac_link_state(ds, dp->index, state) < 0)
+ err = ds->ops->phylink_mac_link_state(ds, dp->index, state);
+ if (err < 0) {
+ dev_err(ds->dev, "p%d: phylink_mac_link_state() failed: %d\n",
+ dp->index, err);
state->link = 0;
+ }
}
static void dsa_port_phylink_mac_config(struct phylink_config *config,
@@ -489,9 +531,11 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
}
static void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ struct phy_device *phydev,
unsigned int mode,
phy_interface_t interface,
- struct phy_device *phydev)
+ int speed, int duplex,
+ bool tx_pause, bool rx_pause)
{
struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
struct dsa_switch *ds = dp->ds;
@@ -502,7 +546,8 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config,
return;
}
- ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
+ ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev,
+ speed, duplex, tx_pause, rx_pause);
}
const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
@@ -614,10 +659,6 @@ static int dsa_port_phylink_register(struct dsa_port *dp)
goto err_phy_connect;
}
- rtnl_lock();
- phylink_start(dp->pl);
- rtnl_unlock();
-
return 0;
err_phy_connect:
@@ -628,9 +669,14 @@ err_phy_connect:
int dsa_port_link_register_of(struct dsa_port *dp)
{
struct dsa_switch *ds = dp->ds;
+ struct device_node *phy_np;
- if (!ds->ops->adjust_link)
- return dsa_port_phylink_register(dp);
+ if (!ds->ops->adjust_link) {
+ phy_np = of_parse_phandle(dp->dn, "phy-handle", 0);
+ if (of_phy_is_fixed_link(dp->dn) || phy_np)
+ return dsa_port_phylink_register(dp);
+ return 0;
+ }
dev_warn(ds->dev,
"Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n");
@@ -645,11 +691,12 @@ void dsa_port_link_unregister_of(struct dsa_port *dp)
{
struct dsa_switch *ds = dp->ds;
- if (!ds->ops->adjust_link) {
+ if (!ds->ops->adjust_link && dp->pl) {
rtnl_lock();
phylink_disconnect_phy(dp->pl);
rtnl_unlock();
phylink_destroy(dp->pl);
+ dp->pl = NULL;
return;
}
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 088c886e609e..5390ff541658 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -88,12 +88,10 @@ static int dsa_slave_open(struct net_device *dev)
goto clear_allmulti;
}
- err = dsa_port_enable(dp, dev->phydev);
+ err = dsa_port_enable_rt(dp, dev->phydev);
if (err)
goto clear_promisc;
- phylink_start(dp->pl);
-
return 0;
clear_promisc:
@@ -114,9 +112,7 @@ static int dsa_slave_close(struct net_device *dev)
struct net_device *master = dsa_slave_to_master(dev);
struct dsa_port *dp = dsa_slave_to_port(dev);
- phylink_stop(dp->pl);
-
- dsa_port_disable(dp);
+ dsa_port_disable_rt(dp);
dev_mc_unsync(master, dev);
dev_uc_unsync(master, dev);
@@ -846,59 +842,137 @@ dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
return NULL;
}
-static int dsa_slave_add_cls_matchall(struct net_device *dev,
- struct tc_cls_matchall_offload *cls,
- bool ingress)
+static int
+dsa_slave_add_cls_matchall_mirred(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_mall_mirror_tc_entry *mirror;
struct dsa_mall_tc_entry *mall_tc_entry;
- __be16 protocol = cls->common.protocol;
struct dsa_switch *ds = dp->ds;
struct flow_action_entry *act;
struct dsa_port *to_dp;
- int err = -EOPNOTSUPP;
+ int err;
+
+ act = &cls->rule->action.entries[0];
if (!ds->ops->port_mirror_add)
- return err;
+ return -EOPNOTSUPP;
- if (!flow_offload_has_one_action(&cls->rule->action))
- return err;
+ if (!act->dev)
+ return -EINVAL;
+
+ if (!flow_action_basic_hw_stats_check(&cls->rule->action,
+ cls->common.extack))
+ return -EOPNOTSUPP;
act = &cls->rule->action.entries[0];
- if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) {
- struct dsa_mall_mirror_tc_entry *mirror;
+ if (!dsa_slave_dev_check(act->dev))
+ return -EOPNOTSUPP;
- if (!act->dev)
- return -EINVAL;
+ mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+ if (!mall_tc_entry)
+ return -ENOMEM;
- if (!dsa_slave_dev_check(act->dev))
- return -EOPNOTSUPP;
+ mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+ mirror = &mall_tc_entry->mirror;
- mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
- if (!mall_tc_entry)
- return -ENOMEM;
+ to_dp = dsa_slave_to_port(act->dev);
- mall_tc_entry->cookie = cls->cookie;
- mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
- mirror = &mall_tc_entry->mirror;
+ mirror->to_local_port = to_dp->index;
+ mirror->ingress = ingress;
- to_dp = dsa_slave_to_port(act->dev);
+ err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress);
+ if (err) {
+ kfree(mall_tc_entry);
+ return err;
+ }
- mirror->to_local_port = to_dp->index;
- mirror->ingress = ingress;
+ list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
- err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress);
- if (err) {
- kfree(mall_tc_entry);
- return err;
+ return err;
+}
+
+static int
+dsa_slave_add_cls_matchall_police(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
+{
+ struct netlink_ext_ack *extack = cls->common.extack;
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_mall_policer_tc_entry *policer;
+ struct dsa_mall_tc_entry *mall_tc_entry;
+ struct dsa_switch *ds = dp->ds;
+ struct flow_action_entry *act;
+ int err;
+
+ if (!ds->ops->port_policer_add) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Policing offload not implemented\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!ingress) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only supported on ingress qdisc\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!flow_action_basic_hw_stats_check(&cls->rule->action,
+ cls->common.extack))
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) {
+ if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only one port policer allowed\n");
+ return -EEXIST;
}
+ }
- list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+ act = &cls->rule->action.entries[0];
+
+ mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+ if (!mall_tc_entry)
+ return -ENOMEM;
+
+ mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->type = DSA_PORT_MALL_POLICER;
+ policer = &mall_tc_entry->policer;
+ policer->rate_bytes_per_sec = act->police.rate_bytes_ps;
+ policer->burst = act->police.burst;
+
+ err = ds->ops->port_policer_add(ds, dp->index, policer);
+ if (err) {
+ kfree(mall_tc_entry);
+ return err;
}
- return 0;
+ list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+ return err;
+}
+
+static int dsa_slave_add_cls_matchall(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
+{
+ int err = -EOPNOTSUPP;
+
+ if (cls->common.protocol == htons(ETH_P_ALL) &&
+ flow_offload_has_one_action(&cls->rule->action) &&
+ cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED)
+ err = dsa_slave_add_cls_matchall_mirred(dev, cls, ingress);
+ else if (flow_offload_has_one_action(&cls->rule->action) &&
+ cls->rule->action.entries[0].id == FLOW_ACTION_POLICE)
+ err = dsa_slave_add_cls_matchall_police(dev, cls, ingress);
+
+ return err;
}
static void dsa_slave_del_cls_matchall(struct net_device *dev,
@@ -908,9 +982,6 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev,
struct dsa_mall_tc_entry *mall_tc_entry;
struct dsa_switch *ds = dp->ds;
- if (!ds->ops->port_mirror_del)
- return;
-
mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie);
if (!mall_tc_entry)
return;
@@ -919,7 +990,13 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev,
switch (mall_tc_entry->type) {
case DSA_PORT_MALL_MIRROR:
- ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror);
+ if (ds->ops->port_mirror_del)
+ ds->ops->port_mirror_del(ds, dp->index,
+ &mall_tc_entry->mirror);
+ break;
+ case DSA_PORT_MALL_POLICER:
+ if (ds->ops->port_policer_del)
+ ds->ops->port_policer_del(ds, dp->index);
break;
default:
WARN_ON(1);
@@ -946,6 +1023,64 @@ static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev,
}
}
+static int dsa_slave_add_cls_flower(struct net_device *dev,
+ struct flow_cls_offload *cls,
+ bool ingress)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->cls_flower_add)
+ return -EOPNOTSUPP;
+
+ return ds->ops->cls_flower_add(ds, port, cls, ingress);
+}
+
+static int dsa_slave_del_cls_flower(struct net_device *dev,
+ struct flow_cls_offload *cls,
+ bool ingress)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->cls_flower_del)
+ return -EOPNOTSUPP;
+
+ return ds->ops->cls_flower_del(ds, port, cls, ingress);
+}
+
+static int dsa_slave_stats_cls_flower(struct net_device *dev,
+ struct flow_cls_offload *cls,
+ bool ingress)
+{
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+
+ if (!ds->ops->cls_flower_stats)
+ return -EOPNOTSUPP;
+
+ return ds->ops->cls_flower_stats(ds, port, cls, ingress);
+}
+
+static int dsa_slave_setup_tc_cls_flower(struct net_device *dev,
+ struct flow_cls_offload *cls,
+ bool ingress)
+{
+ switch (cls->command) {
+ case FLOW_CLS_REPLACE:
+ return dsa_slave_add_cls_flower(dev, cls, ingress);
+ case FLOW_CLS_DESTROY:
+ return dsa_slave_del_cls_flower(dev, cls, ingress);
+ case FLOW_CLS_STATS:
+ return dsa_slave_stats_cls_flower(dev, cls, ingress);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
static int dsa_slave_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
void *cb_priv, bool ingress)
{
@@ -957,6 +1092,8 @@ static int dsa_slave_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
switch (type) {
case TC_SETUP_CLSMATCHALL:
return dsa_slave_setup_tc_cls_matchall(dev, type_data, ingress);
+ case TC_SETUP_CLSFLOWER:
+ return dsa_slave_setup_tc_cls_flower(dev, type_data, ingress);
default:
return -EOPNOTSUPP;
}
@@ -1158,6 +1295,208 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
return dsa_port_vid_del(dp, vid);
}
+struct dsa_hw_port {
+ struct list_head list;
+ struct net_device *dev;
+ int old_mtu;
+};
+
+static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
+{
+ const struct dsa_hw_port *p;
+ int err;
+
+ list_for_each_entry(p, hw_port_list, list) {
+ if (p->dev->mtu == mtu)
+ continue;
+
+ err = dev_set_mtu(p->dev, mtu);
+ if (err)
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ list_for_each_entry_continue_reverse(p, hw_port_list, list) {
+ if (p->dev->mtu == p->old_mtu)
+ continue;
+
+ if (dev_set_mtu(p->dev, p->old_mtu))
+ netdev_err(p->dev, "Failed to restore MTU\n");
+ }
+
+ return err;
+}
+
+static void dsa_hw_port_list_free(struct list_head *hw_port_list)
+{
+ struct dsa_hw_port *p, *n;
+
+ list_for_each_entry_safe(p, n, hw_port_list, list)
+ kfree(p);
+}
+
+/* Make the hardware datapath to/from @dev limited to a common MTU */
+void dsa_bridge_mtu_normalization(struct dsa_port *dp)
+{
+ struct list_head hw_port_list;
+ struct dsa_switch_tree *dst;
+ int min_mtu = ETH_MAX_MTU;
+ struct dsa_port *other_dp;
+ int err;
+
+ if (!dp->ds->mtu_enforcement_ingress)
+ return;
+
+ if (!dp->bridge_dev)
+ return;
+
+ INIT_LIST_HEAD(&hw_port_list);
+
+ /* Populate the list of ports that are part of the same bridge
+ * as the newly added/modified port
+ */
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ list_for_each_entry(other_dp, &dst->ports, list) {
+ struct dsa_hw_port *hw_port;
+ struct net_device *slave;
+
+ if (other_dp->type != DSA_PORT_TYPE_USER)
+ continue;
+
+ if (other_dp->bridge_dev != dp->bridge_dev)
+ continue;
+
+ if (!other_dp->ds->mtu_enforcement_ingress)
+ continue;
+
+ slave = other_dp->slave;
+
+ if (min_mtu > slave->mtu)
+ min_mtu = slave->mtu;
+
+ hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
+ if (!hw_port)
+ goto out;
+
+ hw_port->dev = slave;
+ hw_port->old_mtu = slave->mtu;
+
+ list_add(&hw_port->list, &hw_port_list);
+ }
+ }
+
+ /* Attempt to configure the entire hardware bridge to the newly added
+ * interface's MTU first, regardless of whether the intention of the
+ * user was to raise or lower it.
+ */
+ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->slave->mtu);
+ if (!err)
+ goto out;
+
+ /* Clearly that didn't work out so well, so just set the minimum MTU on
+ * all hardware bridge ports now. If this fails too, then all ports will
+ * still have their old MTU rolled back anyway.
+ */
+ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);
+
+out:
+ dsa_hw_port_list_free(&hw_port_list);
+}
+
+static int dsa_slave_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct net_device *master = dsa_slave_to_master(dev);
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->dp->ds;
+ struct dsa_port *cpu_dp;
+ int port = p->dp->index;
+ int largest_mtu = 0;
+ int new_master_mtu;
+ int old_master_mtu;
+ int mtu_limit;
+ int cpu_mtu;
+ int err, i;
+
+ if (!ds->ops->port_change_mtu)
+ return -EOPNOTSUPP;
+
+ for (i = 0; i < ds->num_ports; i++) {
+ int slave_mtu;
+
+ if (!dsa_is_user_port(ds, i))
+ continue;
+
+ /* During probe, this function will be called for each slave
+ * device, while not all of them have been allocated. That's
+ * ok, it doesn't change what the maximum is, so ignore it.
+ */
+ if (!dsa_to_port(ds, i)->slave)
+ continue;
+
+ /* Pretend that we already applied the setting, which we
+ * actually haven't (still haven't done all integrity checks)
+ */
+ if (i == port)
+ slave_mtu = new_mtu;
+ else
+ slave_mtu = dsa_to_port(ds, i)->slave->mtu;
+
+ if (largest_mtu < slave_mtu)
+ largest_mtu = slave_mtu;
+ }
+
+ cpu_dp = dsa_to_port(ds, port)->cpu_dp;
+
+ mtu_limit = min_t(int, master->max_mtu, dev->max_mtu);
+ old_master_mtu = master->mtu;
+ new_master_mtu = largest_mtu + cpu_dp->tag_ops->overhead;
+ if (new_master_mtu > mtu_limit)
+ return -ERANGE;
+
+ /* If the master MTU isn't over limit, there's no need to check the CPU
+ * MTU, since that surely isn't either.
+ */
+ cpu_mtu = largest_mtu;
+
+ /* Start applying stuff */
+ if (new_master_mtu != old_master_mtu) {
+ err = dev_set_mtu(master, new_master_mtu);
+ if (err < 0)
+ goto out_master_failed;
+
+ /* We only need to propagate the MTU of the CPU port to
+ * upstream switches.
+ */
+ err = dsa_port_mtu_change(cpu_dp, cpu_mtu, true);
+ if (err)
+ goto out_cpu_failed;
+ }
+
+ err = dsa_port_mtu_change(dp, new_mtu, false);
+ if (err)
+ goto out_port_failed;
+
+ dev->mtu = new_mtu;
+
+ dsa_bridge_mtu_normalization(dp);
+
+ return 0;
+
+out_port_failed:
+ if (new_master_mtu != old_master_mtu)
+ dsa_port_mtu_change(cpu_dp, old_master_mtu -
+ cpu_dp->tag_ops->overhead,
+ true);
+out_cpu_failed:
+ if (new_master_mtu != old_master_mtu)
+ dev_set_mtu(master, old_master_mtu);
+out_master_failed:
+ return err;
+}
+
static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_drvinfo = dsa_slave_get_drvinfo,
.get_regs_len = dsa_slave_get_regs_len,
@@ -1235,6 +1574,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid,
.ndo_get_devlink_port = dsa_slave_get_devlink_port,
+ .ndo_change_mtu = dsa_slave_change_mtu,
};
static struct device_type dsa_type = {
@@ -1245,7 +1585,8 @@ void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
{
const struct dsa_port *dp = dsa_to_port(ds, port);
- phylink_mac_change(dp->pl, up);
+ if (dp->pl)
+ phylink_mac_change(dp->pl, up);
}
EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
@@ -1405,7 +1746,10 @@ int dsa_slave_create(struct dsa_port *port)
slave_dev->priv_flags |= IFF_NO_QUEUE;
slave_dev->netdev_ops = &dsa_slave_netdev_ops;
slave_dev->min_mtu = 0;
- slave_dev->max_mtu = ETH_MAX_MTU;
+ if (ds->ops->port_max_mtu)
+ slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
+ else
+ slave_dev->max_mtu = ETH_MAX_MTU;
SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
SET_NETDEV_DEV(slave_dev, port->ds->dev);
@@ -1423,6 +1767,15 @@ int dsa_slave_create(struct dsa_port *port)
p->xmit = cpu_dp->tag_ops->xmit;
port->slave = slave_dev;
+ rtnl_lock();
+ ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN);
+ rtnl_unlock();
+ if (ret && ret != -EOPNOTSUPP) {
+ dev_err(ds->dev, "error %d setting MTU on port %d\n",
+ ret, port->index);
+ goto out_free;
+ }
+
netif_carrier_off(slave_dev);
ret = dsa_slave_phy_setup(slave_dev);
@@ -1485,6 +1838,8 @@ static int dsa_slave_changeupper(struct net_device *dev,
if (netif_is_bridge_master(info->upper_dev)) {
if (info->linking) {
err = dsa_port_bridge_join(dp, info->upper_dev);
+ if (!err)
+ dsa_bridge_mtu_normalization(dp);
err = notifier_from_errno(err);
} else {
dsa_port_bridge_leave(dp, info->upper_dev);
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index df4abe897ed6..f3c32ff552b3 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -52,6 +52,40 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds,
return 0;
}
+/* Tell whether @port of switch @ds has to be reprogrammed for the MTU
+ * notification described by @info.
+ *
+ * On the switch the notification is addressed to, the addressed port and
+ * every DSA link port match. On any other switch nothing matches unless
+ * @info->propagate_upstream is set, in which case DSA link ports and CPU
+ * ports match.
+ */
+static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port,
+				 struct dsa_notifier_mtu_info *info)
+{
+	if (ds->index != info->sw_index) {
+		if (!info->propagate_upstream)
+			return false;
+
+		return dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port);
+	}
+
+	return port == info->port || dsa_is_dsa_port(ds, port);
+}
+
+/* Program @info->mtu on every port of @ds that dsa_switch_mtu_match()
+ * selects.
+ *
+ * Returns -EOPNOTSUPP when the driver provides no port_change_mtu
+ * operation, the first non-zero value returned by that operation (the
+ * walk stops right there), or 0 once all selected ports were handled.
+ */
+static int dsa_switch_mtu(struct dsa_switch *ds,
+			  struct dsa_notifier_mtu_info *info)
+{
+	int port;
+
+	if (!ds->ops->port_change_mtu)
+		return -EOPNOTSUPP;
+
+	for (port = 0; port < ds->num_ports; port++) {
+		int err;
+
+		if (!dsa_switch_mtu_match(ds, port, info))
+			continue;
+
+		err = ds->ops->port_change_mtu(ds, port, info->mtu);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
static int dsa_switch_bridge_join(struct dsa_switch *ds,
struct dsa_notifier_bridge_info *info)
{
@@ -328,6 +362,9 @@ static int dsa_switch_event(struct notifier_block *nb,
case DSA_NOTIFIER_VLAN_DEL:
err = dsa_switch_vlan_del(ds, info);
break;
+ case DSA_NOTIFIER_MTU:
+ err = dsa_switch_mtu(ds, info);
+ break;
default:
err = -EOPNOTSUPP;
break;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 2fb6c26294b5..b97ad93d1c1a 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -298,47 +298,4 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
}
EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
-/* In the DSA packet_type handler, skb->data points in the middle of the VLAN
- * tag, after tpid and before tci. This is because so far, ETH_HLEN
- * (DMAC, SMAC, EtherType) bytes were pulled.
- * There are 2 bytes of VLAN tag left in skb->data, and upper
- * layers expect the 'real' EtherType to be consumed as well.
- * Coincidentally, a VLAN header is also of the same size as
- * the number of bytes that need to be pulled.
- *
- * skb_mac_header skb->data
- * | |
- * v v
- * | | | | | | | | | | | | | | | | | | |
- * +-----------------------+-----------------------+-------+-------+-------+
- * | Destination MAC | Source MAC | TPID | TCI | EType |
- * +-----------------------+-----------------------+-------+-------+-------+
- * ^ | |
- * |<--VLAN_HLEN-->to <---VLAN_HLEN--->
- * from |
- * >>>>>>> v
- * >>>>>>> | | | | | | | | | | | | | | |
- * >>>>>>> +-----------------------+-----------------------+-------+
- * >>>>>>> | Destination MAC | Source MAC | EType |
- * +-----------------------+-----------------------+-------+
- * ^ ^
- * (now part of | |
- * skb->head) skb_mac_header skb->data
- */
-struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
-{
- u8 *from = skb_mac_header(skb);
- u8 *dest = from + VLAN_HLEN;
-
- memmove(dest, from, ETH_HLEN - VLAN_HLEN);
- skb_pull(skb, VLAN_HLEN);
- skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
- skb_pull_rcsum(skb, ETH_HLEN);
-
- return skb;
-}
-EXPORT_SYMBOL_GPL(dsa_8021q_remove_header);
-
MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
index 466ffa92a474..55b00694cdba 100644
--- a/net/dsa/tag_ar9331.c
+++ b/net/dsa/tag_ar9331.c
@@ -31,7 +31,7 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb,
__le16 *phdr;
u16 hdr;
- if (skb_cow_head(skb, 0) < 0)
+ if (skb_cow_head(skb, AR9331_HDR_LEN) < 0)
return NULL;
phdr = skb_push(skb, AR9331_HDR_LEN);
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 9c3114179690..cc8512b5f9e2 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -140,8 +140,31 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
/* Remove Broadcom tag and update checksum */
skb_pull_rcsum(skb, BRCM_TAG_LEN);
+ skb->offload_fwd_mark = 1;
+
return skb;
}
+
+/* Flow dissector hook shared by both Broadcom tag layouts: reports the tag
+ * length as the offset to skip and the 16-bit word located two bytes after
+ * skb->data as the protocol. Always returns 0.
+ */
+static int brcm_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto,
+				 int *offset)
+{
+	/* This runs on the DSA master network device, after
+	 * eth_type_trans() has already pulled the Ethernet header.
+	 * A frame uses one of these two layouts:
+	 * -----------------------------------
+	 * | MAC DA | MAC SA | 4b tag | Type | DSA_TAG_PROTO_BRCM
+	 * -----------------------------------
+	 * -----------------------------------
+	 * | 4b tag | MAC DA | MAC SA | Type | DSA_TAG_PROTO_BRCM_PREPEND
+	 * -----------------------------------
+	 * skb->data sits 2 bytes before the real EtherType field, and the
+	 * payload begins 4 bytes away from where skb->data would otherwise
+	 * place it.
+	 */
+	const __be16 *words = (const __be16 *)skb->data;
+
+	*offset = BRCM_TAG_LEN;
+	*proto = words[1];
+
+	return 0;
+}
#endif
#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM)
@@ -177,6 +200,7 @@ static const struct dsa_device_ops brcm_netdev_ops = {
.xmit = brcm_tag_xmit,
.rcv = brcm_tag_rcv,
.overhead = BRCM_TAG_LEN,
+ .flow_dissect = brcm_tag_flow_dissect,
};
DSA_TAG_DRIVER(brcm_netdev_ops);
@@ -205,6 +229,7 @@ static const struct dsa_device_ops brcm_prepend_netdev_ops = {
.xmit = brcm_tag_xmit_prepend,
.rcv = brcm_tag_rcv_prepend,
.overhead = BRCM_TAG_LEN,
+ .flow_dissect = brcm_tag_flow_dissect,
};
DSA_TAG_DRIVER(brcm_prepend_netdev_ops);
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 8e3e7283d430..59de1315100f 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -153,7 +153,8 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
memset(injection, 0, OCELOT_TAG_LEN);
- src = dsa_upstream_port(ds, port);
+ /* Set the source port as the CPU port module and not the NPI port */
+ src = ocelot->num_phys_ports;
dest = BIT(port);
bypass = true;
qos_class = skb->priority;
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index c8a128c9e5e0..70db7c909f74 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -33,7 +33,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
struct dsa_port *dp = dsa_slave_to_port(dev);
u16 *phdr, hdr;
- if (skb_cow_head(skb, 0) < 0)
+ if (skb_cow_head(skb, QCA_HDR_LEN) < 0)
return NULL;
skb_push(skb, QCA_HDR_LEN);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 5366ea430349..d553bf36bd41 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -250,14 +250,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
{
struct sja1105_meta meta = {0};
int source_port, switch_id;
- struct vlan_ethhdr *hdr;
+ struct ethhdr *hdr;
u16 tpid, vid, tci;
bool is_link_local;
bool is_tagged;
bool is_meta;
- hdr = vlan_eth_hdr(skb);
- tpid = ntohs(hdr->h_vlan_proto);
+ hdr = eth_hdr(skb);
+ tpid = ntohs(hdr->h_proto);
is_tagged = (tpid == ETH_P_SJA1105);
is_link_local = sja1105_is_link_local(skb);
is_meta = sja1105_is_meta_frame(skb);
@@ -266,7 +266,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
if (is_tagged) {
/* Normal traffic path. */
- tci = ntohs(hdr->h_vlan_TCI);
+ skb_push_rcsum(skb, ETH_HLEN);
+ __skb_vlan_pop(skb, &tci);
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
vid = tci & VLAN_VID_MASK;
source_port = dsa_8021q_rx_source_port(vid);
switch_id = dsa_8021q_rx_switch_id(vid);
@@ -295,12 +300,6 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
return NULL;
}
- /* Delete/overwrite fake VLAN header, DSA expects to not find
- * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN).
- */
- if (is_tagged)
- skb = dsa_8021q_remove_header(skb);
-
return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
is_meta);
}
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 424545a4aaec..6c360c9c9370 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -5,4 +5,5 @@ obj-y += ioctl.o common.o
obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o
ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
- linkstate.o debug.o wol.o
+ linkstate.o debug.o wol.o features.o privflags.o rings.o \
+ channels.o coalesce.o pause.o eee.o tsinfo.o
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
index fce45dac4205..dae7402eaca3 100644
--- a/net/ethtool/bitset.c
+++ b/net/ethtool/bitset.c
@@ -305,7 +305,8 @@ nla_put_failure:
static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = {
[ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT },
[ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG },
- [ETHTOOL_A_BITSET_SIZE] = { .type = NLA_U32 },
+ [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32,
+ ETHNL_MAX_BITSET_SIZE),
[ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED },
[ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY },
[ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY },
@@ -447,7 +448,10 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
"mask only allowed in compact bitset");
return -EINVAL;
}
+
no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ if (no_mask)
+ ethnl_bitmap32_clear(bitmap, 0, nbits, mod);
nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
bool old_val, new_val;
@@ -584,6 +588,100 @@ int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
return 0;
}
+/**
+ * ethnl_parse_bitset() - Compute effective value and mask from bitset nest
+ * @val:     unsigned long based bitmap to put value into
+ * @mask:    unsigned long based bitmap to put mask into
+ * @nbits:   size of @val and @mask bitmaps
+ * @attr:    nest attribute to parse and apply
+ * @names:   array of bit names; may be null for compact format
+ * @extack:  extack for error reporting
+ *
+ * Provide @nbits size long bitmaps for value and mask so that
+ * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x
+ * the same way ethnl_update_bitset() with the same bitset attribute would.
+ *
+ * If @attr is null, nothing is parsed and @val / @mask are left untouched.
+ * In the compact format, a bitset size larger than @nbits is clamped to
+ * @nbits so that nothing is ever written past the end of @val or @mask.
+ *
+ * Return:   negative error code on failure, 0 on success
+ */
+int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
+		       unsigned int nbits, const struct nlattr *attr,
+		       ethnl_string_array_t names,
+		       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ETHTOOL_A_BITSET_MAX + 1];
+	const struct nlattr *bit_attr;
+	bool no_mask;
+	int rem;
+	int ret;
+
+	if (!attr)
+		return 0;
+	ret = nla_parse_nested(tb, ETHTOOL_A_BITSET_MAX, attr, bitset_policy,
+			       extack);
+	if (ret < 0)
+		return ret;
+	no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+
+	/* Compact format: value (and optionally mask) come as u32 arrays */
+	if (!tb[ETHTOOL_A_BITSET_BITS]) {
+		unsigned int change_bits;
+
+		ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack);
+		if (ret < 0)
+			return ret;
+
+		/* The size is supplied by the sender and the policy only
+		 * bounds it by ETHNL_MAX_BITSET_SIZE, while @val and @mask
+		 * hold just @nbits bits; never copy more than that.
+		 * NOTE(review): harmless no-op if the sanity checks already
+		 * reject sizes above @nbits - confirm.
+		 */
+		change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]);
+		if (change_bits > nbits)
+			change_bits = nbits;
+		bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]),
+				  change_bits);
+		/* bits the sender did not cover are reported as zero */
+		if (change_bits < nbits)
+			bitmap_clear(val, change_bits, nbits - change_bits);
+		if (no_mask) {
+			bitmap_fill(mask, nbits);
+		} else {
+			bitmap_from_arr32(mask,
+					  nla_data(tb[ETHTOOL_A_BITSET_MASK]),
+					  change_bits);
+			if (change_bits < nbits)
+				bitmap_clear(mask, change_bits,
+					     nbits - change_bits);
+		}
+
+		return 0;
+	}
+
+	/* Verbose (bit-by-bit) format: value/mask arrays are not allowed */
+	if (tb[ETHTOOL_A_BITSET_VALUE]) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+				    "value only allowed in compact bitset");
+		return -EINVAL;
+	}
+	if (tb[ETHTOOL_A_BITSET_MASK]) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+				    "mask only allowed in compact bitset");
+		return -EINVAL;
+	}
+
+	/* With NOMASK every bit is affected; otherwise only the listed ones */
+	bitmap_zero(val, nbits);
+	if (no_mask)
+		bitmap_fill(mask, nbits);
+	else
+		bitmap_zero(mask, nbits);
+
+	nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
+		unsigned int idx;
+		bool bit_val;
+
+		ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask,
+				      names, extack);
+		if (ret < 0)
+			return ret;
+		if (bit_val)
+			__set_bit(idx, val);
+		if (!no_mask)
+			__set_bit(idx, mask);
+	}
+
+	return 0;
+}
+
#if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN)
/* 64-bit big endian architectures are the only case when u32 based bitmaps
diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h
index b8247e34109d..c2c2e0051d00 100644
--- a/net/ethtool/bitset.h
+++ b/net/ethtool/bitset.h
@@ -3,6 +3,8 @@
#ifndef _NET_ETHTOOL_BITSET_H
#define _NET_ETHTOOL_BITSET_H
+#define ETHNL_MAX_BITSET_SIZE S16_MAX
+
typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN];
int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact);
@@ -24,5 +26,9 @@ int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
const struct nlattr *attr, ethnl_string_array_t names,
struct netlink_ext_ack *extack, bool *mod);
+int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
+ unsigned int nbits, const struct nlattr *attr,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack);
#endif /* _NET_ETHTOOL_BITSET_H */
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
new file mode 100644
index 000000000000..389924b65d05
--- /dev/null
+++ b/net/ethtool/channels.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/xdp_sock.h>
+
+#include "netlink.h"
+#include "common.h"
+
+struct channels_req_info {
+ struct ethnl_req_info base; /* only member: no CHANNELS-specific request state */
+};
+
+struct channels_reply_data {
+ struct ethnl_reply_data base; /* embedded base; CHANNELS_REPDATA() maps it back to this struct */
+ struct ethtool_channels channels; /* filled by ->get_channels() in channels_prepare_data() */
+};
+
+#define CHANNELS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct channels_reply_data, base)
+
+static const struct nla_policy
+channels_get_policy[ETHTOOL_A_CHANNELS_MAX + 1] = { /* GET request: every attribute except the header nest is rejected */
+ [ETHTOOL_A_CHANNELS_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_CHANNELS_RX_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_TX_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_OTHER_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_COMBINED_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_REJECT },
+};
+
+/* Read the device's channel configuration into the reply data.
+ *
+ * Returns -EOPNOTSUPP when the driver has no ->get_channels(), the negative
+ * value from ethnl_ops_begin() when that fails, and 0 otherwise.
+ */
+static int channels_prepare_data(const struct ethnl_req_info *req_base,
+				 struct ethnl_reply_data *reply_base,
+				 struct genl_info *info)
+{
+	struct net_device *dev = reply_base->dev;
+	struct channels_reply_data *reply = CHANNELS_REPDATA(reply_base);
+	int err;
+
+	if (!dev->ethtool_ops->get_channels)
+		return -EOPNOTSUPP;
+
+	err = ethnl_ops_begin(dev);
+	if (err >= 0) {
+		dev->ethtool_ops->get_channels(dev, &reply->channels);
+		ethnl_ops_complete(dev);
+		err = 0;
+	}
+
+	return err;
+}
+
+/* Upper bound of the reply payload: eight u32 attributes, i.e. a _MAX and
+ * a _COUNT attribute for each of RX, TX, OTHER and COMBINED.
+ */
+static int channels_reply_size(const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const int u32_attr_size = nla_total_size(sizeof(u32));
+
+	return 8 * u32_attr_size;
+}
+
+/* Emit the channel attributes into @skb.
+ *
+ * For each channel type (RX, TX, OTHER, COMBINED, in that order) the _MAX
+ * and _COUNT attributes are written only when the reported maximum is
+ * non-zero. Returns -EMSGSIZE as soon as one attribute cannot be put,
+ * 0 otherwise.
+ */
+static int channels_fill_reply(struct sk_buff *skb,
+			       const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const struct channels_reply_data *data = CHANNELS_REPDATA(reply_base);
+	const struct ethtool_channels *ch = &data->channels;
+
+	if (ch->max_rx &&
+	    (nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_MAX, ch->max_rx) ||
+	     nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_COUNT, ch->rx_count)))
+		return -EMSGSIZE;
+
+	if (ch->max_tx &&
+	    (nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_MAX, ch->max_tx) ||
+	     nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_COUNT, ch->tx_count)))
+		return -EMSGSIZE;
+
+	if (ch->max_other &&
+	    (nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_MAX, ch->max_other) ||
+	     nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_COUNT,
+			 ch->other_count)))
+		return -EMSGSIZE;
+
+	if (ch->max_combined &&
+	    (nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_MAX,
+			 ch->max_combined) ||
+	     nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_COUNT,
+			 ch->combined_count)))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+const struct ethnl_request_ops ethnl_channels_request_ops = { /* ties CHANNELS_GET to the policy and callbacks defined in this file */
+ .request_cmd = ETHTOOL_MSG_CHANNELS_GET,
+ .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_CHANNELS_HEADER,
+ .max_attr = ETHTOOL_A_CHANNELS_MAX,
+ .req_info_size = sizeof(struct channels_req_info),
+ .reply_data_size = sizeof(struct channels_reply_data),
+ .request_policy = channels_get_policy,
+
+ .prepare_data = channels_prepare_data,
+ .reply_size = channels_reply_size,
+ .fill_reply = channels_fill_reply,
+};
+
+/* CHANNELS_SET
+ *
+ * Request policy: besides the header nest, only the four *_COUNT attributes
+ * are accepted (as u32); the *_MAX attributes are rejected.
+ */
+
+static const struct nla_policy
+channels_set_policy[ETHTOOL_A_CHANNELS_MAX + 1] = {
+ [ETHTOOL_A_CHANNELS_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_CHANNELS_RX_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_TX_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_OTHER_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_COMBINED_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 },
+};
+
+/* Handle a CHANNELS_SET request.
+ *
+ * Reads the current channel counts from the driver, overrides them with
+ * whichever *_COUNT attributes the request carries and, if anything
+ * changed, validates the result before passing it to ->set_channels():
+ *  - no count may exceed the maximum reported by ->get_channels();
+ *  - the Rx count must still cover a configured Rx flow indirection table;
+ *  - no channel being disabled may be used by a zero-copy AF_XDP socket.
+ * A notification is sent once the driver accepted the new counts.
+ *
+ * Every failure after rtnl_lock() leaves through the out_* labels so that
+ * ethnl_ops_complete(), rtnl_unlock() and dev_put() always run.
+ *
+ * Returns 0 on success (including when nothing had to change) and a
+ * negative error code otherwise.
+ */
+int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *tb[ETHTOOL_A_CHANNELS_MAX + 1];
+	unsigned int from_channel, old_total, i;
+	struct ethtool_channels channels = {};
+	struct ethnl_req_info req_info = {};
+	const struct nlattr *err_attr;
+	const struct ethtool_ops *ops;
+	struct net_device *dev;
+	u32 max_rx_in_use = 0;
+	bool mod = false;
+	int ret;
+
+	ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+			  ETHTOOL_A_CHANNELS_MAX, channels_set_policy,
+			  info->extack);
+	if (ret < 0)
+		return ret;
+	ret = ethnl_parse_header_dev_get(&req_info,
+					 tb[ETHTOOL_A_CHANNELS_HEADER],
+					 genl_info_net(info), info->extack,
+					 true);
+	if (ret < 0)
+		return ret;
+	dev = req_info.dev;
+	ops = dev->ethtool_ops;
+	ret = -EOPNOTSUPP;
+	if (!ops->get_channels || !ops->set_channels)
+		goto out_dev;
+
+	rtnl_lock();
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		goto out_rtnl;
+	ops->get_channels(dev, &channels);
+	/* remembered to find out which channels the request would disable */
+	old_total = channels.combined_count +
+		    max(channels.rx_count, channels.tx_count);
+
+	ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT],
+			 &mod);
+	ethnl_update_u32(&channels.tx_count, tb[ETHTOOL_A_CHANNELS_TX_COUNT],
+			 &mod);
+	ethnl_update_u32(&channels.other_count,
+			 tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod);
+	ethnl_update_u32(&channels.combined_count,
+			 tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod);
+	ret = 0;
+	if (!mod)
+		goto out_ops;
+
+	/* ensure new channel counts are within limits */
+	if (channels.rx_count > channels.max_rx)
+		err_attr = tb[ETHTOOL_A_CHANNELS_RX_COUNT];
+	else if (channels.tx_count > channels.max_tx)
+		err_attr = tb[ETHTOOL_A_CHANNELS_TX_COUNT];
+	else if (channels.other_count > channels.max_other)
+		err_attr = tb[ETHTOOL_A_CHANNELS_OTHER_COUNT];
+	else if (channels.combined_count > channels.max_combined)
+		err_attr = tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT];
+	else
+		err_attr = NULL;
+	if (err_attr) {
+		ret = -EINVAL;
+		NL_SET_ERR_MSG_ATTR(info->extack, err_attr,
+				    "requested channel count exceeds maximum");
+		goto out_ops;
+	}
+
+	/* ensure the new Rx count fits within the configured Rx flow
+	 * indirection table settings
+	 */
+	if (netif_is_rxfh_configured(dev) &&
+	    !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
+	    (channels.combined_count + channels.rx_count) <= max_rx_in_use) {
+		GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings");
+		/* must unwind: RTNL is held and ops_begin() succeeded */
+		ret = -EINVAL;
+		goto out_ops;
+	}
+
+	/* Disabling channels, query zero-copy AF_XDP sockets */
+	from_channel = channels.combined_count +
+		       min(channels.rx_count, channels.tx_count);
+	for (i = from_channel; i < old_total; i++)
+		if (xdp_get_umem_from_qid(dev, i)) {
+			GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets");
+			/* same here: never return with RTNL still held */
+			ret = -EINVAL;
+			goto out_ops;
+		}
+
+	ret = dev->ethtool_ops->set_channels(dev, &channels);
+	if (ret < 0)
+		goto out_ops;
+	ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL);
+
+out_ops:
+	ethnl_ops_complete(dev);
+out_rtnl:
+	rtnl_unlock();
+out_dev:
+	dev_put(dev);
+	return ret;
+}
diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
new file mode 100644
index 000000000000..6afd99042d67
--- /dev/null
+++ b/net/ethtool/coalesce.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct coalesce_req_info {
+ struct ethnl_req_info base; /* only member: no COALESCE-specific request state */
+};
+
+struct coalesce_reply_data {
+ struct ethnl_reply_data base; /* embedded base; COALESCE_REPDATA() maps it back to this struct */
+ struct ethtool_coalesce coalesce; /* filled by ->get_coalesce() in coalesce_prepare_data() */
+ u32 supported_params; /* copy of ethtool_ops::supported_coalesce_params taken in coalesce_prepare_data() */
+};
+
+#define COALESCE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct coalesce_reply_data, base)
+
+#define __SUPPORTED_OFFSET ETHTOOL_A_COALESCE_RX_USECS
+/* Translate a coalescing attribute type into its bit of the supported
+ * parameters mask; ETHTOOL_A_COALESCE_RX_USECS corresponds to bit 0.
+ */
+static u32 attr_to_mask(unsigned int attr_type)
+{
+	unsigned int bit = attr_type - __SUPPORTED_OFFSET;
+
+	return BIT(bit);
+}
+
+/* build time check that indices in ethtool_ops::supported_coalesce_params
+ * match corresponding attribute types with an offset
+ *
+ * Each use below breaks the build unless ETHTOOL_<x> equals
+ * BIT(ETHTOOL_A_<x> - __SUPPORTED_OFFSET), which is the mapping that
+ * attr_to_mask() computes at run time.
+ */
+#define __CHECK_SUPPORTED_OFFSET(x) \
+ static_assert((ETHTOOL_ ## x) == \
+ BIT((ETHTOOL_A_ ## x) - __SUPPORTED_OFFSET))
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_STATS_BLOCK_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_RX);
+__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_TX);
+__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL);
+
+static const struct nla_policy
+coalesce_get_policy[ETHTOOL_A_COALESCE_MAX + 1] = { /* GET request: every attribute except the header nest is rejected */
+ [ETHTOOL_A_COALESCE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_REJECT },
+};
+
+/* Collect the device's coalescing settings for a GET reply.
+ *
+ * Records the driver's supported_coalesce_params mask, then queries
+ * ->get_coalesce() between ethnl_ops_begin() and ethnl_ops_complete().
+ * Returns -EOPNOTSUPP when the driver has no ->get_coalesce(), the negative
+ * value from ethnl_ops_begin() when that fails, and otherwise whatever
+ * ->get_coalesce() returned.
+ */
+static int coalesce_prepare_data(const struct ethnl_req_info *req_base,
+				 struct ethnl_reply_data *reply_base,
+				 struct genl_info *info)
+{
+	struct net_device *dev = reply_base->dev;
+	struct coalesce_reply_data *reply = COALESCE_REPDATA(reply_base);
+	int err;
+
+	if (!dev->ethtool_ops->get_coalesce)
+		return -EOPNOTSUPP;
+
+	reply->supported_params = dev->ethtool_ops->supported_coalesce_params;
+
+	err = ethnl_ops_begin(dev);
+	if (err >= 0) {
+		err = dev->ethtool_ops->get_coalesce(dev, &reply->coalesce);
+		ethnl_ops_complete(dev);
+	}
+
+	return err;
+}
+
+/* Upper bound of the reply payload: one attribute per coalescing
+ * parameter. All of them are u32 except _USE_ADAPTIVE_RX and
+ * _USE_ADAPTIVE_TX, which are u8, giving twenty u32 and two u8 attributes.
+ */
+static int coalesce_reply_size(const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const int u32_attr_size = nla_total_size(sizeof(u32));
+	const int u8_attr_size = nla_total_size(sizeof(u8));
+
+	return 20 * u32_attr_size + 2 * u8_attr_size;
+}
+
+/* Put a u32 coalescing attribute unless its value is zero and the driver
+ * does not list the parameter as supported. Returns true when putting the
+ * attribute failed, false when it was written or skipped.
+ */
+static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val,
+			     u32 supported_params)
+{
+	if (val || (supported_params & attr_to_mask(attr_type)))
+		return nla_put_u32(skb, attr_type, val);
+
+	return false;
+}
+
+/* Put a boolean coalescing attribute (as a u8 holding 0 or 1) unless the
+ * value is zero and the driver does not list the parameter as supported.
+ * Returns true when putting the attribute failed, false when it was
+ * written or skipped.
+ */
+static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val,
+			      u32 supported_params)
+{
+	if (val || (supported_params & attr_to_mask(attr_type)))
+		return nla_put_u8(skb, attr_type, !!val);
+
+	return false;
+}
+
+/* Emit the coalescing attributes into @skb, in attribute type order.
+ *
+ * Each parameter goes through coalesce_put_u32() / coalesce_put_bool(),
+ * which skip zero values of parameters the driver does not declare as
+ * supported. Returns -EMSGSIZE as soon as one attribute cannot be put,
+ * 0 otherwise.
+ */
+static int coalesce_fill_reply(struct sk_buff *skb,
+			       const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+	const struct ethtool_coalesce *c = &data->coalesce;
+	u32 mask = data->supported_params;
+
+	/* base Rx/Tx parameters, statistics block and adaptive switches */
+	if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS,
+			     c->rx_coalesce_usecs, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES,
+			     c->rx_max_coalesced_frames, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_IRQ,
+			     c->rx_coalesce_usecs_irq, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ,
+			     c->rx_max_coalesced_frames_irq, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS,
+			     c->tx_coalesce_usecs, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES,
+			     c->tx_max_coalesced_frames, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_IRQ,
+			     c->tx_coalesce_usecs_irq, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ,
+			     c->tx_max_coalesced_frames_irq, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_STATS_BLOCK_USECS,
+			     c->stats_block_coalesce_usecs, mask) ||
+	    coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX,
+			      c->use_adaptive_rx_coalesce, mask) ||
+	    coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX,
+			      c->use_adaptive_tx_coalesce, mask))
+		return -EMSGSIZE;
+
+	/* low packet rate parameters */
+	if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_LOW,
+			     c->pkt_rate_low, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_LOW,
+			     c->rx_coalesce_usecs_low, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW,
+			     c->rx_max_coalesced_frames_low, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_LOW,
+			     c->tx_coalesce_usecs_low, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW,
+			     c->tx_max_coalesced_frames_low, mask))
+		return -EMSGSIZE;
+
+	/* high packet rate parameters and the sampling interval */
+	if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_HIGH,
+			     c->pkt_rate_high, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_HIGH,
+			     c->rx_coalesce_usecs_high, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH,
+			     c->rx_max_coalesced_frames_high, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_HIGH,
+			     c->tx_coalesce_usecs_high, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH,
+			     c->tx_max_coalesced_frames_high, mask) ||
+	    coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL,
+			     c->rate_sample_interval, mask))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+const struct ethnl_request_ops ethnl_coalesce_request_ops = { /* ties COALESCE_GET to the policy and callbacks defined in this file */
+ .request_cmd = ETHTOOL_MSG_COALESCE_GET,
+ .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_COALESCE_HEADER,
+ .max_attr = ETHTOOL_A_COALESCE_MAX,
+ .req_info_size = sizeof(struct coalesce_req_info),
+ .reply_data_size = sizeof(struct coalesce_reply_data),
+ .request_policy = coalesce_get_policy,
+
+ .prepare_data = coalesce_prepare_data,
+ .reply_size = coalesce_reply_size,
+ .fill_reply = coalesce_fill_reply,
+};
+
+/* COALESCE_SET
+ *
+ * Request policy: besides the header nest, every coalescing parameter is
+ * accepted as u32, except the two USE_ADAPTIVE_* switches which are u8.
+ */
+
+static const struct nla_policy
+coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1] = {
+ [ETHTOOL_A_COALESCE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_U8 },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_U8 },
+ [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 },
+};
+
+/* Handle a COALESCE_SET request.
+ *
+ * Parses the message against coalesce_set_policy, resolves the target
+ * device from the request header and rejects any attribute whose
+ * parameter is not in the driver's supported_coalesce_params. Under
+ * rtnl_lock it then reads the current settings with ->get_coalesce(),
+ * applies the attributes found in the request and, only if the update
+ * helpers flagged a modification, calls ->set_coalesce() and sends an
+ * ETHTOOL_MSG_COALESCE_NTF notification.
+ *
+ * Returns 0 on success (also when nothing was modified), -EOPNOTSUPP if
+ * the driver lacks get_coalesce or set_coalesce, -EINVAL if the request
+ * touches an unsupported parameter, or the negative error returned by
+ * parsing, ethnl_ops_begin() or the driver callbacks.
+ */
+int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_COALESCE_MAX + 1];
+ struct ethtool_coalesce coalesce = {};
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ u32 supported_params;
+ bool mod = false;
+ int ret;
+ u16 a;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+ ETHTOOL_A_COALESCE_MAX, coalesce_set_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_COALESCE_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ /* from here on every exit goes through out_dev, which does dev_put() */
+ dev = req_info.dev;
+ ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP;
+ if (!ops->get_coalesce || !ops->set_coalesce)
+ goto out_dev;
+
+ /* make sure that only supported parameters are present */
+ supported_params = ops->supported_coalesce_params;
+ for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++)
+ if (tb[a] && !(supported_params & attr_to_mask(a))) {
+ ret = -EINVAL;
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[a],
+ "cannot modify an unsupported parameter");
+ goto out_dev;
+ }
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+ /* start from the driver's current settings; the helpers below
+ * presumably leave a field alone when its attribute is absent and
+ * set mod when a value actually changes - confirm in netlink.h
+ */
+ ret = ops->get_coalesce(dev, &coalesce);
+ if (ret < 0)
+ goto out_ops;
+
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_RX_USECS], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_irq,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_IRQ], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_irq,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_TX_USECS], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_irq,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_IRQ], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_irq,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ], &mod);
+ ethnl_update_u32(&coalesce.stats_block_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_STATS_BLOCK_USECS], &mod);
+ ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce,
+ tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod);
+ ethnl_update_bool32(&coalesce.use_adaptive_tx_coalesce,
+ tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX], &mod);
+ ethnl_update_u32(&coalesce.pkt_rate_low,
+ tb[ETHTOOL_A_COALESCE_PKT_RATE_LOW], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_low,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_LOW], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_low,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_low,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_LOW], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_low,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW], &mod);
+ ethnl_update_u32(&coalesce.pkt_rate_high,
+ tb[ETHTOOL_A_COALESCE_PKT_RATE_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_high,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_high,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_high,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_HIGH], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_high,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rate_sample_interval,
+ tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod);
+ /* nothing modified: succeed without calling into the driver */
+ ret = 0;
+ if (!mod)
+ goto out_ops;
+
+ ret = dev->ethtool_ops->set_coalesce(dev, &coalesce);
+ if (ret < 0)
+ goto out_ops;
+ ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+out_dev:
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 636ec6d5110e..423e640e3876 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -1,5 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+
#include "common.h"
const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
@@ -60,6 +63,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
[NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
[NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
[NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
+ [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
};
const char
@@ -168,6 +172,7 @@ const char link_mode_names[][ETH_GSTRING_LEN] = {
__DEFINE_LINK_MODE_NAME(400000, LR8_ER8_FR8, Full),
__DEFINE_LINK_MODE_NAME(400000, DR8, Full),
__DEFINE_LINK_MODE_NAME(400000, CR8, Full),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"),
};
static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
@@ -202,6 +207,53 @@ const char wol_mode_names[][ETH_GSTRING_LEN] = {
};
static_assert(ARRAY_SIZE(wol_mode_names) == WOL_MODE_COUNT);
+/* String names of the SOF_TIMESTAMPING_* flags. Each entry is indexed by
+ * the bit position of its flag (const_ilog2 of the flag value); the
+ * static_assert checks the table has exactly __SOF_TIMESTAMPING_CNT entries.
+ */
+const char sof_timestamping_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(SOF_TIMESTAMPING_TX_HARDWARE)] = "hardware-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_TX_SOFTWARE)] = "software-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_RX_HARDWARE)] = "hardware-receive",
+ [const_ilog2(SOF_TIMESTAMPING_RX_SOFTWARE)] = "software-receive",
+ [const_ilog2(SOF_TIMESTAMPING_SOFTWARE)] = "software-system-clock",
+ [const_ilog2(SOF_TIMESTAMPING_SYS_HARDWARE)] = "hardware-legacy-clock",
+ [const_ilog2(SOF_TIMESTAMPING_RAW_HARDWARE)] = "hardware-raw-clock",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_ID)] = "option-id",
+ [const_ilog2(SOF_TIMESTAMPING_TX_SCHED)] = "sched-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_TX_ACK)] = "ack-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_CMSG)] = "option-cmsg",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_TSONLY)] = "option-tsonly",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_STATS)] = "option-stats",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw",
+};
+static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT);
+
+/* String names of the HWTSTAMP_TX_* values, indexed by the value itself;
+ * the static_assert checks the table has __HWTSTAMP_TX_CNT entries.
+ */
+const char ts_tx_type_names[][ETH_GSTRING_LEN] = {
+ [HWTSTAMP_TX_OFF] = "off",
+ [HWTSTAMP_TX_ON] = "on",
+ [HWTSTAMP_TX_ONESTEP_SYNC] = "onestep-sync",
+ [HWTSTAMP_TX_ONESTEP_P2P] = "onestep-p2p",
+};
+static_assert(ARRAY_SIZE(ts_tx_type_names) == __HWTSTAMP_TX_CNT);
+
+/* String names of the HWTSTAMP_FILTER_* values, indexed by the value
+ * itself; the static_assert checks the table has __HWTSTAMP_FILTER_CNT
+ * entries.
+ */
+const char ts_rx_filter_names[][ETH_GSTRING_LEN] = {
+ [HWTSTAMP_FILTER_NONE] = "none",
+ [HWTSTAMP_FILTER_ALL] = "all",
+ [HWTSTAMP_FILTER_SOME] = "some",
+ [HWTSTAMP_FILTER_PTP_V1_L4_EVENT] = "ptpv1-l4-event",
+ [HWTSTAMP_FILTER_PTP_V1_L4_SYNC] = "ptpv1-l4-sync",
+ [HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ] = "ptpv1-l4-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_L4_EVENT] = "ptpv2-l4-event",
+ [HWTSTAMP_FILTER_PTP_V2_L4_SYNC] = "ptpv2-l4-sync",
+ [HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ] = "ptpv2-l4-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_L2_EVENT] = "ptpv2-l2-event",
+ [HWTSTAMP_FILTER_PTP_V2_L2_SYNC] = "ptpv2-l2-sync",
+ [HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ] = "ptpv2-l2-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_EVENT] = "ptpv2-event",
+ [HWTSTAMP_FILTER_PTP_V2_SYNC] = "ptpv2-sync",
+ [HWTSTAMP_FILTER_PTP_V2_DELAY_REQ] = "ptpv2-delay-req",
+ [HWTSTAMP_FILTER_NTP_ALL] = "ntp-all",
+};
+static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT);
+
/* return false if legacy contained non-0 deprecated fields
* maxtxpkt/maxrxpkt. rest of ksettings always updated
*/
@@ -257,3 +309,65 @@ int __ethtool_get_link(struct net_device *dev)
return netif_running(dev) && dev->ethtool_ops->get_link(dev);
}
+
+/* Find the largest entry in the device's RX flow hash indirection table.
+ *
+ * @dev: device to query
+ * @max: set to the largest table entry on success, untouched on error
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the driver lacks
+ * get_rxfh_indir_size or get_rxfh or reports a table size of 0, -ENOMEM
+ * if the temporary table cannot be allocated, or the error returned by
+ * ->get_rxfh().
+ */
+int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
+{
+ u32 dev_size, current_max = 0;
+ u32 *indir;
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ /* only the indirection table is requested; the other outputs are NULL */
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+ if (ret)
+ goto out;
+
+ /* walk the table from the last entry down to index 0 */
+ while (dev_size--)
+ current_max = max(current_max, indir[dev_size]);
+
+ *max = current_max;
+
+out:
+ kfree(indir);
+ return ret;
+}
+
+/* Sanity-check an ethtool_ops structure.
+ *
+ * Warns and returns -EINVAL when set_coalesce is provided without any
+ * supported_coalesce_params; returns 0 otherwise.
+ */
+int ethtool_check_ops(const struct ethtool_ops *ops)
+{
+ if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params))
+ return -EINVAL;
+ /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime,
+ * the fact that ops are checked at registration time does not
+ * mean the ops attached to a netdev later on are sane.
+ */
+ return 0;
+}
+
+/* Fill @info with the timestamping capabilities of @dev.
+ *
+ * @info is zeroed and its cmd set to ETHTOOL_GET_TS_INFO first. The PHY is
+ * asked when phy_has_tsinfo() says it can answer, otherwise the driver's
+ * ->get_ts_info() is used if present. With neither available, software RX
+ * timestamping is reported and phc_index is set to -1.
+ *
+ * Returns the result of the PHY or driver callback, or 0 for the fallback.
+ */
+int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ memset(info, 0, sizeof(*info));
+ info->cmd = ETHTOOL_GET_TS_INFO;
+
+ if (phy_has_tsinfo(phydev))
+ return phy_ts_info(phydev, info);
+ if (ops->get_ts_info)
+ return ops->get_ts_info(dev, info);
+
+ info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+
+ return 0;
+}
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index 40ba74e0b9bb..a62f68ccc43a 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -6,10 +6,14 @@
#include <linux/netdevice.h>
#include <linux/ethtool.h>
+#define ETHTOOL_DEV_FEATURE_WORDS DIV_ROUND_UP(NETDEV_FEATURE_COUNT, 32)
+
/* compose link mode index from speed, type and duplex */
#define ETHTOOL_LINK_MODE(speed, type, duplex) \
ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT
+#define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1)
+
extern const char
netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN];
extern const char
@@ -21,11 +25,16 @@ phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN];
extern const char link_mode_names[][ETH_GSTRING_LEN];
extern const char netif_msg_class_names[][ETH_GSTRING_LEN];
extern const char wol_mode_names[][ETH_GSTRING_LEN];
+extern const char sof_timestamping_names[][ETH_GSTRING_LEN];
+extern const char ts_tx_type_names[][ETH_GSTRING_LEN];
+extern const char ts_rx_filter_names[][ETH_GSTRING_LEN];
int __ethtool_get_link(struct net_device *dev);
bool convert_legacy_settings_to_link_ksettings(
struct ethtool_link_ksettings *link_ksettings,
const struct ethtool_cmd *legacy_settings);
+int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max);
+int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info);
#endif /* _ETHTOOL_COMMON_H */
diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c
index aaef4843e6ba..1bd026a29f3f 100644
--- a/net/ethtool/debug.c
+++ b/net/ethtool/debug.c
@@ -102,13 +102,16 @@ int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info)
info->extack);
if (ret < 0)
return ret;
- ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_DEBUG_HEADER],
- genl_info_net(info), info->extack, true);
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_DEBUG_HEADER],
+ genl_info_net(info), info->extack,
+ true);
if (ret < 0)
return ret;
dev = req_info.dev;
+ ret = -EOPNOTSUPP;
if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -129,6 +132,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
new file mode 100644
index 000000000000..94aa19cff22f
--- /dev/null
+++ b/net/ethtool/eee.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+/* number of bits in the 'supported' field of struct ethtool_eee */
+#define EEE_MODES_COUNT \
+ (sizeof_field(struct ethtool_eee, supported) * BITS_PER_BYTE)
+
+/* EEE_GET request info: nothing beyond the common request info */
+struct eee_req_info {
+ struct ethnl_req_info base;
+};
+
+/* EEE_GET reply data: common reply data plus the EEE settings filled in by
+ * the driver's ->get_eee()
+ */
+struct eee_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_eee eee;
+};
+
+/* get the enclosing eee_reply_data from a pointer to its base member */
+#define EEE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct eee_reply_data, base)
+
+/* Attribute policy for EEE_GET requests: only the nested header is
+ * accepted, every other attribute is rejected.
+ */
+static const struct nla_policy
+eee_get_policy[ETHTOOL_A_EEE_MAX + 1] = {
+ [ETHTOOL_A_EEE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_MODES_PEER] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_ACTIVE] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_REJECT },
+};
+
+/* Read the device's EEE settings into the reply data.
+ *
+ * Returns -EOPNOTSUPP if the driver has no ->get_eee(), the error from
+ * ethnl_ops_begin() if that fails, otherwise the result of ->get_eee().
+ */
+static int eee_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_eee)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = dev->ethtool_ops->get_eee(dev, &data->eee);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+/* Compute the payload size needed for an EEE_GET reply: two bitsets
+ * (MODES_OURS and MODES_PEER), three u8 attributes and one u32 attribute.
+ *
+ * Returns the size, or a negative error from ethnl_bitset32_size().
+ */
+static int eee_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ const struct ethtool_eee *eee = &data->eee;
+ int len = 0;
+ int ret;
+
+ /* all three mode fields must have the width EEE_MODES_COUNT assumes */
+ BUILD_BUG_ON(sizeof(eee->advertised) * BITS_PER_BYTE !=
+ EEE_MODES_COUNT);
+ BUILD_BUG_ON(sizeof(eee->lp_advertised) * BITS_PER_BYTE !=
+ EEE_MODES_COUNT);
+
+ /* MODES_OURS */
+ ret = ethnl_bitset32_size(&eee->advertised, &eee->supported,
+ EEE_MODES_COUNT, link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ /* MODES_PEER */
+ ret = ethnl_bitset32_size(&eee->lp_advertised, NULL,
+ EEE_MODES_COUNT, link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+
+ len += nla_total_size(sizeof(u8)) + /* _EEE_ACTIVE */
+ nla_total_size(sizeof(u8)) + /* _EEE_ENABLED */
+ nla_total_size(sizeof(u8)) + /* _EEE_TX_LPI_ENABLED */
+ nla_total_size(sizeof(u32)); /* _EEE_TX_LPI_TIMER */
+
+ return len;
+}
+
+/* Write the EEE_GET reply attributes into @skb: the MODES_OURS and
+ * MODES_PEER bitsets followed by ACTIVE, ENABLED, TX_LPI_ENABLED (each
+ * normalized to 0/1) and TX_LPI_TIMER.
+ *
+ * Returns 0 on success, a negative error from ethnl_put_bitset32(), or
+ * -EMSGSIZE if one of the scalar attributes does not fit.
+ */
+static int eee_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ const struct ethtool_eee *eee = &data->eee;
+ int ret;
+
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_OURS,
+ &eee->advertised, &eee->supported,
+ EEE_MODES_COUNT, link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_PEER,
+ &eee->lp_advertised, NULL, EEE_MODES_COUNT,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+
+ if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, !!eee->eee_active) ||
+ nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, !!eee->eee_enabled) ||
+ nla_put_u8(skb, ETHTOOL_A_EEE_TX_LPI_ENABLED,
+ !!eee->tx_lpi_enabled) ||
+ nla_put_u32(skb, ETHTOOL_A_EEE_TX_LPI_TIMER, eee->tx_lpi_timer))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* Request ops for EEE_GET: ties the request/reply command ids, the header
+ * attribute, the attribute limit, the request policy and the EEE
+ * prepare/size/fill callbacks together.
+ */
+const struct ethnl_request_ops ethnl_eee_request_ops = {
+ .request_cmd = ETHTOOL_MSG_EEE_GET,
+ .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_EEE_HEADER,
+ .max_attr = ETHTOOL_A_EEE_MAX,
+ .req_info_size = sizeof(struct eee_req_info),
+ .reply_data_size = sizeof(struct eee_reply_data),
+ .request_policy = eee_get_policy,
+
+ .prepare_data = eee_prepare_data,
+ .reply_size = eee_reply_size,
+ .fill_reply = eee_fill_reply,
+};
+
+/* EEE_SET */
+
+/* Attribute policy for EEE_SET requests: the header and MODES_OURS are
+ * nested, ENABLED and TX_LPI_ENABLED are u8, TX_LPI_TIMER is u32;
+ * MODES_PEER and ACTIVE are rejected.
+ */
+static const struct nla_policy
+eee_set_policy[ETHTOOL_A_EEE_MAX + 1] = {
+ [ETHTOOL_A_EEE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_EEE_MODES_PEER] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_ACTIVE] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_U8 },
+ [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_U8 },
+ [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_U32 },
+};
+
+/* Handle an EEE_SET request.
+ *
+ * Parses the message against eee_set_policy and resolves the target device
+ * from the request header. Under rtnl_lock it reads the current settings
+ * with ->get_eee(), applies the MODES_OURS bitset and the ENABLED,
+ * TX_LPI_ENABLED and TX_LPI_TIMER attributes and, only if the update
+ * helpers flagged a modification, calls ->set_eee() and sends an
+ * ETHTOOL_MSG_EEE_NTF notification.
+ *
+ * Returns 0 on success (also when nothing was modified), -EOPNOTSUPP if
+ * the driver lacks get_eee or set_eee, or the negative error returned by
+ * parsing, ethnl_ops_begin(), the bitset update or the driver callbacks.
+ */
+int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_EEE_MAX + 1];
+ struct ethtool_eee eee = {};
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ bool mod = false;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_EEE_MAX,
+ eee_set_policy, info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_EEE_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ /* from here on every exit goes through out_dev, which does dev_put() */
+ dev = req_info.dev;
+ ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP;
+ if (!ops->get_eee || !ops->set_eee)
+ goto out_dev;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+ ret = ops->get_eee(dev, &eee);
+ if (ret < 0)
+ goto out_ops;
+
+ ret = ethnl_update_bitset32(&eee.advertised, EEE_MODES_COUNT,
+ tb[ETHTOOL_A_EEE_MODES_OURS],
+ link_mode_names, info->extack, &mod);
+ if (ret < 0)
+ goto out_ops;
+ ethnl_update_bool32(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod);
+ ethnl_update_bool32(&eee.tx_lpi_enabled,
+ tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], &mod);
+ /* TX_LPI_TIMER carries a full u32 value (NLA_U32 in eee_set_policy),
+ * not a flag, so it must go through the u32 helper rather than the
+ * bool one.
+ */
+ ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER],
+ &mod);
+ /* nothing modified: succeed without calling into the driver */
+ ret = 0;
+ if (!mod)
+ goto out_ops;
+
+ ret = dev->ethtool_ops->set_eee(dev, &eee);
+ if (ret < 0)
+ goto out_ops;
+ ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+out_dev:
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/features.c b/net/ethtool/features.c
new file mode 100644
index 000000000000..4e632dc987d8
--- /dev/null
+++ b/net/ethtool/features.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+/* FEATURES_GET request info: nothing beyond the common request info */
+struct features_req_info {
+ struct ethnl_req_info base;
+};
+
+/* FEATURES_GET reply data: common reply data plus the feature sets, each
+ * stored as ETHTOOL_DEV_FEATURE_WORDS 32-bit words
+ */
+struct features_reply_data {
+ struct ethnl_reply_data base;
+ u32 hw[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 wanted[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 active[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 nochange[ETHTOOL_DEV_FEATURE_WORDS];
+ u32 all[ETHTOOL_DEV_FEATURE_WORDS];
+};
+
+/* get the enclosing features_reply_data from a pointer to its base member */
+#define FEATURES_REPDATA(__reply_base) \
+ container_of(__reply_base, struct features_reply_data, base)
+
+/* Attribute policy for FEATURES_GET requests: only the nested header is
+ * accepted, every other attribute is rejected.
+ */
+static const struct nla_policy
+features_get_policy[ETHTOOL_A_FEATURES_MAX + 1] = {
+ [ETHTOOL_A_FEATURES_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_FEATURES_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_FEATURES_HW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_REJECT },
+ [ETHTOOL_A_FEATURES_ACTIVE] = { .type = NLA_REJECT },
+ [ETHTOOL_A_FEATURES_NOCHANGE] = { .type = NLA_REJECT },
+};
+
+/* Split @src into ETHTOOL_DEV_FEATURE_WORDS 32-bit words stored in @dest,
+ * least significant word first.
+ */
+static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src)
+{
+ unsigned int i;
+
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; i++)
+ dest[i] = src >> (32 * i);
+}
+
+/* Snapshot the device's feature sets into the reply data: hw_features,
+ * wanted_features, features (active), NETIF_F_NEVER_CHANGE and a mask
+ * with the low NETDEV_FEATURE_COUNT bits set. Always returns 0.
+ */
+static int features_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct features_reply_data *data = FEATURES_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ netdev_features_t all_features;
+
+ ethnl_features_to_bitmap32(data->hw, dev->hw_features);
+ ethnl_features_to_bitmap32(data->wanted, dev->wanted_features);
+ ethnl_features_to_bitmap32(data->active, dev->features);
+ ethnl_features_to_bitmap32(data->nochange, NETIF_F_NEVER_CHANGE);
+ all_features = GENMASK_ULL(NETDEV_FEATURE_COUNT - 1, 0);
+ ethnl_features_to_bitmap32(data->all, all_features);
+
+ return 0;
+}
+
+/* Compute the payload size needed for a FEATURES_GET reply: the sum of the
+ * sizes of the hw, wanted, active and nochange bitsets.
+ *
+ * Returns the size, or a negative error from ethnl_bitset32_size().
+ */
+static int features_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct features_reply_data *data = FEATURES_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ unsigned int len = 0;
+ int ret;
+
+ /* only the hw bitset is sized with a mask (data->all) */
+ ret = ethnl_bitset32_size(data->hw, data->all, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ ret = ethnl_bitset32_size(data->wanted, NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ ret = ethnl_bitset32_size(data->active, NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ ret = ethnl_bitset32_size(data->nochange, NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+
+ return len;
+}
+
+/* Write the FEATURES_GET reply attributes into @skb: the HW, WANTED,
+ * ACTIVE and NOCHANGE bitsets, in that order.
+ *
+ * Returns the first negative error from ethnl_put_bitset32(), otherwise
+ * the result of the last call.
+ */
+static int features_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct features_reply_data *data = FEATURES_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int ret;
+
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_HW, data->hw,
+ data->all, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_WANTED, data->wanted,
+ NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_ACTIVE, data->active,
+ NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ return ret;
+ return ethnl_put_bitset32(skb, ETHTOOL_A_FEATURES_NOCHANGE,
+ data->nochange, NULL, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+}
+
+/* Request ops for FEATURES_GET: ties the request/reply command ids, the
+ * header attribute, the attribute limit, the request policy and the
+ * features prepare/size/fill callbacks together.
+ */
+const struct ethnl_request_ops ethnl_features_request_ops = {
+ .request_cmd = ETHTOOL_MSG_FEATURES_GET,
+ .reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_FEATURES_HEADER,
+ .max_attr = ETHTOOL_A_FEATURES_MAX,
+ .req_info_size = sizeof(struct features_req_info),
+ .reply_data_size = sizeof(struct features_reply_data),
+ .request_policy = features_get_policy,
+
+ .prepare_data = features_prepare_data,
+ .reply_size = features_reply_size,
+ .fill_reply = features_fill_reply,
+};
+
+/* FEATURES_SET */
+
+/* Attribute policy for FEATURES_SET requests: only the nested header and
+ * the nested WANTED bitset are accepted, everything else is rejected.
+ */
+static const struct nla_policy
+features_set_policy[ETHTOOL_A_FEATURES_MAX + 1] = {
+ [ETHTOOL_A_FEATURES_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_FEATURES_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_FEATURES_HW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED },
+ [ETHTOOL_A_FEATURES_ACTIVE] = { .type = NLA_REJECT },
+ [ETHTOOL_A_FEATURES_NOCHANGE] = { .type = NLA_REJECT },
+};
+
+/* Store @val in the unsigned long bitmap @dest (NETDEV_FEATURE_COUNT bits),
+ * least significant word first.
+ */
+static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val)
+{
+ const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT);
+ unsigned int i;
+
+ bitmap_zero(dest, NETDEV_FEATURE_COUNT);
+ for (i = 0; i < words; i++)
+ dest[i] = (unsigned long)(val >> (i * BITS_PER_LONG));
+}
+
+/* Inverse of ethnl_features_to_bitmap(): combine the words of bitmap @src
+ * into a netdev_features_t and clear every bit at or above
+ * NETDEV_FEATURE_COUNT.
+ */
+static netdev_features_t ethnl_bitmap_to_features(unsigned long *src)
+{
+ const unsigned int nft_bits = sizeof(netdev_features_t) * BITS_PER_BYTE;
+ const unsigned int words = BITS_TO_LONGS(NETDEV_FEATURE_COUNT);
+ netdev_features_t ret = 0;
+ unsigned int i;
+
+ for (i = 0; i < words; i++)
+ ret |= (netdev_features_t)(src[i]) << (i * BITS_PER_LONG);
+ /* all-ones shifted right leaves only the low NETDEV_FEATURE_COUNT bits */
+ ret &= ~(netdev_features_t)0 >> (nft_bits - NETDEV_FEATURE_COUNT);
+ return ret;
+}
+
+/* Build and send the ETHTOOL_MSG_FEATURES_SET_REPLY message.
+ *
+ * The reply carries two bitsets: WANTED (@wanted with mask @wanted_mask)
+ * and ACTIVE (@active with mask @active_mask). The message size is computed
+ * up front; if filling still fails, the skb is freed and a one-time warning
+ * about the miscalculated length is emitted.
+ *
+ * Returns the result of genlmsg_reply() on success, -ENOMEM if the reply
+ * skb cannot be set up, or a negative error from the bitset helpers. On
+ * every error path "failed to send reply message" is put into the extack.
+ */
+static int features_send_reply(struct net_device *dev, struct genl_info *info,
+ const unsigned long *wanted,
+ const unsigned long *wanted_mask,
+ const unsigned long *active,
+ const unsigned long *active_mask, bool compact)
+{
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int reply_len = 0;
+ int ret;
+
+ reply_len = ethnl_reply_header_size();
+ ret = ethnl_bitset_size(wanted, wanted_mask, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ goto err;
+ reply_len += ret;
+ ret = ethnl_bitset_size(active, active_mask, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ goto err;
+ reply_len += ret;
+
+ /* preset the error reported when ethnl_reply_init() returns NULL */
+ ret = -ENOMEM;
+ rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_FEATURES_SET_REPLY,
+ ETHTOOL_A_FEATURES_HEADER, info,
+ &reply_payload);
+ if (!rskb)
+ goto err;
+
+ ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_WANTED, wanted,
+ wanted_mask, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ goto nla_put_failure;
+ ret = ethnl_put_bitset(rskb, ETHTOOL_A_FEATURES_ACTIVE, active,
+ active_mask, NETDEV_FEATURE_COUNT,
+ netdev_features_strings, compact);
+ if (ret < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(rskb, reply_payload);
+ ret = genlmsg_reply(rskb, info);
+ return ret;
+
+nla_put_failure:
+ nlmsg_free(rskb);
+ WARN_ONCE(1, "calculated message payload length (%d) not sufficient\n",
+ reply_len);
+err:
+ GENL_SET_ERR_MSG(info, "failed to send reply message");
+ return ret;
+}
+
+/* Handle a FEATURES_SET request.
+ *
+ * Parses the message against features_set_policy (the WANTED bitset is
+ * mandatory), resolves the target device and, under rtnl_lock, merges the
+ * requested bits into the device's wanted features and calls
+ * __netdev_update_features(). Unless ETHTOOL_FLAG_OMIT_REPLY is set, a
+ * reply is sent via features_send_reply(). An ETHTOOL_MSG_FEATURES_NTF
+ * notification is sent when the active features actually changed.
+ *
+ * Returns 0 on success (also when the request changes nothing), -EINVAL if
+ * WANTED is missing or the mask touches bits outside NETIF_F_ETHTOOL_BITS,
+ * or a negative error from parsing or from sending the reply.
+ */
+int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
+{
+ DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT);
+ struct nlattr *tb[ETHTOOL_A_FEATURES_MAX + 1];
+ struct ethnl_req_info req_info = {};
+ struct net_device *dev;
+ bool mod;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+ ETHTOOL_A_FEATURES_MAX, features_set_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ if (!tb[ETHTOOL_A_FEATURES_WANTED])
+ return -EINVAL;
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_FEATURES_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+
+ rtnl_lock();
+ ethnl_features_to_bitmap(old_active, dev->features);
+ ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT,
+ tb[ETHTOOL_A_FEATURES_WANTED],
+ netdev_features_strings, info->extack);
+ if (ret < 0)
+ goto out_rtnl;
+ if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) {
+ GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features");
+ ret = -EINVAL;
+ goto out_rtnl;
+ }
+
+ /* set req_wanted bits not in req_mask from old_active */
+ /* NOTE(review): the baseline here is the active set (dev->features),
+ * not the previous dev->wanted_features - confirm this is intended
+ */
+ bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT);
+ bitmap_andnot(new_active, old_active, req_mask, NETDEV_FEATURE_COUNT);
+ bitmap_or(req_wanted, new_active, req_wanted, NETDEV_FEATURE_COUNT);
+ /* nothing to do (and no reply is sent) if the result equals old_active */
+ if (bitmap_equal(req_wanted, old_active, NETDEV_FEATURE_COUNT)) {
+ ret = 0;
+ goto out_rtnl;
+ }
+
+ dev->wanted_features = ethnl_bitmap_to_features(req_wanted);
+ __netdev_update_features(dev);
+ ethnl_features_to_bitmap(new_active, dev->features);
+ mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT);
+
+ ret = 0;
+ if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) {
+ bool compact = req_info.flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ /* wanted_diff_mask: requested bits whose wanted value differs from
+ * the new active value; active_diff_mask: bits whose active value
+ * changed. The value bitmaps are then limited to those masks.
+ */
+ bitmap_xor(wanted_diff_mask, req_wanted, new_active,
+ NETDEV_FEATURE_COUNT);
+ bitmap_xor(active_diff_mask, old_active, new_active,
+ NETDEV_FEATURE_COUNT);
+ bitmap_and(wanted_diff_mask, wanted_diff_mask, req_mask,
+ NETDEV_FEATURE_COUNT);
+ bitmap_and(req_wanted, req_wanted, wanted_diff_mask,
+ NETDEV_FEATURE_COUNT);
+ bitmap_and(new_active, new_active, active_diff_mask,
+ NETDEV_FEATURE_COUNT);
+
+ ret = features_send_reply(dev, info, req_wanted,
+ wanted_diff_mask, new_active,
+ active_diff_mask, compact);
+ }
+ if (mod)
+ ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL);
+
+out_rtnl:
+ rtnl_unlock();
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index b987052d91ef..89d0b1827aaf 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -56,8 +56,6 @@ EXPORT_SYMBOL(ethtool_op_get_ts_info);
/* Handlers for each ethtool command */
-#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
-
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -198,13 +196,14 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
switch (eth_cmd) {
case ETHTOOL_GTXCSUM:
case ETHTOOL_STXCSUM:
- return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC;
+ return NETIF_F_CSUM_MASK | NETIF_F_FCOE_CRC |
+ NETIF_F_SCTP_CRC;
case ETHTOOL_GRXCSUM:
case ETHTOOL_SRXCSUM:
return NETIF_F_RXCSUM;
case ETHTOOL_GSG:
case ETHTOOL_SSG:
- return NETIF_F_SG;
+ return NETIF_F_SG | NETIF_F_FRAGLIST;
case ETHTOOL_GTSO:
case ETHTOOL_STSO:
return NETIF_F_ALL_TSO;
@@ -459,6 +458,24 @@ static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
return 0;
}
+/* Check if the user is trying to change anything besides speed/duplex.
+ *
+ * Builds a reference ethtool_link_settings holding only the request's
+ * speed, duplex, cmd and link_mode_masks_nwords, with port set to
+ * PORT_OTHER and everything else zero. Returns true when @cmd->base is
+ * byte-identical to that reference and both the supported and
+ * lp_advertising link mode bitmaps are empty; false otherwise.
+ */
+bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd)
+{
+ struct ethtool_link_settings base2 = {};
+
+ base2.speed = cmd->base.speed;
+ base2.port = PORT_OTHER;
+ base2.duplex = cmd->base.duplex;
+ base2.cmd = cmd->base.cmd;
+ base2.link_mode_masks_nwords = cmd->base.link_mode_masks_nwords;
+
+ return !memcmp(&base2, &cmd->base, sizeof(base2)) &&
+ bitmap_empty(cmd->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS) &&
+ bitmap_empty(cmd->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
/* convert a kernel internal ethtool_link_ksettings to
* ethtool_link_usettings in user space. return 0 on success, errno on
* error.
@@ -581,6 +598,27 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
return err;
}
+/* Validate a link settings request and hand back its speed and duplex.
+ *
+ * @dev: device the request is for (not used by this function)
+ * @cmd: requested link settings
+ * @dev_speed: set to the requested speed on success
+ * @dev_duplex: set to the requested duplex on success
+ *
+ * Returns 0 on success, or -EINVAL (outputs untouched) if the speed or
+ * duplex fails validation or ethtool_virtdev_validate_cmd() rejects @cmd.
+ */
+int ethtool_virtdev_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd,
+ u32 *dev_speed, u8 *dev_duplex)
+{
+ u32 speed;
+ u8 duplex;
+
+ speed = cmd->base.speed;
+ duplex = cmd->base.duplex;
+ /* don't allow custom speed and duplex */
+ if (!ethtool_validate_speed(speed) ||
+ !ethtool_validate_duplex(duplex) ||
+ !ethtool_virtdev_validate_cmd(cmd))
+ return -EINVAL;
+ *dev_speed = speed;
+ *dev_duplex = duplex;
+
+ return 0;
+}
+EXPORT_SYMBOL(ethtool_virtdev_set_link_ksettings);
+
/* Query device for its ethtool_cmd settings.
*
* Backward compatibility note: for compatibility with legacy ethtool, this is
@@ -891,37 +929,6 @@ void netdev_rss_key_fill(void *buffer, size_t len)
}
EXPORT_SYMBOL(netdev_rss_key_fill);
-static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
-{
- u32 dev_size, current_max = 0;
- u32 *indir;
- int ret;
-
- if (!dev->ethtool_ops->get_rxfh_indir_size ||
- !dev->ethtool_ops->get_rxfh)
- return -EOPNOTSUPP;
- dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
- if (dev_size == 0)
- return -EOPNOTSUPP;
-
- indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
- if (!indir)
- return -ENOMEM;
-
- ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
- if (ret)
- goto out;
-
- while (dev_size--)
- current_max = max(current_max, indir[dev_size]);
-
- *max = current_max;
-
-out:
- kfree(indir);
- return ret;
-}
-
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
@@ -1347,6 +1354,7 @@ static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
{
struct ethtool_eee edata;
+ int ret;
if (!dev->ethtool_ops->set_eee)
return -EOPNOTSUPP;
@@ -1354,7 +1362,10 @@ static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
if (copy_from_user(&edata, useraddr, sizeof(edata)))
return -EFAULT;
- return dev->ethtool_ops->set_eee(dev, &edata);
+ ret = dev->ethtool_ops->set_eee(dev, &edata);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL);
+ return ret;
}
static int ethtool_nway_reset(struct net_device *dev)
@@ -1505,10 +1516,66 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
return 0;
}
+/* Check a coalescing request against the driver's declared capabilities.
+ *
+ * Each non-zero field of @coalesce contributes its ETHTOOL_COALESCE_* bit to
+ * the set of requested parameters. The request is acceptable only when no
+ * requested bit lies outside ethtool_ops::supported_coalesce_params.
+ */
+static bool
+ethtool_set_coalesce_supported(struct net_device *dev,
+			       struct ethtool_coalesce *coalesce)
+{
+	const u32 allowed = dev->ethtool_ops->supported_coalesce_params;
+	u32 requested = 0;
+
+	/* basic rx/tx delays and frame counts */
+	requested |= coalesce->rx_coalesce_usecs ?
+		     ETHTOOL_COALESCE_RX_USECS : 0;
+	requested |= coalesce->rx_max_coalesced_frames ?
+		     ETHTOOL_COALESCE_RX_MAX_FRAMES : 0;
+	requested |= coalesce->rx_coalesce_usecs_irq ?
+		     ETHTOOL_COALESCE_RX_USECS_IRQ : 0;
+	requested |= coalesce->rx_max_coalesced_frames_irq ?
+		     ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ : 0;
+	requested |= coalesce->tx_coalesce_usecs ?
+		     ETHTOOL_COALESCE_TX_USECS : 0;
+	requested |= coalesce->tx_max_coalesced_frames ?
+		     ETHTOOL_COALESCE_TX_MAX_FRAMES : 0;
+	requested |= coalesce->tx_coalesce_usecs_irq ?
+		     ETHTOOL_COALESCE_TX_USECS_IRQ : 0;
+	requested |= coalesce->tx_max_coalesced_frames_irq ?
+		     ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ : 0;
+	requested |= coalesce->stats_block_coalesce_usecs ?
+		     ETHTOOL_COALESCE_STATS_BLOCK_USECS : 0;
+
+	/* adaptive coalescing switches */
+	requested |= coalesce->use_adaptive_rx_coalesce ?
+		     ETHTOOL_COALESCE_USE_ADAPTIVE_RX : 0;
+	requested |= coalesce->use_adaptive_tx_coalesce ?
+		     ETHTOOL_COALESCE_USE_ADAPTIVE_TX : 0;
+
+	/* low packet rate settings */
+	requested |= coalesce->pkt_rate_low ?
+		     ETHTOOL_COALESCE_PKT_RATE_LOW : 0;
+	requested |= coalesce->rx_coalesce_usecs_low ?
+		     ETHTOOL_COALESCE_RX_USECS_LOW : 0;
+	requested |= coalesce->rx_max_coalesced_frames_low ?
+		     ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW : 0;
+	requested |= coalesce->tx_coalesce_usecs_low ?
+		     ETHTOOL_COALESCE_TX_USECS_LOW : 0;
+	requested |= coalesce->tx_max_coalesced_frames_low ?
+		     ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW : 0;
+
+	/* high packet rate settings */
+	requested |= coalesce->pkt_rate_high ?
+		     ETHTOOL_COALESCE_PKT_RATE_HIGH : 0;
+	requested |= coalesce->rx_coalesce_usecs_high ?
+		     ETHTOOL_COALESCE_RX_USECS_HIGH : 0;
+	requested |= coalesce->rx_max_coalesced_frames_high ?
+		     ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH : 0;
+	requested |= coalesce->tx_coalesce_usecs_high ?
+		     ETHTOOL_COALESCE_TX_USECS_HIGH : 0;
+	requested |= coalesce->tx_max_coalesced_frames_high ?
+		     ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH : 0;
+
+	requested |= coalesce->rate_sample_interval ?
+		     ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL : 0;
+
+	/* supported iff no requested bit is missing from the driver's mask */
+	return !(requested & ~allowed);
+}
+
static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
void __user *useraddr)
{
struct ethtool_coalesce coalesce;
+ int ret;
if (!dev->ethtool_ops->set_coalesce)
return -EOPNOTSUPP;
@@ -1516,7 +1583,13 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
return -EFAULT;
- return dev->ethtool_ops->set_coalesce(dev, &coalesce);
+ if (!ethtool_set_coalesce_supported(dev, &coalesce))
+ return -EOPNOTSUPP;
+
+ ret = dev->ethtool_ops->set_coalesce(dev, &coalesce);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL);
+ return ret;
}
static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
@@ -1536,6 +1609,7 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM };
+ int ret;
if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam)
return -EOPNOTSUPP;
@@ -1552,7 +1626,10 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
ringparam.tx_pending > max.tx_max_pending)
return -EINVAL;
- return dev->ethtool_ops->set_ringparam(dev, &ringparam);
+ ret = dev->ethtool_ops->set_ringparam(dev, &ringparam);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL);
+ return ret;
}
static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
@@ -1577,6 +1654,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
u16 from_channel, to_channel;
u32 max_rx_in_use = 0;
unsigned int i;
+ int ret;
if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
return -EOPNOTSUPP;
@@ -1608,7 +1686,10 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
if (xdp_get_umem_from_qid(dev, i))
return -EINVAL;
- return dev->ethtool_ops->set_channels(dev, &channels);
+ ret = dev->ethtool_ops->set_channels(dev, &channels);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL);
+ return ret;
}
static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
@@ -1628,6 +1709,7 @@ static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_pauseparam pauseparam;
+ int ret;
if (!dev->ethtool_ops->set_pauseparam)
return -EOPNOTSUPP;
@@ -1635,7 +1717,10 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
return -EFAULT;
- return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+ ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL);
+ return ret;
}
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
@@ -2055,32 +2140,17 @@ out:
static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
{
- int err = 0;
struct ethtool_ts_info info;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- struct phy_device *phydev = dev->phydev;
-
- memset(&info, 0, sizeof(info));
- info.cmd = ETHTOOL_GET_TS_INFO;
-
- if (phy_has_tsinfo(phydev)) {
- err = phy_ts_info(phydev, &info);
- } else if (ops->get_ts_info) {
- err = ops->get_ts_info(dev, &info);
- } else {
- info.so_timestamping =
- SOF_TIMESTAMPING_RX_SOFTWARE |
- SOF_TIMESTAMPING_SOFTWARE;
- info.phc_index = -1;
- }
+ int err;
+ err = __ethtool_get_ts_info(dev, &info);
if (err)
return err;
if (copy_to_user(useraddr, &info, sizeof(info)))
- err = -EFAULT;
+ return -EFAULT;
- return err;
+ return 0;
}
static int __ethtool_get_module_info(struct net_device *dev,
@@ -2297,6 +2367,11 @@ ethtool_set_per_queue_coalesce(struct net_device *dev,
goto roll_back;
}
+ if (!ethtool_set_coalesce_supported(dev, &coalesce)) {
+ ret = -EOPNOTSUPP;
+ goto roll_back;
+ }
+
ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce);
if (ret != 0)
goto roll_back;
@@ -2612,6 +2687,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GPFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
dev->ethtool_ops->get_priv_flags);
+ if (!rc)
+ ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL);
break;
case ETHTOOL_SPFLAGS:
rc = ethtool_set_value(dev, useraddr,
diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c
index 5d16cb4e8693..677068deb68c 100644
--- a/net/ethtool/linkinfo.c
+++ b/net/ethtool/linkinfo.c
@@ -121,14 +121,17 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info)
info->extack);
if (ret < 0)
return ret;
- ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKINFO_HEADER],
- genl_info_net(info), info->extack, true);
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_LINKINFO_HEADER],
+ genl_info_net(info), info->extack,
+ true);
if (ret < 0)
return ret;
dev = req_info.dev;
+ ret = -EOPNOTSUPP;
if (!dev->ethtool_ops->get_link_ksettings ||
!dev->ethtool_ops->set_link_ksettings)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -162,6 +165,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index 96f20be64553..452608c6d856 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -237,6 +237,7 @@ static const struct link_mode_info link_mode_params[] = {
__DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full),
__DEFINE_LINK_MODE_PARAMS(400000, DR8, Full),
__DEFINE_LINK_MODE_PARAMS(400000, CR8, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS),
};
static const struct nla_policy
@@ -333,14 +334,17 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info)
info->extack);
if (ret < 0)
return ret;
- ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_LINKMODES_HEADER],
- genl_info_net(info), info->extack, true);
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_LINKMODES_HEADER],
+ genl_info_net(info), info->extack,
+ true);
if (ret < 0)
return ret;
dev = req_info.dev;
+ ret = -EOPNOTSUPP;
if (!dev->ethtool_ops->get_link_ksettings ||
!dev->ethtool_ops->set_link_ksettings)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -370,6 +374,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 180c194fab07..0c772318c023 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -18,7 +18,7 @@ static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = {
};
/**
- * ethnl_parse_header() - parse request header
+ * ethnl_parse_header_dev_get() - parse request header
* @req_info: structure to put results into
* @header: nest attribute with request header
* @net: request netns
@@ -33,13 +33,14 @@ static const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_MAX + 1] = {
*
* Return: 0 on success or negative error code
*/
-int ethnl_parse_header(struct ethnl_req_info *req_info,
- const struct nlattr *header, struct net *net,
- struct netlink_ext_ack *extack, bool require_dev)
+int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
+ const struct nlattr *header, struct net *net,
+ struct netlink_ext_ack *extack, bool require_dev)
{
struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1];
const struct nlattr *devname_attr;
struct net_device *dev = NULL;
+ u32 flags = 0;
int ret;
if (!header) {
@@ -50,8 +51,17 @@ int ethnl_parse_header(struct ethnl_req_info *req_info,
ethnl_header_policy, extack);
if (ret < 0)
return ret;
- devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
+ if (tb[ETHTOOL_A_HEADER_FLAGS]) {
+ flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);
+ if (flags & ~ETHTOOL_FLAG_ALL) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_FLAGS],
+ "unrecognized request flags");
+ nl_set_extack_cookie_u32(extack, ETHTOOL_FLAG_ALL);
+ return -EOPNOTSUPP;
+ }
+ }
+ devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) {
u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]);
@@ -90,9 +100,7 @@ int ethnl_parse_header(struct ethnl_req_info *req_info,
}
req_info->dev = dev;
- if (tb[ETHTOOL_A_HEADER_FLAGS])
- req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);
-
+ req_info->flags = flags;
return 0;
}
@@ -215,6 +223,14 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
[ETHTOOL_MSG_LINKSTATE_GET] = &ethnl_linkstate_request_ops,
[ETHTOOL_MSG_DEBUG_GET] = &ethnl_debug_request_ops,
[ETHTOOL_MSG_WOL_GET] = &ethnl_wol_request_ops,
+ [ETHTOOL_MSG_FEATURES_GET] = &ethnl_features_request_ops,
+ [ETHTOOL_MSG_PRIVFLAGS_GET] = &ethnl_privflags_request_ops,
+ [ETHTOOL_MSG_RINGS_GET] = &ethnl_rings_request_ops,
+ [ETHTOOL_MSG_CHANNELS_GET] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_COALESCE_GET] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_PAUSE_GET] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops,
+ [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops,
};
static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -253,8 +269,8 @@ static int ethnl_default_parse(struct ethnl_req_info *req_info,
request_ops->request_policy, extack);
if (ret < 0)
goto out;
- ret = ethnl_parse_header(req_info, tb[request_ops->hdr_attr], net,
- extack, require_dev);
+ ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr],
+ net, extack, require_dev);
if (ret < 0)
goto out;
@@ -527,6 +543,13 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
[ETHTOOL_MSG_LINKMODES_NTF] = &ethnl_linkmodes_request_ops,
[ETHTOOL_MSG_DEBUG_NTF] = &ethnl_debug_request_ops,
[ETHTOOL_MSG_WOL_NTF] = &ethnl_wol_request_ops,
+ [ETHTOOL_MSG_FEATURES_NTF] = &ethnl_features_request_ops,
+ [ETHTOOL_MSG_PRIVFLAGS_NTF] = &ethnl_privflags_request_ops,
+ [ETHTOOL_MSG_RINGS_NTF] = &ethnl_rings_request_ops,
+ [ETHTOOL_MSG_CHANNELS_NTF] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_COALESCE_NTF] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_PAUSE_NTF] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_EEE_NTF] = &ethnl_eee_request_ops,
};
/* default notification handler */
@@ -612,6 +635,13 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
[ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify,
[ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify,
[ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify,
};
void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
@@ -629,6 +659,29 @@ void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
}
EXPORT_SYMBOL(ethtool_notify);
+/* Send ETHTOOL_MSG_FEATURES_NTF for the device a netdev notifier event
+ * refers to.
+ */
+static void ethnl_notify_features(struct netdev_notifier_info *info)
+{
+	ethtool_notify(netdev_notifier_info_to_dev(info),
+		       ETHTOOL_MSG_FEATURES_NTF, NULL);
+}
+
+/* Netdevice notifier callback: turn NETDEV_FEAT_CHANGE into an ethtool
+ * features notification. All other events are ignored. Always returns
+ * NOTIFY_DONE.
+ */
+static int ethnl_netdev_event(struct notifier_block *this, unsigned long event,
+			      void *ptr)
+{
+	if (event == NETDEV_FEAT_CHANGE)
+		ethnl_notify_features(ptr);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block that routes netdevice events to ethnl_netdev_event(). */
+static struct notifier_block ethnl_netdev_notifier = {
+ .notifier_call = ethnl_netdev_event,
+};
+
/* genetlink setup */
static const struct genl_ops ethtool_genl_ops[] = {
@@ -695,6 +748,97 @@ static const struct genl_ops ethtool_genl_ops[] = {
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_wol,
},
+ {
+ .cmd = ETHTOOL_MSG_FEATURES_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEATURES_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_features,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PRIVFLAGS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PRIVFLAGS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_privflags,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RINGS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RINGS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_rings,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CHANNELS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CHANNELS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_channels,
+ },
+ {
+ .cmd = ETHTOOL_MSG_COALESCE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_COALESCE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_coalesce,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PAUSE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PAUSE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_pause,
+ },
+ {
+ .cmd = ETHTOOL_MSG_EEE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_EEE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_eee,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSINFO_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
};
static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
@@ -723,7 +867,9 @@ static int __init ethnl_init(void)
return ret;
ethnl_ok = true;
- return 0;
+ ret = register_netdevice_notifier(&ethnl_netdev_notifier);
+ WARN(ret < 0, "ethtool: net device notifier registration failed");
+ return ret;
}
subsys_initcall(ethnl_init);
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 60efd87686ad..81b8fa020bcb 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -10,9 +10,10 @@
struct ethnl_req_info;
-int ethnl_parse_header(struct ethnl_req_info *req_info,
- const struct nlattr *nest, struct net *net,
- struct netlink_ext_ack *extack, bool require_dev);
+int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
+ const struct nlattr *nest, struct net *net,
+ struct netlink_ext_ack *extack,
+ bool require_dev);
int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
u16 attrtype);
struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
@@ -336,10 +337,25 @@ extern const struct ethnl_request_ops ethnl_linkmodes_request_ops;
extern const struct ethnl_request_ops ethnl_linkstate_request_ops;
extern const struct ethnl_request_ops ethnl_debug_request_ops;
extern const struct ethnl_request_ops ethnl_wol_request_ops;
+extern const struct ethnl_request_ops ethnl_features_request_ops;
+extern const struct ethnl_request_ops ethnl_privflags_request_ops;
+extern const struct ethnl_request_ops ethnl_rings_request_ops;
+extern const struct ethnl_request_ops ethnl_channels_request_ops;
+extern const struct ethnl_request_ops ethnl_coalesce_request_ops;
+extern const struct ethnl_request_ops ethnl_pause_request_ops;
+extern const struct ethnl_request_ops ethnl_eee_request_ops;
+extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info);
#endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c
new file mode 100644
index 000000000000..7aea35d1e8a5
--- /dev/null
+++ b/net/ethtool/pause.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+/* Parsed PAUSE_GET request; it carries nothing beyond the common request
+ * info.
+ */
+struct pause_req_info {
+ struct ethnl_req_info base;
+};
+
+/* Data collected for a pause reply message. */
+struct pause_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_pauseparam pauseparam; /* filled by ->get_pauseparam() */
+};
+
+/* Get the enclosing pause_reply_data from a pointer to its 'base' member. */
+#define PAUSE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct pause_reply_data, base)
+
+/* Attribute policy for PAUSE_GET requests: only the nested request header is
+ * accepted, the pause value attributes are rejected.
+ */
+static const struct nla_policy
+pause_get_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
+ [ETHTOOL_A_PAUSE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_RX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_TX] = { .type = NLA_REJECT },
+};
+
+/* Read the device's current pause parameters into the reply data.
+ *
+ * Returns -EOPNOTSUPP if the driver has no ->get_pauseparam(), a negative
+ * value from ethnl_ops_begin() if that fails, and 0 otherwise.
+ */
+static int pause_prepare_data(const struct ethnl_req_info *req_base,
+			      struct ethnl_reply_data *reply_base,
+			      struct genl_info *info)
+{
+	struct net_device *dev = reply_base->dev;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int err;
+
+	if (!ops->get_pauseparam)
+		return -EOPNOTSUPP;
+
+	err = ethnl_ops_begin(dev);
+	if (err < 0)
+		return err;
+
+	ops->get_pauseparam(dev, &PAUSE_REPDATA(reply_base)->pauseparam);
+	ethnl_ops_complete(dev);
+
+	return 0;
+}
+
+/* Payload size of a pause reply: three u8 attributes
+ * (_PAUSE_AUTONEG, _PAUSE_RX and _PAUSE_TX).
+ */
+static int pause_reply_size(const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	return 3 * nla_total_size(sizeof(u8));
+}
+
+/* Put the three pause settings into @skb, each normalized to 0 or 1.
+ * Returns -EMSGSIZE as soon as one attribute does not fit, 0 otherwise.
+ */
+static int pause_fill_reply(struct sk_buff *skb,
+			    const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	const struct ethtool_pauseparam *pp =
+		&PAUSE_REPDATA(reply_base)->pauseparam;
+
+	if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pp->autoneg))
+		return -EMSGSIZE;
+	if (nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pp->rx_pause))
+		return -EMSGSIZE;
+	if (nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pp->tx_pause))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Request descriptor for ETHTOOL_MSG_PAUSE_GET: message ids, attribute
+ * layout, per-request structure sizes and the callbacks that collect the
+ * data and build the reply.
+ */
+const struct ethnl_request_ops ethnl_pause_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PAUSE_GET,
+ .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PAUSE_HEADER,
+ .max_attr = ETHTOOL_A_PAUSE_MAX,
+ .req_info_size = sizeof(struct pause_req_info),
+ .reply_data_size = sizeof(struct pause_reply_data),
+ .request_policy = pause_get_policy,
+
+ .prepare_data = pause_prepare_data,
+ .reply_size = pause_reply_size,
+ .fill_reply = pause_fill_reply,
+};
+
+/* PAUSE_SET */
+
+/* Attribute policy for PAUSE_SET requests: the request header plus any of
+ * the three pause settings, each passed as a u8.
+ */
+static const struct nla_policy
+pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
+ [ETHTOOL_A_PAUSE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 },
+};
+
+/* Handler for ETHTOOL_MSG_PAUSE_SET.
+ *
+ * Reads the device's current pause parameters, applies the attributes given
+ * in the request on top of them and, if anything was marked as modified,
+ * passes the result to ->set_pauseparam() and sends ETHTOOL_MSG_PAUSE_NTF.
+ *
+ * Return: 0 when nothing needs changing, -EOPNOTSUPP if the driver lacks
+ * ->get_pauseparam() or ->set_pauseparam(), otherwise the result of the
+ * failing step or of ->set_pauseparam().
+ */
+int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_PAUSE_MAX + 1];
+ struct ethtool_pauseparam params = {};
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ bool mod = false;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_PAUSE_MAX,
+ pause_set_policy, info->extack);
+ if (ret < 0)
+ return ret;
+ /* NOTE(review): presumably takes a reference on req_info.dev on success;
+ * every exit past this point goes through dev_put() at out_dev.
+ */
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_PAUSE_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ ops = dev->ethtool_ops;
+ /* both callbacks are needed: the update is a read-modify-write */
+ ret = -EOPNOTSUPP;
+ if (!ops->get_pauseparam || !ops->set_pauseparam)
+ goto out_dev;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+ /* start from the current settings so omitted attributes stay unchanged */
+ ops->get_pauseparam(dev, &params);
+
+ ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod);
+ ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod);
+ ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod);
+ /* nothing marked as modified: succeed without calling the driver */
+ ret = 0;
+ if (!mod)
+ goto out_ops;
+
+ ret = dev->ethtool_ops->set_pauseparam(dev, &params);
+ if (ret < 0)
+ goto out_ops;
+ ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL);
+
+ /* unwind in reverse order of acquisition */
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+out_dev:
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c
new file mode 100644
index 000000000000..77447dceb109
--- /dev/null
+++ b/net/ethtool/privflags.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+/* Parsed PRIVFLAGS_GET request; it carries nothing beyond the common request
+ * info.
+ */
+struct privflags_req_info {
+ struct ethnl_req_info base;
+};
+
+/* Data collected for a private flags reply message. */
+struct privflags_reply_data {
+ struct ethnl_reply_data base;
+ /* table of flag names, allocated in ethnl_get_priv_flags_info() and
+ * released in privflags_cleanup_data()
+ */
+ const char (*priv_flag_names)[ETH_GSTRING_LEN];
+ unsigned int n_priv_flags; /* number of flags, capped at 32 */
+ u32 priv_flags; /* value returned by ->get_priv_flags() */
+};
+
+/* Get the enclosing privflags_reply_data from a pointer to its 'base'. */
+#define PRIVFLAGS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct privflags_reply_data, base)
+
+/* Attribute policy for PRIVFLAGS_GET requests: only the nested request
+ * header is accepted, the flags bitset is rejected.
+ */
+static const struct nla_policy
+privflags_get_policy[ETHTOOL_A_PRIVFLAGS_MAX + 1] = {
+ [ETHTOOL_A_PRIVFLAGS_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PRIVFLAGS_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_REJECT },
+};
+
+/* Query how many private flags a device has and, optionally, their names.
+ *
+ * @count receives the number of flags, capped at 32. If @names is not NULL,
+ * *@names receives a newly allocated table of flag names which the caller
+ * must kfree().
+ *
+ * Returns 0 on success, the negative value reported by ->get_sset_count(),
+ * or -ENOMEM if the name table cannot be allocated.
+ */
+static int ethnl_get_priv_flags_info(struct net_device *dev,
+				     unsigned int *count,
+				     const char (**names)[ETH_GSTRING_LEN])
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int total = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+
+	if (total < 0)
+		return total;
+
+	if (names) {
+		/* The table is sized for every flag the driver reports, not
+		 * for the capped count, because ->get_strings() writes all
+		 * of them.
+		 */
+		*names = kcalloc(total, ETH_GSTRING_LEN, GFP_KERNEL);
+		if (!*names)
+			return -ENOMEM;
+		ops->get_strings(dev, ETH_SS_PRIV_FLAGS, (u8 *)*names);
+	}
+
+	/* More than 32 flags could be passed to userspace via netlink, but
+	 * ethtool_ops::get_priv_flags() cannot return more than 32 bits.
+	 */
+	if (WARN_ONCE(total > 32,
+		      "device %s reports more than 32 private flags (%d)\n",
+		      netdev_name(dev), total))
+		*count = 32;
+	else
+		*count = total;
+
+	return 0;
+}
+
+/* Collect the private flag names, their count and the current flag values
+ * into the reply data.
+ *
+ * Returns -EOPNOTSUPP if the driver lacks ->get_priv_flags(),
+ * ->get_sset_count() or ->get_strings(); otherwise the result of
+ * ethnl_ops_begin() on failure or of ethnl_get_priv_flags_info().
+ */
+static int privflags_prepare_data(const struct ethnl_req_info *req_base,
+				  struct ethnl_reply_data *reply_base,
+				  struct genl_info *info)
+{
+	struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	const char (*flag_names)[ETH_GSTRING_LEN];
+	unsigned int flag_count;
+	int err;
+
+	if (!(ops->get_priv_flags && ops->get_sset_count && ops->get_strings))
+		return -EOPNOTSUPP;
+
+	err = ethnl_ops_begin(dev);
+	if (err < 0)
+		return err;
+
+	err = ethnl_get_priv_flags_info(dev, &flag_count, &flag_names);
+	if (err >= 0) {
+		data->priv_flags = ops->get_priv_flags(dev);
+		data->priv_flag_names = flag_names;
+		data->n_priv_flags = flag_count;
+	}
+
+	ethnl_ops_complete(dev);
+	return err;
+}
+
+/* Bitmask with the lowest @n bits set, for 0 <= @n <= 32.
+ *
+ * A driver may report zero private flags. Shifting a 32-bit value by 32 is
+ * undefined behaviour in C, so that case is handled explicitly instead of
+ * being computed as ~0 >> 32. Callers pass n_priv_flags, which
+ * ethnl_get_priv_flags_info() caps at 32.
+ */
+static u32 privflags_all_mask(unsigned int n)
+{
+	return n ? ~(u32)0 >> (32 - n) : 0;
+}
+
+/* Size of the ETHTOOL_A_PRIVFLAGS_FLAGS bitset in the reply, in the compact
+ * or verbose form depending on the request flags. Must use the same mask and
+ * bit count as privflags_fill_reply().
+ */
+static int privflags_reply_size(const struct ethnl_req_info *req_base,
+				const struct ethnl_reply_data *reply_base)
+{
+	const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const u32 all_flags = privflags_all_mask(data->n_priv_flags);
+
+	return ethnl_bitset32_size(&data->priv_flags, &all_flags,
+				   data->n_priv_flags,
+				   data->priv_flag_names, compact);
+}
+
+/* Put the private flags into @skb as the ETHTOOL_A_PRIVFLAGS_FLAGS bitset,
+ * with every existing flag marked as valid in the mask. Returns the result
+ * of ethnl_put_bitset32().
+ */
+static int privflags_fill_reply(struct sk_buff *skb,
+				const struct ethnl_req_info *req_base,
+				const struct ethnl_reply_data *reply_base)
+{
+	const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const u32 all_flags = privflags_all_mask(data->n_priv_flags);
+
+	return ethnl_put_bitset32(skb, ETHTOOL_A_PRIVFLAGS_FLAGS,
+				  &data->priv_flags, &all_flags,
+				  data->n_priv_flags, data->priv_flag_names,
+				  compact);
+}
+
+/* Release the flag name table allocated while preparing the reply data. */
+static void privflags_cleanup_data(struct ethnl_reply_data *reply_data)
+{
+	kfree(PRIVFLAGS_REPDATA(reply_data)->priv_flag_names);
+}
+
+/* Request descriptor for ETHTOOL_MSG_PRIVFLAGS_GET: message ids, attribute
+ * layout, per-request structure sizes and the callbacks that collect the
+ * data, build the reply and free the name table afterwards.
+ */
+const struct ethnl_request_ops ethnl_privflags_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET,
+ .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER,
+ .max_attr = ETHTOOL_A_PRIVFLAGS_MAX,
+ .req_info_size = sizeof(struct privflags_req_info),
+ .reply_data_size = sizeof(struct privflags_reply_data),
+ .request_policy = privflags_get_policy,
+
+ .prepare_data = privflags_prepare_data,
+ .reply_size = privflags_reply_size,
+ .fill_reply = privflags_fill_reply,
+ .cleanup_data = privflags_cleanup_data,
+};
+
+/* PRIVFLAGS_SET */
+
+/* Attribute policy for PRIVFLAGS_SET requests: the request header and the
+ * flags bitset, both nested attributes.
+ */
+static const struct nla_policy
+privflags_set_policy[ETHTOOL_A_PRIVFLAGS_MAX + 1] = {
+ [ETHTOOL_A_PRIVFLAGS_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PRIVFLAGS_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED },
+};
+
+/* Handler for ETHTOOL_MSG_PRIVFLAGS_SET.
+ *
+ * Reads the device's current private flags, applies the bitset from the
+ * request on top of them and, if anything was marked as modified, passes the
+ * result to ->set_priv_flags() and sends ETHTOOL_MSG_PRIVFLAGS_NTF.
+ *
+ * Return: -EINVAL if the flags bitset attribute is missing, -EOPNOTSUPP if
+ * the driver lacks any of the four callbacks used here, otherwise the result
+ * of the failing step or of ->set_priv_flags(); 0 when nothing changes.
+ */
+int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_PRIVFLAGS_MAX + 1];
+ const char (*names)[ETH_GSTRING_LEN] = NULL;
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ unsigned int nflags;
+ bool mod = false;
+ bool compact;
+ u32 flags;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+ ETHTOOL_A_PRIVFLAGS_MAX, privflags_set_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ if (!tb[ETHTOOL_A_PRIVFLAGS_FLAGS])
+ return -EINVAL;
+ /* the bitset form decides below whether flag names must be fetched */
+ ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact);
+ if (ret < 0)
+ return ret;
+ /* NOTE(review): presumably takes a reference on req_info.dev on success;
+ * every exit past this point goes through dev_put() at out_dev.
+ */
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_PRIVFLAGS_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP;
+ if (!ops->get_priv_flags || !ops->set_priv_flags ||
+ !ops->get_sset_count || !ops->get_strings)
+ goto out_dev;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+ /* for a compact bitset no name table is requested and names stays NULL */
+ ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names);
+ if (ret < 0)
+ goto out_ops;
+ /* start from the current value so unmentioned flags stay unchanged */
+ flags = ops->get_priv_flags(dev);
+
+ ret = ethnl_update_bitset32(&flags, nflags,
+ tb[ETHTOOL_A_PRIVFLAGS_FLAGS], names,
+ info->extack, &mod);
+ /* parse error, or nothing marked as modified: skip the driver call */
+ if (ret < 0 || !mod)
+ goto out_free;
+ ret = ops->set_priv_flags(dev, flags);
+ if (ret < 0)
+ goto out_free;
+ ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL);
+
+ /* unwind in reverse order of acquisition; names may still be NULL here */
+out_free:
+ kfree(names);
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+out_dev:
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
new file mode 100644
index 000000000000..5422526f4eef
--- /dev/null
+++ b/net/ethtool/rings.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct rings_req_info {
+ struct ethnl_req_info base;
+};
+
+struct rings_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_ringparam ringparam;
+};
+
+#define RINGS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct rings_reply_data, base)
+
+static const struct nla_policy
+rings_get_policy[ETHTOOL_A_RINGS_MAX + 1] = {
+ [ETHTOOL_A_RINGS_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_RINGS_RX_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_RX_MINI_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_RX_JUMBO_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_TX_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_RX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_TX] = { .type = NLA_REJECT },
+};
+
+static int rings_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct rings_reply_data *data = RINGS_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_ringparam)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ dev->ethtool_ops->get_ringparam(dev, &data->ringparam);
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int rings_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u32)) + /* _RINGS_RX_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_TX_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */
+ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */
+ nla_total_size(sizeof(u32)); /* _RINGS_TX */
+}
+
+static int rings_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct rings_reply_data *data = RINGS_REPDATA(reply_base);
+ const struct ethtool_ringparam *ringparam = &data->ringparam;
+
+ if ((ringparam->rx_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX,
+ ringparam->rx_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX,
+ ringparam->rx_pending))) ||
+ (ringparam->rx_mini_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX,
+ ringparam->rx_mini_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI,
+ ringparam->rx_mini_pending))) ||
+ (ringparam->rx_jumbo_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX,
+ ringparam->rx_jumbo_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO,
+ ringparam->rx_jumbo_pending))) ||
+ (ringparam->tx_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX,
+ ringparam->tx_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_TX,
+ ringparam->tx_pending))))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_rings_request_ops = {
+ .request_cmd = ETHTOOL_MSG_RINGS_GET,
+ .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_RINGS_HEADER,
+ .max_attr = ETHTOOL_A_RINGS_MAX,
+ .req_info_size = sizeof(struct rings_req_info),
+ .reply_data_size = sizeof(struct rings_reply_data),
+ .request_policy = rings_get_policy,
+
+ .prepare_data = rings_prepare_data,
+ .reply_size = rings_reply_size,
+ .fill_reply = rings_fill_reply,
+};
+
+/* RINGS_SET */
+
+static const struct nla_policy
+rings_set_policy[ETHTOOL_A_RINGS_MAX + 1] = {
+ [ETHTOOL_A_RINGS_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_RINGS_RX_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_RX_MINI_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_RX_JUMBO_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_TX_MAX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 },
+};
+
+int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_RINGS_MAX + 1];
+ struct ethtool_ringparam ringparam = {};
+ struct ethnl_req_info req_info = {};
+ const struct nlattr *err_attr;
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ bool mod = false;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+ ETHTOOL_A_RINGS_MAX, rings_set_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_RINGS_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP;
+ if (!ops->get_ringparam || !ops->set_ringparam)
+ goto out_dev;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+ ops->get_ringparam(dev, &ringparam);
+
+ ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod);
+ ethnl_update_u32(&ringparam.rx_mini_pending,
+ tb[ETHTOOL_A_RINGS_RX_MINI], &mod);
+ ethnl_update_u32(&ringparam.rx_jumbo_pending,
+ tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod);
+ ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod);
+ ret = 0;
+ if (!mod)
+ goto out_ops;
+
+ /* ensure new ring parameters are within limits */
+ if (ringparam.rx_pending > ringparam.rx_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX];
+ else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX_MINI];
+ else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO];
+ else if (ringparam.tx_pending > ringparam.tx_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_TX];
+ else
+ err_attr = NULL;
+ if (err_attr) {
+ ret = -EINVAL;
+ NL_SET_ERR_MSG_ATTR(info->extack, err_attr,
+ "requested ring size exceeds maximum");
+ goto out_ops;
+ }
+
+ ret = dev->ethtool_ops->set_ringparam(dev, &ringparam);
+ if (ret < 0)
+ goto out_ops;
+ ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+out_dev:
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index 8e5911887b4c..95eae5c68a52 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -60,6 +60,21 @@ static const struct strset_info info_template[] = {
.count = WOL_MODE_COUNT,
.strings = wol_mode_names,
},
+ [ETH_SS_SOF_TIMESTAMPING] = {
+ .per_dev = false,
+ .count = __SOF_TIMESTAMPING_CNT,
+ .strings = sof_timestamping_names,
+ },
+ [ETH_SS_TS_TX_TYPES] = {
+ .per_dev = false,
+ .count = __HWTSTAMP_TX_CNT,
+ .strings = ts_tx_type_names,
+ },
+ [ETH_SS_TS_RX_FILTERS] = {
+ .per_dev = false,
+ .count = __HWTSTAMP_FILTER_CNT,
+ .strings = ts_rx_filter_names,
+ },
};
struct strset_req_info {
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
new file mode 100644
index 000000000000..7cb5b512b77c
--- /dev/null
+++ b/net/ethtool/tsinfo.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net_tstamp.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct tsinfo_req_info {
+ struct ethnl_req_info base;
+};
+
+struct tsinfo_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_ts_info ts_info;
+};
+
+#define TSINFO_REPDATA(__reply_base) \
+ container_of(__reply_base, struct tsinfo_reply_data, base)
+
+static const struct nla_policy
+tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = {
+ [ETHTOOL_A_TSINFO_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_TSINFO_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_TSINFO_TIMESTAMPING] = { .type = NLA_REJECT },
+ [ETHTOOL_A_TSINFO_TX_TYPES] = { .type = NLA_REJECT },
+ [ETHTOOL_A_TSINFO_RX_FILTERS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_TSINFO_PHC_INDEX] = { .type = NLA_REJECT },
+};
+
+static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = __ethtool_get_ts_info(dev, &data->ts_info);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int tsinfo_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct ethtool_ts_info *ts_info = &data->ts_info;
+ int len = 0;
+ int ret;
+
+ BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32);
+
+ if (ts_info->so_timestamping) {
+ ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL,
+ __SOF_TIMESTAMPING_CNT,
+ sof_timestamping_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_TIMESTAMPING */
+ }
+ if (ts_info->tx_types) {
+ ret = ethnl_bitset32_size(&ts_info->tx_types, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_TX_TYPES */
+ }
+ if (ts_info->rx_filters) {
+ ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL,
+ __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_RX_FILTERS */
+ }
+ if (ts_info->phc_index >= 0)
+ len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */
+
+ return len;
+}
+
+static int tsinfo_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct ethtool_ts_info *ts_info = &data->ts_info;
+ int ret;
+
+ if (ts_info->so_timestamping) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING,
+ &ts_info->so_timestamping, NULL,
+ __SOF_TIMESTAMPING_CNT,
+ sof_timestamping_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->tx_types) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES,
+ &ts_info->tx_types, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->rx_filters) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS,
+ &ts_info->rx_filters, NULL,
+ __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->phc_index >= 0 &&
+ nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_tsinfo_request_ops = {
+ .request_cmd = ETHTOOL_MSG_TSINFO_GET,
+ .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_TSINFO_HEADER,
+ .max_attr = ETHTOOL_A_TSINFO_MAX,
+ .req_info_size = sizeof(struct tsinfo_req_info),
+ .reply_data_size = sizeof(struct tsinfo_reply_data),
+ .request_policy = tsinfo_get_policy,
+
+ .prepare_data = tsinfo_prepare_data,
+ .reply_size = tsinfo_reply_size,
+ .fill_reply = tsinfo_fill_reply,
+};
diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c
index e1b8a65b64c4..1798421e9f1c 100644
--- a/net/ethtool/wol.c
+++ b/net/ethtool/wol.c
@@ -123,13 +123,15 @@ int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info)
wol_set_policy, info->extack);
if (ret < 0)
return ret;
- ret = ethnl_parse_header(&req_info, tb[ETHTOOL_A_WOL_HEADER],
- genl_info_net(info), info->extack, true);
+ ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_WOL_HEADER],
+ genl_info_net(info), info->extack,
+ true);
if (ret < 0)
return ret;
dev = req_info.dev;
+ ret = -EOPNOTSUPP;
if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -172,6 +174,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
index d5f709b940ff..9787ef11ca71 100644
--- a/net/hsr/hsr_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -113,7 +113,6 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
priv->node_tbl_root = NULL;
return;
}
- priv->node_tbl_file = de;
}
/* hsr_debugfs_term - Tear down debugfs intrastructure
@@ -125,9 +124,7 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
void
hsr_debugfs_term(struct hsr_priv *priv)
{
- debugfs_remove(priv->node_tbl_file);
- priv->node_tbl_file = NULL;
- debugfs_remove(priv->node_tbl_root);
+ debugfs_remove_recursive(priv->node_tbl_root);
priv->node_tbl_root = NULL;
}
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index c7bd6c49fadf..fc7027314ad8 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -57,24 +57,19 @@ static void hsr_set_operstate(struct hsr_port *master, bool has_carrier)
static bool hsr_check_carrier(struct hsr_port *master)
{
struct hsr_port *port;
- bool has_carrier;
- has_carrier = false;
+ ASSERT_RTNL();
- rcu_read_lock();
- hsr_for_each_port(master->hsr, port)
+ hsr_for_each_port(master->hsr, port) {
if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) {
- has_carrier = true;
- break;
+ netif_carrier_on(master->dev);
+ return true;
}
- rcu_read_unlock();
+ }
- if (has_carrier)
- netif_carrier_on(master->dev);
- else
- netif_carrier_off(master->dev);
+ netif_carrier_off(master->dev);
- return has_carrier;
+ return false;
}
static void hsr_check_announce(struct net_device *hsr_dev,
@@ -118,11 +113,9 @@ int hsr_get_max_mtu(struct hsr_priv *hsr)
struct hsr_port *port;
mtu_max = ETH_DATA_LEN;
- rcu_read_lock();
hsr_for_each_port(hsr, port)
if (port->type != HSR_PT_MASTER)
mtu_max = min(port->dev->mtu, mtu_max);
- rcu_read_unlock();
if (mtu_max < HSR_HLEN)
return 0;
@@ -157,7 +150,6 @@ static int hsr_dev_open(struct net_device *dev)
hsr = netdev_priv(dev);
designation = '\0';
- rcu_read_lock();
hsr_for_each_port(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
@@ -175,7 +167,6 @@ static int hsr_dev_open(struct net_device *dev)
netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n",
designation, port->dev->name);
}
- rcu_read_unlock();
if (designation == '\0')
netdev_warn(dev, "No slave devices configured\n");
@@ -350,22 +341,33 @@ static void hsr_announce(struct timer_list *t)
rcu_read_unlock();
}
+static void hsr_del_ports(struct hsr_priv *hsr)
+{
+ struct hsr_port *port;
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (port)
+ hsr_del_port(port);
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port)
+ hsr_del_port(port);
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ if (port)
+ hsr_del_port(port);
+}
+
/* This has to be called after all the readers are gone.
* Otherwise we would have to check the return value of
* hsr_port_get_hsr().
*/
static void hsr_dev_destroy(struct net_device *hsr_dev)
{
- struct hsr_priv *hsr;
- struct hsr_port *port;
- struct hsr_port *tmp;
-
- hsr = netdev_priv(hsr_dev);
+ struct hsr_priv *hsr = netdev_priv(hsr_dev);
hsr_debugfs_term(hsr);
-
- list_for_each_entry_safe(port, tmp, &hsr->ports, port_list)
- hsr_del_port(port);
+ hsr_del_ports(hsr);
del_timer_sync(&hsr->prune_timer);
del_timer_sync(&hsr->announce_timer);
@@ -431,11 +433,10 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
};
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
- unsigned char multicast_spec, u8 protocol_version)
+ unsigned char multicast_spec, u8 protocol_version,
+ struct netlink_ext_ack *extack)
{
struct hsr_priv *hsr;
- struct hsr_port *port;
- struct hsr_port *tmp;
int res;
hsr = netdev_priv(hsr_dev);
@@ -478,7 +479,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
/* Make sure the 1st call to netif_carrier_on() gets through */
netif_carrier_off(hsr_dev);
- res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER);
+ res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER, extack);
if (res)
goto err_add_master;
@@ -486,11 +487,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
if (res)
goto err_unregister;
- res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A);
+ res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A, extack);
if (res)
goto err_add_slaves;
- res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B);
+ res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B, extack);
if (res)
goto err_add_slaves;
@@ -502,8 +503,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
err_add_slaves:
unregister_netdevice(hsr_dev);
err_unregister:
- list_for_each_entry_safe(port, tmp, &hsr->ports, port_list)
- hsr_del_port(port);
+ hsr_del_ports(hsr);
err_add_master:
hsr_del_self_node(hsr);
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
index 6d7759c4f5f9..a099d7de7e79 100644
--- a/net/hsr/hsr_device.h
+++ b/net/hsr/hsr_device.h
@@ -13,7 +13,8 @@
void hsr_dev_setup(struct net_device *dev);
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
- unsigned char multicast_spec, u8 protocol_version);
+ unsigned char multicast_spec, u8 protocol_version,
+ struct netlink_ext_ack *extack);
void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
bool is_hsr_master(struct net_device *dev);
int hsr_get_max_mtu(struct hsr_priv *hsr);
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 364ea2cc028e..03b891904314 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -155,7 +155,8 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
new_node->seq_out[i] = seq_out;
spin_lock_bh(&hsr->list_lock);
- list_for_each_entry_rcu(node, node_db, mac_list) {
+ list_for_each_entry_rcu(node, node_db, mac_list,
+ lockdep_is_held(&hsr->list_lock)) {
if (ether_addr_equal(node->macaddress_A, addr))
goto out;
if (ether_addr_equal(node->macaddress_B, addr))
@@ -317,7 +318,8 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
node_dst = find_node_by_addr_A(&port->hsr->node_db,
eth_hdr(skb)->h_dest);
if (!node_dst) {
- WARN_ONCE(1, "%s: Unknown node\n", __func__);
+ if (net_ratelimit())
+ netdev_err(skb->dev, "%s: Unknown node\n", __func__);
return;
}
if (port->type != node_dst->addr_B_port)
@@ -481,12 +483,9 @@ int hsr_get_node_data(struct hsr_priv *hsr,
struct hsr_port *port;
unsigned long tdiff;
- rcu_read_lock();
node = find_node_by_addr_A(&hsr->node_db, addr);
- if (!node) {
- rcu_read_unlock();
- return -ENOENT; /* No such entry */
- }
+ if (!node)
+ return -ENOENT;
ether_addr_copy(addr_b, node->macaddress_B);
@@ -521,7 +520,5 @@ int hsr_get_node_data(struct hsr_priv *hsr,
*addr_b_ifindex = -1;
}
- rcu_read_unlock();
-
return 0;
}
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
index 9e389accbfc7..26d6c39f24e1 100644
--- a/net/hsr/hsr_main.c
+++ b/net/hsr/hsr_main.c
@@ -85,7 +85,8 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
master->dev->mtu = mtu_max;
break;
case NETDEV_UNREGISTER:
- hsr_del_port(port);
+ if (!is_hsr_master(dev))
+ hsr_del_port(port);
break;
case NETDEV_PRE_TYPE_CHANGE:
/* HSR works only on Ethernet devices. Refuse slave to change
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 754d84b217f0..7321cf8d6d2c 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -166,7 +166,6 @@ struct hsr_priv {
unsigned char sup_multicast_addr[ETH_ALEN];
#ifdef CONFIG_DEBUG_FS
struct dentry *node_tbl_root;
- struct dentry *node_tbl_file;
#endif
};
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 8dc0547f01d0..5465a395da04 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -35,26 +35,34 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
unsigned char multicast_spec, hsr_version;
if (!data) {
- netdev_info(dev, "HSR: No slave devices specified\n");
+ NL_SET_ERR_MSG_MOD(extack, "No slave devices specified");
return -EINVAL;
}
if (!data[IFLA_HSR_SLAVE1]) {
- netdev_info(dev, "HSR: Slave1 device not specified\n");
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 device not specified");
return -EINVAL;
}
link[0] = __dev_get_by_index(src_net,
nla_get_u32(data[IFLA_HSR_SLAVE1]));
+ if (!link[0]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 does not exist");
+ return -EINVAL;
+ }
if (!data[IFLA_HSR_SLAVE2]) {
- netdev_info(dev, "HSR: Slave2 device not specified\n");
+ NL_SET_ERR_MSG_MOD(extack, "Slave2 device not specified");
return -EINVAL;
}
link[1] = __dev_get_by_index(src_net,
nla_get_u32(data[IFLA_HSR_SLAVE2]));
+ if (!link[1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave2 does not exist");
+ return -EINVAL;
+ }
- if (!link[0] || !link[1])
- return -ENODEV;
- if (link[0] == link[1])
+ if (link[0] == link[1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 and Slave2 are same");
return -EINVAL;
+ }
if (!data[IFLA_HSR_MULTICAST_SPEC])
multicast_spec = 0;
@@ -66,34 +74,25 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
else
hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]);
- return hsr_dev_finalize(dev, link, multicast_spec, hsr_version);
+ return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack);
}
static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
- struct hsr_priv *hsr;
+ struct hsr_priv *hsr = netdev_priv(dev);
struct hsr_port *port;
- int res;
- hsr = netdev_priv(dev);
-
- res = 0;
-
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
- if (port)
- res = nla_put_u32(skb, IFLA_HSR_SLAVE1, port->dev->ifindex);
- rcu_read_unlock();
- if (res)
- goto nla_put_failure;
+ if (port) {
+ if (nla_put_u32(skb, IFLA_HSR_SLAVE1, port->dev->ifindex))
+ goto nla_put_failure;
+ }
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
- if (port)
- res = nla_put_u32(skb, IFLA_HSR_SLAVE2, port->dev->ifindex);
- rcu_read_unlock();
- if (res)
- goto nla_put_failure;
+ if (port) {
+ if (nla_put_u32(skb, IFLA_HSR_SLAVE2, port->dev->ifindex))
+ goto nla_put_failure;
+ }
if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN,
hsr->sup_multicast_addr) ||
@@ -251,15 +250,16 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
if (!na)
goto invalid;
- hsr_dev = __dev_get_by_index(genl_info_net(info),
- nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ rcu_read_lock();
+ hsr_dev = dev_get_by_index_rcu(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
if (!hsr_dev)
- goto invalid;
+ goto rcu_unlock;
if (!is_hsr_master(hsr_dev))
- goto invalid;
+ goto rcu_unlock;
/* Send reply */
- skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb_out) {
res = -ENOMEM;
goto fail;
@@ -313,12 +313,10 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq);
if (res < 0)
goto nla_put_failure;
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
if (port)
res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX,
port->dev->ifindex);
- rcu_read_unlock();
if (res < 0)
goto nla_put_failure;
@@ -328,20 +326,22 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq);
if (res < 0)
goto nla_put_failure;
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
if (port)
res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX,
port->dev->ifindex);
- rcu_read_unlock();
if (res < 0)
goto nla_put_failure;
+ rcu_read_unlock();
+
genlmsg_end(skb_out, msg_head);
genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
return 0;
+rcu_unlock:
+ rcu_read_unlock();
invalid:
netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL);
return 0;
@@ -351,6 +351,7 @@ nla_put_failure:
/* Fall through */
fail:
+ rcu_read_unlock();
return res;
}
@@ -358,16 +359,14 @@ fail:
*/
static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
{
- /* For receiving */
- struct nlattr *na;
+ unsigned char addr[ETH_ALEN];
struct net_device *hsr_dev;
-
- /* For sending */
struct sk_buff *skb_out;
- void *msg_head;
struct hsr_priv *hsr;
- void *pos;
- unsigned char addr[ETH_ALEN];
+ bool restart = false;
+ struct nlattr *na;
+ void *pos = NULL;
+ void *msg_head;
int res;
if (!info)
@@ -377,15 +376,17 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
if (!na)
goto invalid;
- hsr_dev = __dev_get_by_index(genl_info_net(info),
- nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ rcu_read_lock();
+ hsr_dev = dev_get_by_index_rcu(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
if (!hsr_dev)
- goto invalid;
+ goto rcu_unlock;
if (!is_hsr_master(hsr_dev))
- goto invalid;
+ goto rcu_unlock;
+restart:
/* Send reply */
- skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb_out = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (!skb_out) {
res = -ENOMEM;
goto fail;
@@ -399,18 +400,26 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
goto nla_put_failure;
}
- res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
- if (res < 0)
- goto nla_put_failure;
+ if (!restart) {
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+ }
hsr = netdev_priv(hsr_dev);
- rcu_read_lock();
- pos = hsr_get_next_node(hsr, NULL, addr);
+ if (!pos)
+ pos = hsr_get_next_node(hsr, NULL, addr);
while (pos) {
res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr);
if (res < 0) {
- rcu_read_unlock();
+ if (res == -EMSGSIZE) {
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out,
+ info->snd_portid);
+ restart = true;
+ goto restart;
+ }
goto nla_put_failure;
}
pos = hsr_get_next_node(hsr, pos, addr);
@@ -422,15 +431,18 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
return 0;
+rcu_unlock:
+ rcu_read_unlock();
invalid:
netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL);
return 0;
nla_put_failure:
- kfree_skb(skb_out);
+ nlmsg_free(skb_out);
/* Fall through */
fail:
+ rcu_read_unlock();
return res;
}
@@ -457,6 +469,7 @@ static struct genl_family hsr_genl_family __ro_after_init = {
.version = 1,
.maxattr = HSR_A_MAX,
.policy = hsr_genl_policy,
+ .netnsok = true,
.module = THIS_MODULE,
.ops = hsr_ops,
.n_ops = ARRAY_SIZE(hsr_ops),
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index fbfd0db182b7..f4b9f7a3ce51 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -25,7 +25,6 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
return RX_HANDLER_PASS;
}
- rcu_read_lock(); /* hsr->node_db, hsr->ports */
port = hsr_port_get_rcu(skb->dev);
if (!port)
goto finish_pass;
@@ -45,11 +44,9 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
hsr_forward_skb(skb, port);
finish_consume:
- rcu_read_unlock(); /* hsr->node_db, hsr->ports */
return RX_HANDLER_CONSUMED;
finish_pass:
- rcu_read_unlock(); /* hsr->node_db, hsr->ports */
return RX_HANDLER_PASS;
}
@@ -58,33 +55,37 @@ bool hsr_port_exists(const struct net_device *dev)
return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame;
}
-static int hsr_check_dev_ok(struct net_device *dev)
+static int hsr_check_dev_ok(struct net_device *dev,
+ struct netlink_ext_ack *extack)
{
/* Don't allow HSR on non-ethernet like devices */
if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
dev->addr_len != ETH_ALEN) {
- netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n");
+ NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave.");
return -EINVAL;
}
/* Don't allow enslaving hsr devices */
if (is_hsr_master(dev)) {
- netdev_info(dev, "Cannot create trees of HSR devices.\n");
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot create trees of HSR devices.");
return -EINVAL;
}
if (hsr_port_exists(dev)) {
- netdev_info(dev, "This device is already a HSR slave.\n");
+ NL_SET_ERR_MSG_MOD(extack,
+ "This device is already a HSR slave.");
return -EINVAL;
}
if (is_vlan_dev(dev)) {
- netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
+ NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver.");
return -EINVAL;
}
if (dev->priv_flags & IFF_DONT_BRIDGE) {
- netdev_info(dev, "This device does not support bridging.\n");
+ NL_SET_ERR_MSG_MOD(extack,
+ "This device does not support bridging.");
return -EOPNOTSUPP;
}
@@ -96,19 +97,25 @@ static int hsr_check_dev_ok(struct net_device *dev)
}
/* Setup device to be added to the HSR bridge. */
-static int hsr_portdev_setup(struct net_device *dev, struct hsr_port *port)
+static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
+ struct hsr_port *port,
+ struct netlink_ext_ack *extack)
+
{
+ struct net_device *hsr_dev;
+ struct hsr_port *master;
int res;
- dev_hold(dev);
res = dev_set_promiscuity(dev, 1);
if (res)
- goto fail_promiscuity;
+ return res;
- /* FIXME:
- * What does net device "adjacency" mean? Should we do
- * res = netdev_master_upper_dev_link(port->dev, port->hsr->dev); ?
- */
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ hsr_dev = master->dev;
+
+ res = netdev_upper_dev_link(dev, hsr_dev, extack);
+ if (res)
+ goto fail_upper_dev_link;
res = netdev_rx_handler_register(dev, hsr_handle_frame, port);
if (res)
@@ -118,21 +125,20 @@ static int hsr_portdev_setup(struct net_device *dev, struct hsr_port *port)
return 0;
fail_rx_handler:
+ netdev_upper_dev_unlink(dev, hsr_dev);
+fail_upper_dev_link:
dev_set_promiscuity(dev, -1);
-fail_promiscuity:
- dev_put(dev);
-
return res;
}
int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
- enum hsr_port_type type)
+ enum hsr_port_type type, struct netlink_ext_ack *extack)
{
struct hsr_port *port, *master;
int res;
if (type != HSR_PT_MASTER) {
- res = hsr_check_dev_ok(dev);
+ res = hsr_check_dev_ok(dev, extack);
if (res)
return res;
}
@@ -145,16 +151,16 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
if (!port)
return -ENOMEM;
+ port->hsr = hsr;
+ port->dev = dev;
+ port->type = type;
+
if (type != HSR_PT_MASTER) {
- res = hsr_portdev_setup(dev, port);
+ res = hsr_portdev_setup(hsr, dev, port, extack);
if (res)
goto fail_dev_setup;
}
- port->hsr = hsr;
- port->dev = dev;
- port->type = type;
-
list_add_tail_rcu(&port->port_list, &hsr->ports);
synchronize_rcu();
@@ -179,21 +185,14 @@ void hsr_del_port(struct hsr_port *port)
list_del_rcu(&port->port_list);
if (port != master) {
- if (master) {
- netdev_update_features(master->dev);
- dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
- }
+ netdev_update_features(master->dev);
+ dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
netdev_rx_handler_unregister(port->dev);
dev_set_promiscuity(port->dev, -1);
+ netdev_upper_dev_unlink(port->dev, master->dev);
}
- /* FIXME?
- * netdev_upper_dev_unlink(port->dev, port->hsr->dev);
- */
-
synchronize_rcu();
- if (port != master)
- dev_put(port->dev);
kfree(port);
}
diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h
index 64b549529592..8953ea279ce9 100644
--- a/net/hsr/hsr_slave.h
+++ b/net/hsr/hsr_slave.h
@@ -13,7 +13,7 @@
#include "hsr_main.h"
int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
- enum hsr_port_type pt);
+ enum hsr_port_type pt, struct netlink_ext_ack *extack);
void hsr_del_port(struct hsr_port *port);
bool hsr_port_exists(const struct net_device *dev);
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
index 2c7a38d76a3a..0672b2f01586 100644
--- a/net/ieee802154/nl_policy.c
+++ b/net/ieee802154/nl_policy.c
@@ -21,7 +21,13 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
[IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, },
[IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, },
[IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_BCN_ORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_SF_ORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_PAN_COORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_BAT_EXT] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_COORD_REALIGN] = { .type = NLA_U8, },
[IEEE802154_ATTR_PAGE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_DEV_TYPE] = { .type = NLA_U8, },
[IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, },
[IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, },
[IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, },
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index f96bd489b362..25a8888826b8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -129,7 +129,7 @@ config IP_PNP_DHCP
If unsure, say Y. Note that if you want to use DHCP, a DHCP server
must be operating on your network. Read
- <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config IP_PNP_BOOTP
bool "IP: BOOTP support"
@@ -144,7 +144,7 @@ config IP_PNP_BOOTP
does BOOTP itself, providing all necessary information on the kernel
command line, you can say N here. If unsure, say Y. Note that if you
want to use BOOTP, a BOOTP server must be operating on your network.
- Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ Read <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config IP_PNP_RARP
bool "IP: RARP support"
@@ -157,7 +157,7 @@ config IP_PNP_RARP
older protocol which is being obsoleted by BOOTP and DHCP), say Y
here. Note that if you want to use RARP, a RARP server must be
operating on your network. Read
- <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config NET_IPIP
tristate "IP: tunneling"
@@ -303,6 +303,7 @@ config SYN_COOKIES
config NET_IPVTI
tristate "Virtual (secure) IP: tunneling"
+ depends on IPV6 || IPV6=n
select INET_TUNNEL
select NET_IP_TUNNEL
select XFRM
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9d97bace13c8..9e1a186a3671 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
+obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2fe295432c24..cf58e29cf746 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -872,7 +872,7 @@ int inet_shutdown(struct socket *sock, int how)
err = -ENOTCONN;
/* Hack to wake up other listeners, who can poll for
EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
- /* fall through */
+ fallthrough;
default:
sk->sk_shutdown |= how;
if (sk->sk_prot->shutdown)
@@ -886,7 +886,7 @@ int inet_shutdown(struct socket *sock, int how)
case TCP_LISTEN:
if (!(how & RCV_SHUTDOWN))
break;
- /* fall through */
+ fallthrough;
case TCP_SYN_SENT:
err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -1793,6 +1793,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net)
free_percpu(net->mib.net_statistics);
free_percpu(net->mib.ip_statistics);
free_percpu(net->mib.tcp_statistics);
+#ifdef CONFIG_MPTCP
+ /* allocated on demand, see mptcp_init_sock() */
+ free_percpu(net->mib.mptcp_statistics);
+#endif
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 974179b3b314..d99e1be94019 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -107,7 +107,7 @@ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
if (optlen < 6)
return -EINVAL;
memcpy(daddr, optptr+optlen-4, 4);
- /* Fall through */
+ fallthrough;
default:
memset(optptr, 0, optlen);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 05eb42f347e8..687971d83b4e 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1181,7 +1181,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
case SIOCSARP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- /* fall through */
+ fallthrough;
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
if (err)
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 574972bc7299..e3939f76b024 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -7,6 +7,7 @@
#include <linux/btf.h>
#include <linux/filter.h>
#include <net/tcp.h>
+#include <net/bpf_sk_storage.h>
static u32 optional_ops[] = {
offsetof(struct tcp_congestion_ops, init),
@@ -27,6 +28,27 @@ static u32 unsupported_ops[] = {
static const struct btf_type *tcp_sock_type;
static u32 tcp_sock_id, sock_id;
+static int btf_sk_storage_get_ids[5];
+static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly;
+
+static int btf_sk_storage_delete_ids[5];
+static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly;
+
+static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids,
+ const struct bpf_func_proto *from)
+{
+ int i;
+
+ *to = *from;
+ to->btf_id = to_btf_ids;
+ for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) {
+ if (to->arg_type[i] == ARG_PTR_TO_SOCKET) {
+ to->arg_type[i] = ARG_PTR_TO_BTF_ID;
+ to->btf_id[i] = tcp_sock_id;
+ }
+ }
+}
+
static int bpf_tcp_ca_init(struct btf *btf)
{
s32 type_id;
@@ -42,6 +64,13 @@ static int bpf_tcp_ca_init(struct btf *btf)
tcp_sock_id = type_id;
tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
+ convert_sk_func_proto(&btf_sk_storage_get_proto,
+ btf_sk_storage_get_ids,
+ &bpf_sk_storage_get_proto);
+ convert_sk_func_proto(&btf_sk_storage_delete_proto,
+ btf_sk_storage_delete_ids,
+ &bpf_sk_storage_delete_proto);
+
return 0;
}
@@ -167,6 +196,10 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
switch (func_id) {
case BPF_FUNC_tcp_send_ack:
return &bpf_tcp_send_ack_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &btf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &btf_sk_storage_delete_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -184,7 +217,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
{
const struct tcp_congestion_ops *utcp_ca;
struct tcp_congestion_ops *tcp_ca;
- size_t tcp_ca_name_len;
int prog_fd;
u32 moff;
@@ -199,13 +231,11 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
tcp_ca->flags = utcp_ca->flags;
return 1;
case offsetof(struct tcp_congestion_ops, name):
- tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name));
- if (!tcp_ca_name_len ||
- tcp_ca_name_len == sizeof(utcp_ca->name))
+ if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
+ sizeof(tcp_ca->name)) <= 0)
return -EINVAL;
if (tcp_ca_find(utcp_ca->name))
return -EEXIST;
- memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name));
return 1;
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 376882215919..0bd10a1f477f 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1724,6 +1724,7 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
unsigned char optbuf[sizeof(struct ip_options) + 40];
struct ip_options *opt = (struct ip_options *)optbuf;
+ int res;
if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
return;
@@ -1735,7 +1736,11 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
memset(opt, 0, sizeof(struct ip_options));
opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
- if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL))
+ rcu_read_lock();
+ res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL);
+ rcu_read_unlock();
+
+ if (res)
return;
if (gateway)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e4632bd2026d..30fa42f5997d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1566,11 +1566,11 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
}
}
ip_mc_up(in_dev);
- /* fall through */
+ fallthrough;
case NETDEV_CHANGEADDR:
if (!IN_DEV_ARP_NOTIFY(in_dev))
break;
- /* fall through */
+ fallthrough;
case NETDEV_NOTIFY_PEERS:
/* Send gratuitous ARP to notify of link change */
inetdev_send_gratuitous_arp(dev, in_dev);
@@ -1588,7 +1588,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (inetdev_valid_mtu(dev->mtu))
break;
/* disable IP when MTU is not enough */
- /* fall through */
+ fallthrough;
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 103c7d599a3c..8b07f3a4f2db 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -341,22 +341,6 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
-static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
-{
- /* Fill padding... */
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = proto;
-}
-
static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
int encap_type,
struct esp_info *esp,
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index e2e219c7854a..731022cff600 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -132,6 +132,36 @@ static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
return segs;
}
+static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ int proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ } else if (x->sel.family != AF_INET6) {
+ skb->transport_header -= IPV4_BEET_PHMAXLEN;
+ } else if (proto == IPPROTO_TCP) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ }
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
@@ -141,6 +171,8 @@ static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
return xfrm4_tunnel_gso_segment(x, skb, features);
case XFRM_MODE_TRANSPORT:
return xfrm4_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm4_beet_gso_segment(x, skb, features);
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 577db1d50a24..213be9c050ad 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -997,7 +997,9 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
return -ENOENT;
}
+ rcu_read_lock();
err = fib_table_dump(tb, skb, cb, &filter);
+ rcu_read_unlock();
return skb->len ? : err;
}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c092e9a55790..818916b2a04d 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -35,7 +35,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack);
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a803cdd9400a..6ed8c9317179 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -570,8 +570,9 @@ static int fib_detect_death(struct fib_info *fi, int order,
return 1;
}
-int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
- u16 encap_type, void *cfg, gfp_t gfp_flags,
+int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
+ struct nlattr *encap, u16 encap_type,
+ void *cfg, gfp_t gfp_flags,
struct netlink_ext_ack *extack)
{
int err;
@@ -589,8 +590,9 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
err = -EINVAL;
goto lwt_failure;
}
- err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family,
- cfg, &lwtstate, extack);
+ err = lwtunnel_build_state(net, encap_type, encap,
+ nhc->nhc_family, cfg, &lwtstate,
+ extack);
if (err)
goto lwt_failure;
@@ -614,7 +616,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
nh->fib_nh_family = AF_INET;
- err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap,
+ err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
if (err)
return err;
@@ -814,7 +816,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-static int fib_encap_match(u16 encap_type,
+static int fib_encap_match(struct net *net, u16 encap_type,
struct nlattr *encap,
const struct fib_nh *nh,
const struct fib_config *cfg,
@@ -826,7 +828,7 @@ static int fib_encap_match(u16 encap_type,
if (encap_type == LWTUNNEL_ENCAP_NONE)
return 0;
- ret = lwtunnel_build_state(encap_type, encap, AF_INET,
+ ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
cfg, &lwtstate, extack);
if (!ret) {
result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
@@ -836,7 +838,7 @@ static int fib_encap_match(u16 encap_type,
return result;
}
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -857,8 +859,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh = fib_info_nh(fi, 0);
if (cfg->fc_encap) {
- if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
- nh, cfg, extack))
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, nh, cfg, extack))
return 1;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1962,7 +1964,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
- /* fall through */
+ fallthrough;
case NETDEV_CHANGE:
nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
break;
@@ -1984,7 +1986,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
fi->fib_flags |= RTNH_F_DEAD;
- /* fall through */
+ fallthrough;
case NETDEV_CHANGE:
fi->fib_flags |= RTNH_F_LINKDOWN;
break;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ff0c24371e33..4f334b425538 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -35,9 +35,6 @@
* Paul E. McKenney <paulmck@us.ibm.com>
* Patrick McHardy <kaber@trash.net>
*/
-
-#define VERSION "0.409"
-
#include <linux/cache.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
@@ -304,8 +301,6 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
call_rcu(&fa->rcu, __alias_free_mem);
}
-#define TNODE_KMALLOC_MAX \
- ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
#define TNODE_VMALLOC_MAX \
ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
@@ -1684,7 +1679,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi, extack) == 0 &&
+ fib_nh_match(net, cfg, fi, extack) == 0 &&
fib_metrics_match(cfg, fi)) {
fa_to_delete = fa;
break;
@@ -2577,6 +2572,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
" %zd bytes, size of tnode: %zd bytes.\n",
LEAF_SIZE, TNODE_SIZE(0));
+ rcu_read_lock();
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
@@ -2596,7 +2592,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
trie_show_usage(seq, t->stats);
#endif
}
+ cond_resched_rcu();
}
+ rcu_read_unlock();
return 0;
}
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 5fd6e8ed02b5..66fdbfe5447c 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -56,7 +56,9 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
-/* Fills in tpi and returns header length to be pulled. */
+/* Fills in tpi and returns header length to be pulled.
+ * Note that caller must use pskb_may_pull() before pulling GRE header.
+ */
int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err, __be16 proto, int nhs)
{
@@ -110,8 +112,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
* - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
*/
if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ u8 _val, *val;
+
+ val = skb_header_pointer(skb, nhs + hdr_len,
+ sizeof(_val), &_val);
+ if (!val)
+ return -EINVAL;
tpi->proto = proto;
- if ((*(u8 *)options & 0xF0) != 0x40)
+ if ((*val & 0xF0) != 0x40)
hdr_len += 4;
}
tpi->hdr_len = hdr_len;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 18068ed42f25..fc61f51d87a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -748,6 +748,39 @@ out:;
}
EXPORT_SYMBOL(__icmp_send);
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ __be32 orig_ip;
+
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(ct->status & IPS_SRC_NAT)) {
+ icmp_send(skb_in, type, code, info);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct iphdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct iphdr))))
+ goto out;
+
+ orig_ip = ip_hdr(skb_in)->saddr;
+ ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip;
+ icmp_send(skb_in, type, code, info);
+ ip_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmp_ndo_send);
+#endif
static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
{
@@ -832,7 +865,7 @@ static bool icmp_unreach(struct sk_buff *skb)
case 3:
if (!icmp_tag_validation(iph->protocol))
goto out;
- /* fall through */
+ fallthrough;
case 0:
info = ntohs(icmph->un.frag.mtu);
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 3b9c7a2725a9..47f0502b2101 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,8 +107,6 @@
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
-#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ)
-#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ)
#define IGMP_QUERY_INTERVAL (125*HZ)
#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index a4db79b1b643..5f34eb951627 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -131,7 +131,7 @@ static int inet_csk_bind_conflict(const struct sock *sk,
{
struct sock *sk2;
bool reuse = sk->sk_reuse;
- bool reuseport = !!sk->sk_reuseport && reuseport_ok;
+ bool reuseport = !!sk->sk_reuseport;
kuid_t uid = sock_i_uid((struct sock *)sk);
/*
@@ -146,17 +146,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if ((!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- (!reuseport || !sk2->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- (sk2->sk_state != TCP_TIME_WAIT &&
- !uid_eq(uid, sock_i_uid(sk2))))) {
- if (inet_rcv_saddr_equal(sk, sk2, true))
- break;
- }
- if (!relax && reuse && sk2->sk_reuse &&
+ if (reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
+ if ((!relax ||
+ (!reuseport_ok &&
+ reuseport && sk2->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ (sk2->sk_state == TCP_TIME_WAIT ||
+ uid_eq(uid, sock_i_uid(sk2))))) &&
+ inet_rcv_saddr_equal(sk, sk2, true))
+ break;
+ } else if (!reuseport_ok ||
+ !reuseport || !sk2->sk_reuseport ||
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2)))) {
if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
@@ -176,12 +180,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
+ bool relax = false;
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
u32 remaining, offset;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
+ports_exhausted:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -219,7 +225,7 @@ other_parity_scan:
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) {
- if (!inet_csk_bind_conflict(sk, tb, false, false))
+ if (!inet_csk_bind_conflict(sk, tb, relax, false))
goto success;
goto next_port;
}
@@ -239,6 +245,12 @@ next_port:
attempt_half = 2;
goto other_half_scan;
}
+
+ if (net->ipv4.sysctl_ip_autobind_reuse && !relax) {
+ /* We still have a chance to connect to different destinations */
+ relax = true;
+ goto ports_exhausted;
+ }
return NULL;
success:
*port_ret = port;
@@ -482,8 +494,28 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
}
spin_unlock_bh(&queue->fastopenq.lock);
}
+
out:
release_sock(sk);
+ if (newsk && mem_cgroup_sockets_enabled) {
+ int amt;
+
+ /* atomically get the memory usage, set and charge the
+ * newsk->sk_memcg.
+ */
+ lock_sock(newsk);
+
+ /* The socket has not been accepted yet, no need to look at
+ * newsk->sk_wmem_queued.
+ */
+ amt = sk_mem_pages(newsk->sk_forward_alloc +
+ atomic_read(&newsk->sk_rmem_alloc));
+ mem_cgroup_sk_alloc(newsk);
+ if (newsk->sk_memcg && amt)
+ mem_cgroup_charge_skmem(newsk->sk_memcg, amt);
+
+ release_sock(newsk);
+ }
if (req)
reqsk_put(req);
return newsk;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index f11e997e517b..5d50aad3cdbf 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -23,6 +23,7 @@
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/inet6_hashtables.h>
+#include <net/bpf_sk_storage.h>
#include <net/netlink.h>
#include <linux/inet.h>
@@ -100,13 +101,9 @@ static size_t inet_sk_attr_size(struct sock *sk,
aux = handler->idiag_get_aux_size(sk, net_admin);
return nla_total_size(sizeof(struct tcp_info))
- + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
- + nla_total_size(1) /* INET_DIAG_TOS */
- + nla_total_size(1) /* INET_DIAG_TCLASS */
- + nla_total_size(4) /* INET_DIAG_MARK */
- + nla_total_size(4) /* INET_DIAG_CLASS_ID */
- + nla_total_size(sizeof(struct inet_diag_meminfo))
+ nla_total_size(sizeof(struct inet_diag_msg))
+ + inet_diag_msg_attrs_size()
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ nla_total_size(TCP_CA_NAME_MAX)
+ nla_total_size(sizeof(struct tcpvegas_info))
@@ -147,6 +144,24 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
goto errout;
+ if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) ||
+ ext & (1 << (INET_DIAG_TCLASS - 1))) {
+ u32 classid = 0;
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+#endif
+ /* Fallback to socket priority if class id isn't set.
+ * Classful qdiscs use it as direct reference to class.
+ * For cgroup2 classid is always zero.
+ */
+ if (!classid)
+ classid = sk->sk_priority;
+
+ if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
+ goto errout;
+ }
+
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = sock_i_ino(sk);
@@ -156,26 +171,28 @@ errout:
}
EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
+#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
- struct sk_buff *skb, const struct inet_diag_req_v2 *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh,
- bool net_admin)
+ struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ u16 nlmsg_flags, bool net_admin)
{
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
+ struct inet_diag_dump_data *cb_data;
int ext = req->idiag_ext;
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
struct nlattr *attr;
void *info = NULL;
+ cb_data = cb->data;
handler = inet_diag_table[req->sdiag_protocol];
BUG_ON(!handler);
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
if (!nlh)
return -EMSGSIZE;
@@ -187,7 +204,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0;
r->idiag_retrans = 0;
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -284,22 +303,46 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto errout;
}
- if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) ||
- ext & (1 << (INET_DIAG_TCLASS - 1))) {
- u32 classid = 0;
-
-#ifdef CONFIG_SOCK_CGROUP_DATA
- classid = sock_cgroup_classid(&sk->sk_cgrp_data);
-#endif
- /* Fallback to socket priority if class id isn't set.
- * Classful qdiscs use it as direct reference to class.
- * For cgroup2 classid is always zero.
- */
- if (!classid)
- classid = sk->sk_priority;
+ /* Keep it at the end for potential retry with a larger skb,
+ * or else do best-effort fitting, which is only done for the
+ * first_nlmsg.
+ */
+ if (cb_data->bpf_stg_diag) {
+ bool first_nlmsg = ((unsigned char *)nlh == skb->data);
+ unsigned int prev_min_dump_alloc;
+ unsigned int total_nla_size = 0;
+ unsigned int msg_len;
+ int err;
+
+ msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
+ err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb,
+ INET_DIAG_SK_BPF_STORAGES,
+ &total_nla_size);
+
+ if (!err)
+ goto out;
+
+ total_nla_size += msg_len;
+ prev_min_dump_alloc = cb->min_dump_alloc;
+ if (total_nla_size > prev_min_dump_alloc)
+ cb->min_dump_alloc = min_t(u32, total_nla_size,
+ MAX_DUMP_ALLOC_SIZE);
+
+ if (!first_nlmsg)
+ goto errout;
- if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
+ if (cb->min_dump_alloc > prev_min_dump_alloc)
+ /* Retry with pskb_expand_head() with
+ * __GFP_DIRECT_RECLAIM
+ */
goto errout;
+
+ WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc);
+
+ /* Send what we have for this sk
+ * and move on to the next sk in the following
+ * dump()
+ */
}
out:
@@ -312,30 +355,19 @@ errout:
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
-static int inet_csk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- const struct inet_diag_req_v2 *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh,
- bool net_admin)
-{
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
- portid, seq, nlmsg_flags, unlh, net_admin);
-}
-
static int inet_twsk_diag_fill(struct sock *sk,
struct sk_buff *skb,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ struct netlink_callback *cb,
+ u16 nlmsg_flags)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
+ sizeof(*r), nlmsg_flags);
if (!nlh)
return -EMSGSIZE;
@@ -359,16 +391,16 @@ static int inet_twsk_diag_fill(struct sock *sk,
}
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh, bool net_admin)
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
{
struct request_sock *reqsk = inet_reqsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
if (!nlh)
return -EMSGSIZE;
@@ -397,21 +429,18 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
}
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh, bool net_admin)
+ u16 nlmsg_flags, bool net_admin)
{
if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh);
+ return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return inet_req_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh, net_admin);
+ return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
- return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh, net_admin);
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
+ net_admin);
}
struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -459,10 +488,10 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
- struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+ struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
+ struct sk_buff *in_skb = cb->skb;
bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
struct net *net = sock_net(in_skb->sk);
struct sk_buff *rep;
@@ -479,10 +508,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
goto out;
}
- err = sk_diag_fill(sk, rep, req,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh, net_admin);
+ err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
@@ -509,14 +535,21 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
int err;
handler = inet_diag_lock_handler(req->sdiag_protocol);
- if (IS_ERR(handler))
+ if (IS_ERR(handler)) {
err = PTR_ERR(handler);
- else if (cmd == SOCK_DIAG_BY_FAMILY)
- err = handler->dump_one(in_skb, nlh, req);
- else if (cmd == SOCK_DESTROY && handler->destroy)
+ } else if (cmd == SOCK_DIAG_BY_FAMILY) {
+ struct inet_diag_dump_data empty_dump_data = {};
+ struct netlink_callback cb = {
+ .nlh = nlh,
+ .skb = in_skb,
+ .data = &empty_dump_data,
+ };
+ err = handler->dump_one(&cb, req);
+ } else if (cmd == SOCK_DESTROY && handler->destroy) {
err = handler->destroy(in_skb, req);
- else
+ } else {
err = -EOPNOTSUPP;
+ }
inet_diag_unlock_handler(handler);
return err;
@@ -847,23 +880,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return len == 0 ? 0 : -EINVAL;
}
-static int inet_csk_diag_dump(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- const struct nlattr *bc,
- bool net_admin)
-{
- if (!inet_diag_bc_sk(bc, sk))
- return 0;
-
- return inet_csk_diag_fill(sk, skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
- net_admin);
-}
-
static void twsk_build_assert(void)
{
BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
@@ -892,14 +908,17 @@ static void twsk_build_assert(void)
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct inet_diag_dump_data *cb_data = cb->data;
struct net *net = sock_net(skb->sk);
u32 idiag_states = r->idiag_states;
int i, num, s_i, s_num;
+ struct nlattr *bc;
struct sock *sk;
+ bc = cb_data->inet_diag_nla_bc;
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1];
@@ -935,8 +954,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
- if (inet_csk_diag_dump(sk, skb, cb, r,
- bc, net_admin) < 0) {
+ if (!inet_diag_bc_sk(bc, sk))
+ goto next_listen;
+
+ if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
+ cb, r, NLM_F_MULTI,
+ net_admin) < 0) {
spin_unlock(&ilb->lock);
goto done;
}
@@ -1014,11 +1037,8 @@ next_normal:
res = 0;
for (idx = 0; idx < accum; idx++) {
if (res >= 0) {
- res = sk_diag_fill(sk_arr[idx], skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh, net_admin);
+ res = sk_diag_fill(sk_arr[idx], skb, cb, r,
+ NLM_F_MULTI, net_admin);
if (res < 0)
num = num_arr[idx];
}
@@ -1042,31 +1062,101 @@ out:
EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
const struct inet_diag_handler *handler;
+ u32 prev_min_dump_alloc;
int err = 0;
+again:
+ prev_min_dump_alloc = cb->min_dump_alloc;
handler = inet_diag_lock_handler(r->sdiag_protocol);
if (!IS_ERR(handler))
- handler->dump(skb, cb, r, bc);
+ handler->dump(skb, cb, r);
else
err = PTR_ERR(handler);
inet_diag_unlock_handler(handler);
+ /* The skb is not large enough to fit one sk info and
+ * inet_sk_diag_fill() has requested for a larger skb.
+ */
+ if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) {
+ err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL);
+ if (!err)
+ goto again;
+ }
+
return err ? : skb->len;
}
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- int hdrlen = sizeof(struct inet_diag_req_v2);
- struct nlattr *bc = NULL;
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh));
+}
+
+static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet_diag_dump_data *cb_data;
+ struct sk_buff *skb = cb->skb;
+ struct nlattr *nla;
+ int rem, err;
+
+ cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL);
+ if (!cb_data)
+ return -ENOMEM;
+
+ nla_for_each_attr(nla, nlmsg_attrdata(nlh, hdrlen),
+ nlmsg_attrlen(nlh, hdrlen), rem) {
+ int type = nla_type(nla);
- if (nlmsg_attrlen(cb->nlh, hdrlen))
- bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+ if (type < __INET_DIAG_REQ_MAX)
+ cb_data->req_nlas[type] = nla;
+ }
+
+ nla = cb_data->inet_diag_nla_bc;
+ if (nla) {
+ err = inet_diag_bc_audit(nla, skb);
+ if (err) {
+ kfree(cb_data);
+ return err;
+ }
+ }
+
+ nla = cb_data->inet_diag_nla_bpf_stgs;
+ if (nla) {
+ struct bpf_sk_storage_diag *bpf_stg_diag;
- return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
+ bpf_stg_diag = bpf_sk_storage_diag_alloc(nla);
+ if (IS_ERR(bpf_stg_diag)) {
+ kfree(cb_data);
+ return PTR_ERR(bpf_stg_diag);
+ }
+ cb_data->bpf_stg_diag = bpf_stg_diag;
+ }
+
+ cb->data = cb_data;
+ return 0;
+}
+
+static int inet_diag_dump_start(struct netlink_callback *cb)
+{
+ return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2));
+}
+
+static int inet_diag_dump_start_compat(struct netlink_callback *cb)
+{
+ return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req));
+}
+
+static int inet_diag_dump_done(struct netlink_callback *cb)
+{
+ struct inet_diag_dump_data *cb_data = cb->data;
+
+ bpf_sk_storage_diag_free(cb_data->bpf_stg_diag);
+ kfree(cb->data);
+
+ return 0;
}
static int inet_diag_type2proto(int type)
@@ -1085,9 +1175,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
struct netlink_callback *cb)
{
struct inet_diag_req *rc = nlmsg_data(cb->nlh);
- int hdrlen = sizeof(struct inet_diag_req);
struct inet_diag_req_v2 req;
- struct nlattr *bc = NULL;
req.sdiag_family = AF_UNSPEC; /* compatibility */
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
@@ -1095,10 +1183,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
req.idiag_states = rc->idiag_states;
req.id = rc->id;
- if (nlmsg_attrlen(cb->nlh, hdrlen))
- bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
-
- return __inet_diag_dump(skb, cb, &req, bc);
+ return __inet_diag_dump(skb, cb, &req);
}
static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
@@ -1126,22 +1211,12 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
- if (nlmsg_attrlen(nlh, hdrlen)) {
- struct nlattr *attr;
- int err;
-
- attr = nlmsg_find_attr(nlh, hdrlen,
- INET_DIAG_REQ_BYTECODE);
- err = inet_diag_bc_audit(attr, skb);
- if (err)
- return err;
- }
- {
- struct netlink_dump_control c = {
- .dump = inet_diag_dump_compat,
- };
- return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
- }
+ struct netlink_dump_control c = {
+ .start = inet_diag_dump_start_compat,
+ .done = inet_diag_dump_done,
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
}
return inet_diag_get_exact_compat(skb, nlh);
@@ -1157,22 +1232,12 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
h->nlmsg_flags & NLM_F_DUMP) {
- if (nlmsg_attrlen(h, hdrlen)) {
- struct nlattr *attr;
- int err;
-
- attr = nlmsg_find_attr(h, hdrlen,
- INET_DIAG_REQ_BYTECODE);
- err = inet_diag_bc_audit(attr, skb);
- if (err)
- return err;
- }
- {
- struct netlink_dump_control c = {
- .dump = inet_diag_dump,
- };
- return netlink_dump_start(net->diag_nlsk, skb, h, &c);
- }
+ struct netlink_dump_control c = {
+ .start = inet_diag_dump_start,
+ .done = inet_diag_dump_done,
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
}
return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8274f98c511c..029b24eeafba 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1153,6 +1153,24 @@ static int ipgre_netlink_parms(struct net_device *dev,
if (data[IFLA_GRE_FWMARK])
*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+ return 0;
+}
+
+static int erspan_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm *parms,
+ __u32 *fwmark)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
+
+ err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
+ if (err)
+ return err;
+ if (!data)
+ return 0;
+
if (data[IFLA_GRE_ERSPAN_VER]) {
t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
@@ -1276,45 +1294,70 @@ static void ipgre_tap_setup(struct net_device *dev)
ip_tunnel_setup(dev, gre_tap_net_id);
}
-static int ipgre_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
- struct netlink_ext_ack *extack)
+static int
+ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
{
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
- __u32 fwmark = 0;
- int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
- err = ip_tunnel_encap_setup(t, &ipencap);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
+ return 0;
+}
+
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel_parm p;
+ __u32 fwmark = 0;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
return err;
return ip_tunnel_newlink(dev, tb, &p, fwmark);
}
+static int erspan_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel_parm p;
+ __u32 fwmark = 0;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err)
+ return err;
+ return ip_tunnel_newlink(dev, tb, &p, fwmark);
+}
+
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_encap ipencap;
__u32 fwmark = t->fwmark;
struct ip_tunnel_parm p;
int err;
- if (ipgre_netlink_encap_parms(data, &ipencap)) {
- err = ip_tunnel_encap_setup(t, &ipencap);
-
- if (err < 0)
- return err;
- }
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
@@ -1327,8 +1370,34 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
t->parms.i_flags = p.i_flags;
t->parms.o_flags = p.o_flags;
- if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
- ipgre_link_update(dev, !tb[IFLA_MTU]);
+ ipgre_link_update(dev, !tb[IFLA_MTU]);
+
+ return 0;
+}
+
+static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ __u32 fwmark = t->fwmark;
+ struct ip_tunnel_parm p;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err < 0)
+ return err;
+
+ err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+ if (err < 0)
+ return err;
+
+ t->parms.i_flags = p.i_flags;
+ t->parms.o_flags = p.o_flags;
return 0;
}
@@ -1519,8 +1588,8 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = {
.priv_size = sizeof(struct ip_tunnel),
.setup = erspan_setup,
.validate = erspan_validate,
- .newlink = ipgre_newlink,
- .changelink = ipgre_changelink,
+ .newlink = erspan_newlink,
+ .changelink = erspan_changelink,
.dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index aa438c6758a7..b0c244af1e4d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -509,7 +509,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index d84819893db9..090d3097ee15 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
* insufficent MTU.
*/
features = netif_skb_features(skb);
- BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
@@ -333,7 +333,7 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk,
switch (ret) {
case NET_XMIT_CN:
do_cn = true;
- /* fall through */
+ fallthrough;
case NET_XMIT_SUCCESS:
break;
default:
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 74e1d964a615..cd4b84310d92 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -142,11 +142,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
cand = t;
}
- if (flags & TUNNEL_NO_KEY)
- goto skip_key_lookup;
-
hlist_for_each_entry_rcu(t, head, hash_node) {
- if (t->parms.i_key != key ||
+ if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
t->parms.iph.saddr != 0 ||
t->parms.iph.daddr != 0 ||
!(t->dev->flags & IFF_UP))
@@ -158,7 +155,6 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
cand = t;
}
-skip_key_lookup:
if (cand)
return cand;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 47f8b947eef1..181b7a2a0247 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -432,7 +432,7 @@ static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
return ip_tun_parse_opts(attr, info, extack);
}
-static int ip_tun_build_state(struct nlattr *attr,
+static int ip_tun_build_state(struct net *net, struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
@@ -719,7 +719,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
[LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
};
-static int ip6_tun_build_state(struct nlattr *attr,
+static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 37cddd18f282..1b4e6f298648 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -187,17 +187,39 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
int mtu;
if (!dst) {
- struct rtable *rt;
-
- fl->u.ip4.flowi4_oif = dev->ifindex;
- fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
- rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
- if (IS_ERR(rt)) {
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ skb_dst_set(skb, dst);
+ break;
+#endif
+ default:
dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
}
- dst = &rt->dst;
- skb_dst_set(skb, dst);
}
dst_hold(dst);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 4438f6b12335..561f15b5a944 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1621,7 +1621,7 @@ late_initcall(ip_auto_config);
/*
* Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
- * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
+ * command line parameter. See Documentation/admin-guide/nfs/nfsroot.rst.
*/
static int __init ic_proto_name(char *name)
{
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 6e68def66822..9cf83cc85e4a 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1465,7 +1465,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
case MRT_ADD_MFC:
case MRT_DEL_MFC:
parent = -1;
- /* fall through */
+ fallthrough;
case MRT_ADD_MFC_PROXY:
case MRT_DEL_MFC_PROXY:
if (optlen != sizeof(mfc)) {
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f1f78a742b36..b167f4a5b684 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1057,7 +1057,7 @@ struct compat_arpt_replace {
u32 underflow[NF_ARP_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters;
- struct compat_arpt_entry entries[0];
+ struct compat_arpt_entry entries[];
};
static inline void compat_release_entry(struct compat_arpt_entry *e)
@@ -1383,7 +1383,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
struct compat_arpt_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_arpt_entry entrytable[0];
+ struct compat_arpt_entry entrytable[];
};
static int compat_get_entries(struct net *net,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 10b91ebdf213..c2670eaa74e6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1211,7 +1211,7 @@ struct compat_ipt_replace {
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct xt_counters * */
- struct compat_ipt_entry entries[0];
+ struct compat_ipt_entry entries[];
};
static int
@@ -1562,7 +1562,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
struct compat_ipt_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ipt_entry entrytable[0];
+ struct compat_ipt_entry entrytable[];
};
static int
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 4b2d49cc9f1a..0c72156130b6 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -173,7 +173,7 @@ static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
case ICMP_REDIRECT:
/* Max length: 24 "GATEWAY=255.255.255.255 " */
nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
- /* Fall through */
+ fallthrough;
case ICMP_DEST_UNREACH:
case ICMP_SOURCE_QUENCH:
case ICMP_TIME_EXCEEDED:
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index b2aeb7bf5dac..3c25a467b3ef 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -168,7 +168,7 @@ pptp_outbound_pkt(struct sk_buff *skb,
pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
pptp_msg_name[0]);
- /* fall through */
+ fallthrough;
case PPTP_SET_LINK_INFO:
/* only need to NAT in case PAC is behind NAT box */
case PPTP_START_SESSION_REQUEST:
@@ -271,7 +271,7 @@ pptp_inbound_pkt(struct sk_buff *skb,
pr_debug("unknown inbound packet %s\n",
msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
pptp_msg_name[0]);
- /* fall through */
+ fallthrough;
case PPTP_START_SESSION_REQUEST:
case PPTP_START_SESSION_REPLY:
case PPTP_STOP_SESSION_REQUEST:
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index d072c326dd64..fdfca534d094 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1327,7 +1327,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
case AF_UNSPEC:
if (tb[NHA_GROUP])
break;
- /* fallthrough */
+ fallthrough;
default:
NL_SET_ERR_MSG(extack, "Invalid address family");
goto out;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2580303249e2..75545a829a2b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -32,6 +32,7 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <linux/bottom_half.h>
@@ -485,6 +486,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
offsetof(struct ipstats_mib, syncp)));
seq_putc(seq, '\n');
+ mptcp_seq_show(seq);
return 0;
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 3183413ebc6c..47665919048f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1034,6 +1034,7 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
}
void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&h->lock)
{
struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
@@ -1056,6 +1057,7 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(raw_seq_next);
void raw_seq_stop(struct seq_file *seq, void *v)
+ __releases(&h->lock)
{
struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index e35736b99300..1b5b8af27aaf 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -87,31 +87,30 @@ out_unlock:
return sk ? sk : ERR_PTR(-ENOENT);
}
-static int raw_diag_dump_one(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+static int raw_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
- struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *in_skb = cb->skb;
struct sk_buff *rep;
struct sock *sk;
+ struct net *net;
int err;
+ net = sock_net(in_skb->sk);
sk = raw_sock_get(net, r);
if (IS_ERR(sk))
return PTR_ERR(sk);
- rep = nlmsg_new(sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) + 64,
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
GFP_KERNEL);
if (!rep) {
sock_put(sk);
return -ENOMEM;
}
- err = inet_sk_diag_fill(sk, NULL, rep, r,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh,
+ err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0,
netlink_net_capable(in_skb, CAP_NET_ADMIN));
sock_put(sk);
@@ -136,25 +135,25 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
if (!inet_diag_bc_sk(bc, sk))
return 0;
- return inet_sk_diag_fill(sk, NULL, skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh, net_admin);
+ return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
}
static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk);
+ struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
struct sock *sk = NULL;
+ struct nlattr *bc;
if (IS_ERR(hashinfo))
return;
+ cb_data = cb->data;
+ bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ebe7060d0fc9..788c69d9bfe0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1621,12 +1621,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
struct rtable *rt_dst_alloc(struct net_device *dev,
unsigned int flags, u16 type,
- bool nopolicy, bool noxfrm, bool will_cache)
+ bool nopolicy, bool noxfrm)
{
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : DST_HOST) |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
@@ -1674,7 +1673,6 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
new_rt->rt_gw6 = rt->rt_gw6;
INIT_LIST_HEAD(&new_rt->rt_uncached);
- new_rt->dst.flags |= DST_HOST;
new_rt->dst.input = rt->dst.input;
new_rt->dst.output = rt->dst.output;
new_rt->dst.error = rt->dst.error;
@@ -1734,7 +1732,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
flags |= RTCF_LOCAL;
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
return -ENOBUFS;
@@ -1851,7 +1849,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth = rt_dst_alloc(out_dev->dev, 0, res->type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
+ IN_DEV_CONF_GET(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
@@ -2219,7 +2217,7 @@ local_input:
rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
flags | RTCF_LOCAL, res->type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
+ IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
@@ -2443,8 +2441,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
add:
rth = rt_dst_alloc(dev_out, flags, type,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(in_dev, NOXFRM),
- do_cache);
+ IN_DEV_CONF_GET(in_dev, NOXFRM));
if (!rth)
return ERR_PTR(-ENOBUFS);
@@ -2774,6 +2771,54 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
+struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net, __be32 *saddr,
+ const struct ip_tunnel_info *info,
+ u8 protocol, bool use_cache)
+{
+#ifdef CONFIG_DST_CACHE
+ struct dst_cache *dst_cache;
+#endif
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+ __u8 tos;
+
+#ifdef CONFIG_DST_CACHE
+ dst_cache = (struct dst_cache *)&info->dst_cache;
+ if (use_cache) {
+ rt = dst_cache_get_ip4(dst_cache, saddr);
+ if (rt)
+ return rt;
+ }
+#endif
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_proto = protocol;
+ fl4.daddr = info->key.u.ipv4.dst;
+ fl4.saddr = info->key.u.ipv4.src;
+ tos = info->key.tos;
+ fl4.flowi4_tos = RT_TOS(tos);
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (rt->dst.dev == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
+ ip_rt_put(rt);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (use_cache)
+ dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+#endif
+ *saddr = fl4.saddr;
+ return rt;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
+
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
struct rtable *rt, u32 table_id, struct flowi4 *fl4,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 9684af02e0a5..81b267e990a1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -555,18 +555,6 @@ static struct ctl_table ipv4_table[] = {
},
#endif /* CONFIG_NETLABEL */
{
- .procname = "tcp_available_congestion_control",
- .maxlen = TCP_CA_BUF_MAX,
- .mode = 0444,
- .proc_handler = proc_tcp_available_congestion_control,
- },
- {
- .procname = "tcp_allowed_congestion_control",
- .maxlen = TCP_CA_BUF_MAX,
- .mode = 0644,
- .proc_handler = proc_allowed_congestion_control,
- },
- {
.procname = "tcp_available_ulp",
.maxlen = TCP_ULP_BUF_MAX,
.mode = 0444,
@@ -776,6 +764,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_autobind_reuse",
+ .data = &init_net.ipv4.sysctl_ip_autobind_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "fwmark_reflect",
.data = &init_net.ipv4.sysctl_fwmark_reflect,
.maxlen = sizeof(int),
@@ -886,6 +883,18 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_tcp_congestion_control,
},
{
+ .procname = "tcp_available_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_tcp_available_congestion_control,
+ },
+ {
+ .procname = "tcp_allowed_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0644,
+ .proc_handler = proc_allowed_congestion_control,
+ },
+ {
.procname = "tcp_keepalive_time",
.data = &init_net.ipv4.sysctl_tcp_keepalive_time,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index eb2d80519f8e..6d87de434377 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2251,7 +2251,7 @@ void tcp_set_state(struct sock *sk, int state)
if (inet_csk(sk)->icsk_bind_hash &&
!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
inet_put_port(sk);
- /* fall through */
+ fallthrough;
default:
if (oldstate == TCP_ESTABLISHED)
TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
@@ -2948,8 +2948,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EPERM;
else if (tp->repair_queue == TCP_SEND_QUEUE)
WRITE_ONCE(tp->write_seq, val);
- else if (tp->repair_queue == TCP_RECV_QUEUE)
+ else if (tp->repair_queue == TCP_RECV_QUEUE) {
WRITE_ONCE(tp->rcv_nxt, val);
+ WRITE_ONCE(tp->copied_seq, val);
+ }
else
err = -EINVAL;
break;
@@ -3344,6 +3346,7 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
0;
}
@@ -3399,6 +3402,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+ nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
+ max_t(int, 0, tp->write_seq - tp->snd_nxt));
return stats;
}
@@ -3667,13 +3672,35 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
if (get_user(len, optlen))
return -EFAULT;
- if (len != sizeof(zc))
+ if (len < offsetofend(struct tcp_zerocopy_receive, length))
return -EINVAL;
+ if (len > sizeof(zc)) {
+ len = sizeof(zc);
+ if (put_user(len, optlen))
+ return -EFAULT;
+ }
if (copy_from_user(&zc, optval, len))
return -EFAULT;
lock_sock(sk);
err = tcp_zerocopy_receive(sk, &zc);
release_sock(sk);
+ if (len == sizeof(zc))
+ goto zerocopy_rcv_sk_err;
+ switch (len) {
+ case offsetofend(struct tcp_zerocopy_receive, err):
+ goto zerocopy_rcv_sk_err;
+ case offsetofend(struct tcp_zerocopy_receive, inq):
+ goto zerocopy_rcv_inq;
+ case offsetofend(struct tcp_zerocopy_receive, length):
+ default:
+ goto zerocopy_rcv_out;
+ }
+zerocopy_rcv_sk_err:
+ if (!err)
+ zc.err = sock_error(sk);
+zerocopy_rcv_inq:
+ zc.inq = tcp_inq_hint(sk);
+zerocopy_rcv_out:
if (!err && copy_to_user(optval, &zc, len))
err = -EFAULT;
return err;
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 645cc3009e64..f5f588b1f6e9 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -145,12 +145,13 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
- else {
- bictcp_update(ca, tp->snd_cwnd);
- tcp_cong_avoid_ai(tp, ca->cnt, 1);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
}
+ bictcp_update(ca, tp->snd_cwnd);
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
/*
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 8a01428f80c1..5a05327f97c1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -10,38 +10,6 @@
#include <net/inet_common.h>
#include <net/tls.h>
-static bool tcp_bpf_stream_read(const struct sock *sk)
-{
- struct sk_psock *psock;
- bool empty = true;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock))
- empty = list_empty(&psock->ingress_msg);
- rcu_read_unlock();
- return !empty;
-}
-
-static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
- int flags, long timeo, int *err)
-{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- int ret = 0;
-
- if (!timeo)
- return ret;
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- ret = sk_wait_event(sk, &timeo,
- !list_empty(&psock->ingress_msg) ||
- !skb_queue_empty(&sk->sk_receive_queue), &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
- return ret;
-}
-
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
struct msghdr *msg, int len, int flags)
{
@@ -115,49 +83,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
}
EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
-int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len)
-{
- struct sk_psock *psock;
- int copied, ret;
-
- psock = sk_psock_get(sk);
- if (unlikely(!psock))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len, addr_len);
- if (!skb_queue_empty(&sk->sk_receive_queue) &&
- sk_psock_queue_empty(psock))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- lock_sock(sk);
-msg_bytes_ready:
- copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
- if (!copied) {
- int data, err = 0;
- long timeo;
-
- timeo = sock_rcvtimeo(sk, nonblock);
- data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
- if (data) {
- if (!sk_psock_queue_empty(psock))
- goto msg_bytes_ready;
- release_sock(sk);
- sk_psock_put(sk, psock);
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- }
- if (err) {
- ret = err;
- goto out;
- }
- copied = -EAGAIN;
- }
- ret = copied;
-out:
- release_sock(sk);
- sk_psock_put(sk, psock);
- return ret;
-}
-
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, u32 apply_bytes, int flags)
{
@@ -298,6 +223,82 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
}
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+#ifdef CONFIG_BPF_STREAM_PARSER
+/* Tell the caller whether the psock attached to @sk has ingress messages
+ * queued.  A socket with no psock is reported as having nothing to read.
+ * The psock is only looked at inside an RCU read-side section.
+ */
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+	struct sk_psock *psock;
+	bool has_ingress = false;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock))
+		has_ingress = !list_empty(&psock->ingress_msg);
+	rcu_read_unlock();
+
+	return has_ingress;
+}
+
+/* Sleep until either the psock ingress list or the socket receive queue
+ * is non-empty, or until @timeo runs out.
+ *
+ * @sk:    socket to wait on
+ * @psock: psock whose ingress_msg list is watched
+ * @flags: not used by this helper
+ * @timeo: wait budget; zero means do not wait at all
+ * @err:   not written by this helper
+ *
+ * Returns 0 straight away when @timeo is zero, otherwise the result of
+ * sk_wait_event(); the caller treats non-zero as "something is queued".
+ */
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+			     int flags, long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(waiter, woken_wake_function);
+	int woken;
+
+	if (timeo == 0)
+		return 0;
+
+	add_wait_queue(sk_sleep(sk), &waiter);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	woken = sk_wait_event(sk, &timeo,
+			      !(list_empty(&psock->ingress_msg) &&
+				skb_queue_empty(&sk->sk_receive_queue)),
+			      &waiter);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &waiter);
+
+	return woken;
+}
+
+/* recvmsg handler for TCP sockets that may carry a psock.
+ *
+ * Messages queued on the psock ingress list are served through
+ * __tcp_bpf_recvmsg().  Everything else is handed to tcp_recvmsg(), or
+ * to inet_recv_error() when MSG_ERRQUEUE is set.  If nothing is queued,
+ * the function waits as long as sock_rcvtimeo() allows and then retries
+ * or falls back to tcp_recvmsg().
+ *
+ * Returns the value of whichever receive helper ended up being used, or
+ * -EAGAIN when no data showed up before the wait ended.
+ *
+ * Every exit path after a successful sk_psock_get() drops that reference
+ * with sk_psock_put(), including the two early hand-offs below.
+ */
+static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			   int nonblock, int flags, int *addr_len)
+{
+	struct sk_psock *psock;
+	int copied, ret;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+	if (unlikely(flags & MSG_ERRQUEUE)) {
+		/* The psock is not used for error-queue reads: release it. */
+		sk_psock_put(sk, psock);
+		return inet_recv_error(sk, msg, len, addr_len);
+	}
+	if (!skb_queue_empty(&sk->sk_receive_queue) &&
+	    sk_psock_queue_empty(psock)) {
+		/* Only plain TCP data is pending: release before hand-off. */
+		sk_psock_put(sk, psock);
+		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+	}
+	lock_sock(sk);
+msg_bytes_ready:
+	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+	if (!copied) {
+		int data, err = 0;
+		long timeo;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+		if (data) {
+			/* Woken with data: psock messages first, else TCP. */
+			if (!sk_psock_queue_empty(psock))
+				goto msg_bytes_ready;
+			release_sock(sk);
+			sk_psock_put(sk, psock);
+			return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+		}
+		if (err) {
+			ret = err;
+			goto out;
+		}
+		copied = -EAGAIN;
+	}
+	ret = copied;
+out:
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return ret;
+}
+
static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, int *copied, int flags)
{
@@ -528,57 +529,6 @@ out_err:
return copied ? copied : err;
}
-static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
-{
- struct sk_psock_link *link;
-
- while ((link = sk_psock_link_pop(psock))) {
- sk_psock_unlink(sk, link);
- sk_psock_free_link(link);
- }
-}
-
-static void tcp_bpf_unhash(struct sock *sk)
-{
- void (*saved_unhash)(struct sock *sk);
- struct sk_psock *psock;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- if (sk->sk_prot->unhash)
- sk->sk_prot->unhash(sk);
- return;
- }
-
- saved_unhash = psock->saved_unhash;
- tcp_bpf_remove(sk, psock);
- rcu_read_unlock();
- saved_unhash(sk);
-}
-
-static void tcp_bpf_close(struct sock *sk, long timeout)
-{
- void (*saved_close)(struct sock *sk, long timeout);
- struct sk_psock *psock;
-
- lock_sock(sk);
- rcu_read_lock();
- psock = sk_psock(sk);
- if (unlikely(!psock)) {
- rcu_read_unlock();
- release_sock(sk);
- return sk->sk_prot->close(sk, timeout);
- }
-
- saved_close = psock->saved_close;
- tcp_bpf_remove(sk, psock);
- rcu_read_unlock();
- release_sock(sk);
- saved_close(sk, timeout);
-}
-
enum {
TCP_BPF_IPV4,
TCP_BPF_IPV6,
@@ -599,8 +549,8 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
struct proto *base)
{
prot[TCP_BPF_BASE] = *base;
- prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash;
- prot[TCP_BPF_BASE].close = tcp_bpf_close;
+ prot[TCP_BPF_BASE].unhash = sock_map_unhash;
+ prot[TCP_BPF_BASE].close = sock_map_close;
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
@@ -629,26 +579,6 @@ static int __init tcp_bpf_v4_build_proto(void)
}
core_initcall(tcp_bpf_v4_build_proto);
-static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
-{
- int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
- int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
-
- sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
-}
-
-static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
-{
- int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
- int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
-
- /* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
- * or added requiring sk_prot hook updates. We keep original saved
- * hooks in this case.
- */
- sk->sk_prot = &tcp_bpf_prots[family][config];
-}
-
static int tcp_bpf_assert_proto_ops(struct proto *ops)
{
/* In order to avoid retpoline, we make assumptions when we call
@@ -660,34 +590,34 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
}
-void tcp_bpf_reinit(struct sock *sk)
+struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
{
- struct sk_psock *psock;
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
- sock_owned_by_me(sk);
+ if (!psock->sk_proto) {
+ struct proto *ops = READ_ONCE(sk->sk_prot);
- rcu_read_lock();
- psock = sk_psock(sk);
- tcp_bpf_reinit_sk_prot(sk, psock);
- rcu_read_unlock();
+ if (tcp_bpf_assert_proto_ops(ops))
+ return ERR_PTR(-EINVAL);
+
+ tcp_bpf_check_v6_needs_rebuild(sk, ops);
+ }
+
+ return &tcp_bpf_prots[family][config];
}
-int tcp_bpf_init(struct sock *sk)
+/* If a child got cloned from a listening socket that had tcp_bpf
+ * protocol callbacks installed, we need to restore the callbacks to
+ * the default ones because the child does not inherit the psock state
+ * that tcp_bpf callbacks expect.
+ */
+void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
{
- struct proto *ops = READ_ONCE(sk->sk_prot);
- struct sk_psock *psock;
-
- sock_owned_by_me(sk);
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ struct proto *prot = newsk->sk_prot;
- rcu_read_lock();
- psock = sk_psock(sk);
- if (unlikely(!psock || psock->sk_proto ||
- tcp_bpf_assert_proto_ops(ops))) {
- rcu_read_unlock();
- return -EINVAL;
- }
- tcp_bpf_check_v6_needs_rebuild(sk, ops);
- tcp_bpf_update_sk_prot(sk, psock);
- rcu_read_unlock();
- return 0;
+ if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE])
+ newsk->sk_prot = sk->sk_prot_creator;
}
+#endif /* CONFIG_BPF_STREAM_PARSER */
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 0d08f9e2d8d0..75a1c985f49a 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -179,15 +179,15 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
+ inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r);
}
-static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static int tcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
+ return inet_diag_dump_one_icsk(&tcp_hashinfo, cb, req);
}
#ifdef CONFIG_INET_DIAG_DESTROY
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 316ebdf8151d..bf4ced9273e8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2865,7 +2865,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
(*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
- /* fall through */
+ fallthrough;
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
@@ -6124,7 +6124,11 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
struct request_sock *req;
- tcp_try_undo_loss(sk, false);
+ /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
+ * undo. If peer SACKs triggered fast recovery, we can't undo here.
+ */
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+ tcp_try_undo_loss(sk, false);
/* Reset rtx states to prevent spurious retransmits_timed_out() */
tcp_sk(sk)->retrans_stamp = 0;
@@ -6363,7 +6367,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
mptcp_incoming_options(sk, skb, &tp->rx_opt);
break;
}
- /* fall through */
+ fallthrough;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
@@ -6378,7 +6382,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 1;
}
}
- /* Fall through */
+ fallthrough;
case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index df1166b76126..83a5d24e13b8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1019,7 +1019,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
if (!md5sig)
return NULL;
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
if (key->l3index && key->l3index != l3index)
@@ -1064,7 +1065,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
if (family == AF_INET6)
size = sizeof(struct in6_addr);
#endif
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
if (key->l3index && key->l3index != l3index)
@@ -2070,7 +2072,7 @@ do_time_wait:
}
}
/* to ACK */
- /* fall through */
+ fallthrough;
case TCP_TW_ACK:
tcp_v4_timewait_ack(sk, skb);
break;
@@ -2366,7 +2368,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
break;
st->bucket = 0;
st->state = TCP_SEQ_STATE_ESTABLISHED;
- /* Fallthrough */
+ fallthrough;
case TCP_SEQ_STATE_ESTABLISHED:
if (st->bucket > tcp_hashinfo.ehash_mask)
break;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ad3b56d9fa71..7e40322cc5ec 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -548,6 +548,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
+ tcp_bpf_clone(sk, newsk);
+
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
return newsk;
@@ -772,6 +774,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child)
goto listen_overflow;
+ if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) {
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ return child;
+ }
+
sock_rps_save_rxhash(child, skb);
tcp_synack_rtt_meas(child, req);
*req_stolen = !own_req;
@@ -817,6 +825,7 @@ EXPORT_SYMBOL(tcp_check_req);
int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb)
+ __releases(&((child)->sk_lock.slock))
{
int ret = 0;
int state = child->sk_state;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 306e25d743e8..2f45cde168c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1109,6 +1109,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(!skb))
return -ENOBUFS;
+ /* retransmit skbs might have a non zero value in skb->dev
+ * because skb->dev is aliased with skb->rbnode.rb_left
+ */
+ skb->dev = NULL;
}
inet = inet_sk(sk);
@@ -3037,8 +3041,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
tcp_skb_tsorted_save(skb) {
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
- -ENOBUFS;
+ if (nskb) {
+ nskb->dev = NULL;
+ err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
+ } else {
+ err = -ENOBUFS;
+ }
} tcp_skb_tsorted_restore(skb);
if (!err) {
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 471571e1ab26..6cebf412d590 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -10,10 +10,9 @@
#include <net/tcp.h>
/* These factors derived from the recommended values in the aer:
- * .01 and and 7/8. We use 50 instead of 100 to account for
- * delayed ack.
+ * .01 and 7/8.
*/
-#define TCP_SCALABLE_AI_CNT 50U
+#define TCP_SCALABLE_AI_CNT 100U
#define TCP_SCALABLE_MD_SCALE 3
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -23,11 +22,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
- else
- tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
- 1);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ acked);
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 38d3ad141161..7c27aa629af1 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -22,7 +22,8 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
{
struct tcp_ulp_ops *e;
- list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
+ list_for_each_entry_rcu(e, &tcp_ulp_list, list,
+ lockdep_is_held(&tcp_ulp_list_lock)) {
if (strcmp(e->name, name) == 0)
return e;
}
@@ -104,12 +105,6 @@ void tcp_update_ulp(struct sock *sk, struct proto *proto,
{
struct inet_connection_sock *icsk = inet_csk(sk);
- if (!icsk->icsk_ulp_ops) {
- sk->sk_write_space = write_space;
- sk->sk_prot = proto;
- return;
- }
-
if (icsk->icsk_ulp_ops->update)
icsk->icsk_ulp_ops->update(sk, proto, write_space);
}
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 3b36bb1a0dda..50a9a6e2c4cd 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -153,31 +153,34 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
if (tcp_in_slow_start(tp)) {
- /* Slow start. */
- tcp_slow_start(tp, acked);
+ /* Slow start. */
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto done;
+ }
+
+ /* Congestion avoidance. */
+ if (veno->diff < beta) {
+ /* In the "non-congestive state", increase cwnd
+ * every rtt.
+ */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
} else {
- /* Congestion avoidance. */
- if (veno->diff < beta) {
- /* In the "non-congestive state", increase cwnd
- * every rtt.
- */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
- } else {
- /* In the "congestive state", increase cwnd
- * every other rtt.
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (veno->inc &&
- tp->snd_cwnd < tp->snd_cwnd_clamp) {
- tp->snd_cwnd++;
- veno->inc = 0;
- } else
- veno->inc = 1;
- tp->snd_cwnd_cnt = 0;
+ /* In the "congestive state", increase cwnd
+ * every other rtt.
+ */
+ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+ if (veno->inc &&
+ tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ tp->snd_cwnd++;
+ veno->inc = 0;
} else
- tp->snd_cwnd_cnt++;
- }
+ veno->inc = 1;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt += acked;
}
+done:
if (tp->snd_cwnd < 2)
tp->snd_cwnd = 2;
else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index e00570dd0a69..3bb448761ca3 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -36,8 +36,6 @@ struct yeah {
u32 reno_count;
u32 fast_count;
-
- u32 pkts_acked;
};
static void tcp_yeah_init(struct sock *sk)
@@ -57,18 +55,6 @@ static void tcp_yeah_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void tcp_yeah_pkts_acked(struct sock *sk,
- const struct ack_sample *sample)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct yeah *yeah = inet_csk_ca(sk);
-
- if (icsk->icsk_ca_state == TCP_CA_Open)
- yeah->pkts_acked = sample->pkts_acked;
-
- tcp_vegas_pkts_acked(sk, sample);
-}
-
static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -77,24 +63,19 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto do_vegas;
+ }
- else if (!yeah->doing_reno_now) {
+ if (!yeah->doing_reno_now) {
/* Scalable */
-
- tp->snd_cwnd_cnt += yeah->pkts_acked;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
-
- yeah->pkts_acked = 1;
-
+ tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
+ acked);
} else {
/* Reno */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
@@ -118,7 +99,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* of bytes we send in an RTT is often less than our cwnd will allow.
* So we keep track of our cwnd separately, in v_beg_snd_cwnd.
*/
-
+do_vegas:
if (after(ack, yeah->vegas.beg_snd_nxt)) {
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
@@ -232,7 +213,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
.get_info = tcp_vegas_get_info,
- .pkts_acked = tcp_yeah_pkts_acked,
+ .pkts_acked = tcp_vegas_pkts_acked,
.owner = THIS_MODULE,
.name = "yeah",
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index db76b9609299..32564b350823 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1671,10 +1671,11 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
error = -EAGAIN;
do {
spin_lock_bh(&queue->lock);
- skb = __skb_try_recv_from_queue(sk, queue, flags,
- udp_skb_destructor,
- off, err, &last);
+ skb = __skb_try_recv_from_queue(sk, queue, flags, off,
+ err, &last);
if (skb) {
+ if (!(flags & MSG_PEEK))
+ udp_skb_destructor(sk, skb);
spin_unlock_bh(&queue->lock);
return skb;
}
@@ -1692,9 +1693,10 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, queue);
- skb = __skb_try_recv_from_queue(sk, queue, flags,
- udp_skb_dtor_locked,
- off, err, &last);
+ skb = __skb_try_recv_from_queue(sk, queue, flags, off,
+ err, &last);
+ if (skb && !(flags & MSG_PEEK))
+ udp_skb_dtor_locked(sk, skb);
spin_unlock(&sk_queue->lock);
spin_unlock_bh(&queue->lock);
if (skb)
@@ -1857,8 +1859,12 @@ int __udp_disconnect(struct sock *sk, int flags)
inet->inet_dport = 0;
sock_rps_reset_rxhash(sk);
sk->sk_bound_dev_if = 0;
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
inet_reset_saddr(sk);
+ if (sk->sk_prot->rehash &&
+ (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ sk->sk_prot->rehash(sk);
+ }
if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
sk->sk_prot->unhash(sk);
@@ -2103,7 +2109,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (likely(!udp_unexpected_gso(sk, skb)))
return udp_queue_rcv_one_skb(sk, skb);
- BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
__skb_push(skb, -skb_mac_offset(skb));
segs = udp_rcv_segment(sk, skb, true);
skb_list_walk_safe(segs, skb, next) {
@@ -2282,6 +2288,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
+ bool refcounted;
/*
* Validate the packet.
@@ -2307,7 +2314,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -2316,7 +2323,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
@@ -2557,7 +2565,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
case UDP_ENCAP_ESPINUDP_NON_IKE:
up->encap_rcv = xfrm4_udp_encap_rcv;
#endif
- /* FALLTHROUGH */
+ fallthrough;
case UDP_ENCAP_L2TPINUDP:
up->encap_type = val;
lock_sock(sk);
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
new file mode 100644
index 000000000000..eddd973e6575
--- /dev/null
+++ b/net/ipv4/udp_bpf.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */
+
+#include <linux/skmsg.h>
+#include <net/sock.h>
+#include <net/udp.h>
+
+enum {
+ UDP_BPF_IPV4, /* udp_bpf_prots[] slot built from udp_prot by the initcall */
+ UDP_BPF_IPV6, /* udp_bpf_prots[] slot rebuilt on demand for AF_INET6 sockets */
+ UDP_BPF_NUM_PROTS, /* slot count; sizes udp_bpf_prots[] */
+};
+
+static struct proto *udpv6_prot_saved __read_mostly; /* proto the IPv6 slot was last built from */
+static DEFINE_SPINLOCK(udpv6_prot_lock); /* taken around rebuilds of the IPv6 slot */
+static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; /* proto copies with sock_map unhash/close hooks */
+
+/* Make @prot a copy of @base, then redirect its close and unhash hooks
+ * to the sock_map handlers.
+ */
+static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+	*prot = *base;
+	prot->close = sock_map_close;
+	prot->unhash = sock_map_unhash;
+}
+
+/* For AF_INET6 sockets, make sure the IPv6 slot of udp_bpf_prots[] was
+ * derived from @ops.  The saved pointer is compared once without the
+ * lock and once more under udpv6_prot_lock before the slot is rebuilt
+ * and the new base pointer is published.  Other families are ignored.
+ */
+static void udp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+	if (sk->sk_family != AF_INET6)
+		return;
+
+	/* Common case: the slot already matches this base proto. */
+	if (likely(ops == smp_load_acquire(&udpv6_prot_saved)))
+		return;
+
+	spin_lock_bh(&udpv6_prot_lock);
+	if (likely(ops != udpv6_prot_saved)) {
+		udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops);
+		smp_store_release(&udpv6_prot_saved, ops);
+	}
+	spin_unlock_bh(&udpv6_prot_lock);
+}
+
+/* Core initcall: derive the IPv4 slot of udp_bpf_prots[] from udp_prot.
+ * Always returns 0.
+ */
+static int __init udp_bpf_v4_build_proto(void)
+{
+	struct proto *v4_slot = &udp_bpf_prots[UDP_BPF_IPV4];
+
+	udp_bpf_rebuild_protos(v4_slot, &udp_prot);
+
+	return 0;
+}
+core_initcall(udp_bpf_v4_build_proto);
+
+/* Pick the udp_bpf_prots[] entry matching the address family of @sk.
+ *
+ * When @psock has no saved proto yet, the IPv6 slot is first checked
+ * against the proto currently installed on @sk and rebuilt if needed.
+ * AF_INET sockets get the IPv4 slot; every other family gets the IPv6
+ * slot.
+ */
+struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock)
+{
+	int slot;
+
+	if (sk->sk_family == AF_INET)
+		slot = UDP_BPF_IPV4;
+	else
+		slot = UDP_BPF_IPV6;
+
+	if (!psock->sk_proto)
+		udp_bpf_check_v6_needs_rebuild(sk, READ_ONCE(sk->sk_prot));
+
+	return &udp_bpf_prots[slot];
+}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 910555a4d9fe..1dbece34496e 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -21,16 +21,15 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
if (!inet_diag_bc_sk(bc, sk))
return 0;
- return inet_sk_diag_fill(sk, NULL, skb, req,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
+ return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
+ net_admin);
}
-static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+static int udp_dump_one(struct udp_table *tbl,
+ struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
+ struct sk_buff *in_skb = cb->skb;
int err = -EINVAL;
struct sock *sk = NULL;
struct sk_buff *rep;
@@ -64,17 +63,15 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
goto out;
err = -ENOMEM;
- rep = nlmsg_new(sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) + 64,
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
GFP_KERNEL);
if (!rep)
goto out;
- err = inet_sk_diag_fill(sk, NULL, rep, req,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh,
- netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
kfree_skb(rep);
@@ -93,12 +90,16 @@ out_nosk:
static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
+ struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
+ struct nlattr *bc;
+ cb_data = cb->data;
+ bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -146,15 +147,15 @@ done:
}
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- udp_dump(&udp_table, skb, cb, r, bc);
+ udp_dump(&udp_table, skb, cb, r);
}
-static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static int udp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return udp_dump_one(&udp_table, in_skb, nlh, req);
+ return udp_dump_one(&udp_table, cb, req);
}
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -249,16 +250,15 @@ static const struct inet_diag_handler udp_diag_handler = {
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- udp_dump(&udplite_table, skb, cb, r, bc);
+ udp_dump(&udplite_table, skb, cb, r);
}
-static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static int udplite_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return udp_dump_one(&udplite_table, in_skb, nlh, req);
+ return udp_dump_one(&udplite_table, cb, req);
}
static const struct inet_diag_handler udplite_diag_handler = {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 1a98583a79f4..e67a66fbf27b 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -453,6 +453,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
unsigned int off = skb_gro_offset(skb);
int flush = 1;
+ NAPI_GRO_CB(skb)->is_flist = 0;
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ae1344e4cec5..2ccaee98fddb 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -303,4 +303,14 @@ config IPV6_SEG6_BPF
depends on IPV6_SEG6_LWTUNNEL
depends on IPV6 = y
+config IPV6_RPL_LWTUNNEL
+ bool "IPv6: RPL Source Routing Header support"
+ depends on IPV6
+ select LWTUNNEL
+ ---help---
+ Support for RFC6554 RPL Source Routing Header using the lightweight
+ tunnels mechanism.
+
+ If unsure, say N.
+
endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 8ccf35514015..cf7b47bdb9b3 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -10,7 +10,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
- udp_offload.o seg6.o fib6_notifier.o
+ udp_offload.o seg6.o fib6_notifier.o rpl.o
ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o
@@ -26,6 +26,7 @@ ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
ipv6-$(CONFIG_NETLABEL) += calipso.o
ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o
ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
+ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o
ipv6-objs += $(ipv6-y)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index cb493e15959c..a11fd4d67832 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -236,6 +236,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.enhanced_dad = 1,
.addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
.disable_policy = 0,
+ .rpl_seg_enabled = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -290,6 +291,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.enhanced_dad = 1,
.addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
.disable_policy = 0,
+ .rpl_seg_enabled = 0,
};
/* Check if link is ready: is it up and is a valid qdisc available */
@@ -1226,11 +1228,13 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
}
static void
-cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
+cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
+ bool del_rt, bool del_peer)
{
struct fib6_info *f6i;
- f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len,
+ f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr,
+ ifp->prefix_len,
ifp->idev->dev, 0, RTF_DEFAULT, true);
if (f6i) {
if (del_rt)
@@ -1293,7 +1297,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
if (action != CLEANUP_PREFIX_RT_NOP) {
cleanup_prefix_route(ifp, expires,
- action == CLEANUP_PREFIX_RT_DEL);
+ action == CLEANUP_PREFIX_RT_DEL, false);
}
/* clean up prefsrc entries */
@@ -3299,7 +3303,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
switch (idev->cnf.addr_gen_mode) {
case IN6_ADDR_GEN_MODE_RANDOM:
ipv6_gen_mode_random_init(idev);
- /* fallthrough */
+ fallthrough;
case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
if (!ipv6_generate_stable_address(&addr, 0, idev))
addrconf_add_linklocal(idev, &addr,
@@ -3345,6 +3349,10 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_NONE) &&
(dev->type != ARPHRD_RAWIP)) {
/* Alas, we support only Ethernet autoconfiguration. */
+ idev = __in6_dev_get(dev);
+ if (!IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP &&
+ dev->flags & IFF_MULTICAST)
+ ipv6_mc_up(idev);
return;
}
@@ -3517,9 +3525,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
break;
run_pending = 1;
-
- /* fall through */
-
+ fallthrough;
case NETDEV_UP:
case NETDEV_CHANGE:
if (dev->flags & IFF_SLAVE)
@@ -4394,6 +4400,59 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
}
#endif
+/* Loop check for an RPL source routing header (RFC6554).
+ *
+ * The RFC asks a router to look for addresses of its own that show up
+ * more than once in the SRH and are separated by at least one address
+ * that is not assigned to the router.
+ *
+ * Each of the @nsegs entries in @segs is looked up in inet6_addr_lst,
+ * restricted to addresses whose device belongs to @net.  The walk stops
+ * and reports a loop when a local segment is reached after more than one
+ * earlier local segment, with a non-local segment since the last local
+ * one.
+ *
+ * NOTE(review): the "more than one earlier local segment" threshold
+ * looks stricter than the RFC wording above -- confirm it is intended.
+ *
+ * Returns 1 when the check trips, 0 otherwise.
+ */
+int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
+			  unsigned char nsegs)
+{
+	bool gap_seen = false;
+	int local_hits = 0;
+	int looped = 0;
+	int idx;
+
+	rcu_read_lock();
+	for (idx = 0; idx < nsegs; idx++) {
+		const struct in6_addr *seg = &segs[idx];
+		unsigned int bucket = inet6_addr_hash(net, seg);
+		struct inet6_ifaddr *ifa;
+		bool is_local = false;
+
+		hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[bucket], addr_lst) {
+			if (net_eq(dev_net(ifa->idev->dev), net) &&
+			    ipv6_addr_equal(&ifa->addr, seg)) {
+				is_local = true;
+				break;
+			}
+		}
+
+		if (!is_local) {
+			gap_seen = true;
+			continue;
+		}
+
+		if (local_hits > 1 && gap_seen) {
+			looped = 1;
+			break;
+		}
+
+		gap_seen = false;
+		local_hits++;
+	}
+	rcu_read_unlock();
+
+	return looped;
+}
+
+
/*
* Periodic address status verification
*/
@@ -4586,12 +4645,14 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
}
static int modify_prefix_route(struct inet6_ifaddr *ifp,
- unsigned long expires, u32 flags)
+ unsigned long expires, u32 flags,
+ bool modify_peer)
{
struct fib6_info *f6i;
u32 prio;
- f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len,
+ f6i = addrconf_get_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
+ ifp->prefix_len,
ifp->idev->dev, 0, RTF_DEFAULT, true);
if (!f6i)
return -ENOENT;
@@ -4602,7 +4663,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp,
ip6_del_rt(dev_net(ifp->idev->dev), f6i);
/* add new one */
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
+ ifp->prefix_len,
ifp->rt_priority, ifp->idev->dev,
expires, flags, GFP_KERNEL);
} else {
@@ -4624,6 +4686,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
unsigned long timeout;
bool was_managetempaddr;
bool had_prefixroute;
+ bool new_peer = false;
ASSERT_RTNL();
@@ -4655,6 +4718,13 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
cfg->preferred_lft = timeout;
}
+ if (cfg->peer_pfx &&
+ memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) {
+ if (!ipv6_addr_any(&ifp->peer_addr))
+ cleanup_prefix_route(ifp, expires, true, true);
+ new_peer = true;
+ }
+
spin_lock_bh(&ifp->lock);
was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR;
had_prefixroute = ifp->flags & IFA_F_PERMANENT &&
@@ -4670,6 +4740,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
ifp->rt_priority = cfg->rt_priority;
+ if (new_peer)
+ ifp->peer_addr = *cfg->peer_pfx;
+
spin_unlock_bh(&ifp->lock);
if (!(ifp->flags&IFA_F_TENTATIVE))
ipv6_ifa_notify(0, ifp);
@@ -4678,7 +4751,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
int rc = -ENOENT;
if (had_prefixroute)
- rc = modify_prefix_route(ifp, expires, flags);
+ rc = modify_prefix_route(ifp, expires, flags, false);
/* prefix route could have been deleted; if so restore it */
if (rc == -ENOENT) {
@@ -4686,6 +4759,15 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
ifp->rt_priority, ifp->idev->dev,
expires, flags, GFP_KERNEL);
}
+
+ if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr))
+ rc = modify_prefix_route(ifp, expires, flags, true);
+
+ if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) {
+ addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ }
} else if (had_prefixroute) {
enum cleanup_prefix_rt_t action;
unsigned long rt_expires;
@@ -4696,7 +4778,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
if (action != CLEANUP_PREFIX_RT_NOP) {
cleanup_prefix_route(ifp, rt_expires,
- action == CLEANUP_PREFIX_RT_DEL);
+ action == CLEANUP_PREFIX_RT_DEL, false);
}
}
@@ -5440,6 +5522,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy;
array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass;
+ array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled;
}
static inline size_t inet6_ifla6_size(void)
@@ -5983,9 +6066,9 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
if (!ipv6_addr_any(&ifp->peer_addr))
- addrconf_prefix_route(&ifp->peer_addr, 128, 0,
- ifp->idev->dev, 0, 0,
- GFP_ATOMIC);
+ addrconf_prefix_route(&ifp->peer_addr, 128,
+ ifp->rt_priority, ifp->idev->dev,
+ 0, 0, GFP_ATOMIC);
break;
case RTM_DELADDR:
if (ifp->idev->cnf.forwarding)
@@ -6821,6 +6904,13 @@ static const struct ctl_table addrconf_sysctl[] = {
.extra2 = (void *)&two_five_five,
},
{
+ .procname = "rpl_seg_enabled",
+ .data = &ipv6_devconf.rpl_seg_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
/* sentinel */
}
};
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d727c3b41495..345baa0a754f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -59,6 +59,7 @@
#endif
#include <net/calipso.h>
#include <net/seg6.h>
+#include <net/rpl.h>
#include <linux/uaccess.h>
#include <linux/mroute6.h>
@@ -1114,6 +1115,10 @@ static int __init inet6_init(void)
if (err)
goto seg6_fail;
+ err = rpl_init();
+ if (err)
+ goto rpl_fail;
+
err = igmp6_late_init();
if (err)
goto igmp6_late_err;
@@ -1136,6 +1141,8 @@ sysctl_fail:
igmp6_late_cleanup();
#endif
igmp6_late_err:
+ rpl_exit();
+rpl_fail:
seg6_exit();
seg6_fail:
calipso_exit();
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 95835e8d99aa..45e2adc56610 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -36,7 +36,7 @@ struct tmp_ext {
struct in6_addr saddr;
#endif
struct in6_addr daddr;
- char hdrs[0];
+ char hdrs[];
};
struct ah_skb_cb {
@@ -259,7 +259,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
case NEXTHDR_DEST:
if (dir == XFRM_POLICY_OUT)
ipv6_rearrange_destopt(iph, exthdr.opth);
- /* fall through */
+ fallthrough;
case NEXTHDR_HOP:
if (!zero_out_mutable_opts(exthdr.opth)) {
net_dbg_ratelimited("overrun %sopts\n",
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index a3b403ba8f8f..11143d039f16 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -207,22 +207,6 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
-static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
-{
- /* Fill padding... */
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = proto;
-}
-
int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
u8 *tail;
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index fd535053245b..8eab2c869d61 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -159,6 +159,40 @@ static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x,
return segs;
}
+static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ int proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ }
+
+ if (x->sel.family != AF_INET6) {
+ skb->transport_header -=
+ (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
+
+ if (proto == IPPROTO_TCP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+ }
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
@@ -168,6 +202,8 @@ static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x,
return xfrm6_tunnel_gso_segment(x, skb, features);
case XFRM_MODE_TRANSPORT:
return xfrm6_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm6_beet_gso_segment(x, skb, features);
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index ab5add0fe6b4..5a8bbcdcaf2b 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -48,6 +48,7 @@
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
+#include <net/rpl.h>
#include <linux/uaccess.h>
@@ -97,7 +98,7 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff,
*/
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr))
break;
- /* fall through */
+ fallthrough;
case 2: /* send ICMP PARM PROB regardless and drop packet */
icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
return false;
@@ -468,6 +469,195 @@ looped_back:
return -1;
}
+static int ipv6_rpl_srh_rcv(struct sk_buff *skb)
+{
+ struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ struct inet6_dev *idev;
+ struct ipv6hdr *oldhdr;
+ struct in6_addr addr;
+ unsigned char *buf;
+ int accept_rpl_seg;
+ int i, err;
+ u64 n = 0;
+ u32 r;
+
+ idev = __in6_dev_get(skb->dev);
+
+ accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled;
+ if (accept_rpl_seg > idev->cnf.rpl_seg_enabled)
+ accept_rpl_seg = idev->cnf.rpl_seg_enabled;
+
+ if (!accept_rpl_seg) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+looped_back:
+ hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);
+
+ if (hdr->segments_left == 0) {
+ if (hdr->nexthdr == NEXTHDR_IPV6) {
+ int offset = (hdr->hdrlen + 1) << 3;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+
+ if (!pskb_pull(skb, offset)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ skb_postpull_rcsum(skb, skb_transport_header(skb),
+ offset);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ __skb_tunnel_rx(skb, skb->dev, net);
+
+ netif_rx(skb);
+ return -1;
+ }
+
+ opt->srcrt = skb_network_header_len(skb);
+ opt->lastopt = opt->srcrt;
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+ return 1;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(*hdr))) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre);
+ r = do_div(n, (16 - hdr->cmpri));
+ /* checks if calculation was without remainder and n fits into
+ * unsigned char which is segments_left field. Should not be
+ * higher than that.
+ */
+ if (r || (n + 1) > 255) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ if (hdr->segments_left > n + 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ if (skb_cloned(skb)) {
+ if (pskb_expand_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE, 0,
+ GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -1;
+ }
+ } else {
+ err = skb_cow_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ }
+
+ hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);
+
+ if (!pskb_may_pull(skb, ipv6_rpl_srh_size(n, hdr->cmpri,
+ hdr->cmpre))) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ hdr->segments_left--;
+ i = n - hdr->segments_left;
+
+ buf = kzalloc(ipv6_rpl_srh_alloc_size(n + 1) * 2, GFP_ATOMIC);
+ if (unlikely(!buf)) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ ohdr = (struct ipv6_rpl_sr_hdr *)buf;
+ ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n);
+ chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3));
+
+ if ((ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) ||
+ (ipv6_addr_type(&ohdr->rpl_segaddr[i]) & IPV6_ADDR_MULTICAST)) {
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1);
+ if (err) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0);
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ addr = ipv6_hdr(skb)->daddr;
+ ipv6_hdr(skb)->daddr = ohdr->rpl_segaddr[i];
+ ohdr->rpl_segaddr[i] = addr;
+
+ ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n);
+
+ oldhdr = ipv6_hdr(skb);
+
+ skb_pull(skb, ((hdr->hdrlen + 1) << 3));
+ skb_postpull_rcsum(skb, oldhdr,
+ sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3));
+ skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr));
+ memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3);
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_postpush_rcsum(skb, ipv6_hdr(skb),
+ sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3));
+
+ kfree(buf);
+
+ skb_dst_drop(skb);
+
+ ip6_route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ goto looped_back;
+ }
+
+ dst_input(skb);
+
+ return -1;
+}
+
/********************************
Routing header.
********************************/
@@ -506,9 +696,16 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
return -1;
}
- /* segment routing */
- if (hdr->type == IPV6_SRCRT_TYPE_4)
+ switch (hdr->type) {
+ case IPV6_SRCRT_TYPE_4:
+ /* segment routing */
return ipv6_srh_rcv(skb);
+ case IPV6_SRCRT_TYPE_3:
+ /* rpl segment routing */
+ return ipv6_rpl_srh_rcv(skb);
+ default:
+ break;
+ }
looped_back:
if (hdr->segments_left == 0) {
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index ef408a5090a2..2688f3e82165 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -898,7 +898,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
hdr = icmp6_hdr(skb);
/* to notify */
- /* fall through */
+ fallthrough;
case ICMPV6_DEST_UNREACH:
case ICMPV6_TIME_EXCEED:
case ICMPV6_PARAMPROB:
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 422dcc691f71..8c1ce78956ba 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -125,7 +125,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
};
-static int ila_build_state(struct nlattr *nla,
+static int ila_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 58fbde244381..46ed56719476 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1102,8 +1102,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
found++;
break;
}
- if (rt_can_ecmp)
- fallback_ins = fallback_ins ?: ins;
+ fallback_ins = fallback_ins ?: ins;
goto next_iter;
}
@@ -1146,7 +1145,9 @@ next_iter:
}
if (fallback_ins && !found) {
- /* No ECMP-able route found, replace first non-ECMP one */
+ /* No matching route with same ecmp-able-ness found, replace
+ * first matching route
+ */
ins = fallback_ins;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -2067,8 +2068,8 @@ static int fib6_walk_continue(struct fib6_walker *w)
continue;
}
w->state = FWS_L;
+ fallthrough;
#endif
- /* fall through */
case FWS_L:
left = rcu_dereference_protected(fn->left, 1);
if (left) {
@@ -2077,7 +2078,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
continue;
}
w->state = FWS_R;
- /* fall through */
+ fallthrough;
case FWS_R:
right = rcu_dereference_protected(fn->right, 1);
if (right) {
@@ -2087,7 +2088,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
}
w->state = FWS_C;
w->leaf = rcu_dereference_protected(fn->leaf, 1);
- /* fall through */
+ fallthrough;
case FWS_C:
if (w->leaf && fn->fn_flags & RTN_RTINFO) {
int err;
@@ -2106,7 +2107,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
}
skip:
w->state = FWS_U;
- /* fall through */
+ fallthrough;
case FWS_U:
if (fn == w->root)
return 0;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 55bfc5149d0c..781ca8c07a0d 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -437,8 +437,6 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return -ENOENT;
switch (type) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- __u32 teli;
case ICMPV6_DEST_UNREACH:
net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
t->parms.name);
@@ -452,7 +450,10 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
break;
}
return 0;
- case ICMPV6_PARAMPROB:
+ case ICMPV6_PARAMPROB: {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 teli;
+
teli = 0;
if (code == ICMPV6_HDR_FIELD)
teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
@@ -468,6 +469,7 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
t->parms.name);
}
return 0;
+ }
case ICMPV6_PKT_TOOBIG:
ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
return 0;
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index 02045494c24c..e0086758b6ee 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -45,4 +45,38 @@ out:
rcu_read_unlock();
}
EXPORT_SYMBOL(icmpv6_send);
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
+{
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ struct in6_addr orig_ip;
+ struct nf_conn *ct;
+
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(ct->status & IPS_SRC_NAT)) {
+ icmpv6_send(skb_in, type, code, info);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct ipv6hdr))))
+ goto out;
+
+ orig_ip = ipv6_hdr(skb_in)->saddr;
+ ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6;
+ icmpv6_send(skb_in, type, code, info);
+ ipv6_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmpv6_ndo_send);
+#endif
#endif
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 7b089d0ac8cd..e96304d8a4a7 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -285,7 +285,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
rcu_read_unlock();
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
err:
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 087304427bbb..8a8c2d0cfcc8 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -54,6 +54,7 @@
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
@@ -1196,6 +1197,75 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
+/**
+ * ip6_dst_lookup_tunnel - perform route lookup on tunnel
+ * @skb: Packet for which lookup is done
+ * @dev: Tunnel device
+ * @net: Network namespace of tunnel device
+ * @sock: Socket which provides route info
+ * @saddr: Memory to store the src ip address
+ * @info: Tunnel information
+ * @protocol: IP protocol
+ * @use_cache: Flag to enable cache usage
+ * This function performs a route lookup on a tunnel
+ *
+ * It returns a valid dst pointer and stores src address to be used in
+ * tunnel in param saddr on success, else a pointer encoded error code.
+ */
+
+struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net,
+ struct socket *sock,
+ struct in6_addr *saddr,
+ const struct ip_tunnel_info *info,
+ u8 protocol,
+ bool use_cache)
+{
+ struct dst_entry *dst = NULL;
+#ifdef CONFIG_DST_CACHE
+ struct dst_cache *dst_cache;
+#endif
+ struct flowi6 fl6;
+ __u8 prio;
+
+#ifdef CONFIG_DST_CACHE
+ dst_cache = (struct dst_cache *)&info->dst_cache;
+ if (use_cache) {
+ dst = dst_cache_get_ip6(dst_cache, saddr);
+ if (dst)
+ return dst;
+ }
+#endif
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = protocol;
+ fl6.daddr = info->key.u.ipv6.dst;
+ fl6.saddr = info->key.u.ipv6.src;
+ prio = info->key.tos;
+ fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
+ info->key.label);
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
+ NULL);
+ if (IS_ERR(dst)) {
+ netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (dst->dev == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
+ dst_release(dst);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (use_cache)
+ dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
+#endif
+ *saddr = fl6.saddr;
+ return dst;
+}
+EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
+
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
gfp_t gfp)
{
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index b5dd20c4599b..4703b09808d0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -121,6 +121,7 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
/**
* ip6_tnl_lookup - fetch tunnel matching the end-point addresses
+ * @link: ifindex of underlying interface
* @remote: the address of the tunnel exit-point
* @local: the address of the tunnel entry-point
*
@@ -134,37 +135,56 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
static struct ip6_tnl *
-ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
+ip6_tnl_lookup(struct net *net, int link,
+ const struct in6_addr *remote, const struct in6_addr *local)
{
unsigned int hash = HASH(remote, local);
- struct ip6_tnl *t;
+ struct ip6_tnl *t, *cand = NULL;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct in6_addr any;
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_equal(remote, &t->parms.raddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else
+ cand = t;
}
memset(&any, 0, sizeof(any));
hash = HASH(&any, local);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_any(&t->parms.raddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_any(&t->parms.raddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else if (!cand)
+ cand = t;
}
hash = HASH(remote, &any);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(remote, &t->parms.raddr) &&
- ipv6_addr_any(&t->parms.laddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(remote, &t->parms.raddr) ||
+ !ipv6_addr_any(&t->parms.laddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else if (!cand)
+ cand = t;
}
+ if (cand)
+ return cand;
+
t = rcu_dereference(ip6n->collect_md_tun);
if (t && t->dev->flags & IFF_UP)
return t;
@@ -351,7 +371,8 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net,
(t = rtnl_dereference(*tp)) != NULL;
tp = &t->next) {
if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr)) {
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ p->link == t->parms.link) {
if (create)
return ERR_PTR(-EEXIST);
@@ -485,7 +506,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
processing of the error. */
rcu_read_lock();
- t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, &ipv6h->saddr);
+ t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr);
if (!t)
goto out;
@@ -496,8 +517,6 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
err = 0;
switch (*type) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- __u32 mtu, teli;
case ICMPV6_DEST_UNREACH:
net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
t->parms.name);
@@ -510,7 +529,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
rel_msg = 1;
}
break;
- case ICMPV6_PARAMPROB:
+ case ICMPV6_PARAMPROB: {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 teli;
+
teli = 0;
if ((*code) == ICMPV6_HDR_FIELD)
teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
@@ -527,7 +549,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
t->parms.name);
}
break;
- case ICMPV6_PKT_TOOBIG:
+ }
+ case ICMPV6_PKT_TOOBIG: {
+ __u32 mtu;
+
ip6_update_pmtu(skb, net, htonl(*info), 0, 0,
sock_net_uid(net, NULL));
mtu = *info - offset;
@@ -541,6 +566,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
rel_msg = 1;
}
break;
+ }
case NDISC_REDIRECT:
ip6_redirect(skb, net, skb->dev->ifindex, 0,
sock_net_uid(net, NULL));
@@ -887,7 +913,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
int ret = -1;
rcu_read_lock();
- t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
+ t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr);
if (t) {
u8 tproto = READ_ONCE(t->parms.proto);
@@ -1420,8 +1446,10 @@ tx_err:
static void ip6_tnl_link_config(struct ip6_tnl *t)
{
struct net_device *dev = t->dev;
+ struct net_device *tdev = NULL;
struct __ip6_tnl_parm *p = &t->parms;
struct flowi6 *fl6 = &t->fl.u.ip6;
+ unsigned int mtu;
int t_hlen;
memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
@@ -1457,22 +1485,25 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
struct rt6_info *rt = rt6_lookup(t->net,
&p->raddr, &p->laddr,
p->link, NULL, strict);
+ if (rt) {
+ tdev = rt->dst.dev;
+ ip6_rt_put(rt);
+ }
- if (!rt)
- return;
+ if (!tdev && p->link)
+ tdev = __dev_get_by_index(t->net, p->link);
- if (rt->dst.dev) {
- dev->hard_header_len = rt->dst.dev->hard_header_len +
- t_hlen;
+ if (tdev) {
+ dev->hard_header_len = tdev->hard_header_len + t_hlen;
+ mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU);
- dev->mtu = rt->dst.dev->mtu - t_hlen;
+ dev->mtu = mtu - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
if (dev->mtu < IPV6_MIN_MTU)
dev->mtu = IPV6_MIN_MTU;
}
- ip6_rt_put(rt);
}
}
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 524006aa0d78..cc6180e08a4f 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -311,7 +311,7 @@ static int vti6_rcv(struct sk_buff *skb)
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
rcu_read_unlock();
- return 0;
+ goto discard;
}
ipv6h = ipv6_hdr(skb);
@@ -450,15 +450,33 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
int mtu;
if (!dst) {
- fl->u.ip6.flowi6_oif = dev->ifindex;
- fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
- dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
- if (dst->error) {
- dst_release(dst);
- dst = NULL;
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt))
+ goto tx_err_link_failure;
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+ skb_dst_set(skb, dst);
+ break;
+ default:
goto tx_err_link_failure;
}
- skb_dst_set(skb, dst);
}
dst_hold(dst);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index bfa49ff70531..65a54d74acc1 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -97,7 +97,8 @@ static void ipmr_expire_process(struct timer_list *t);
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
#define ip6mr_for_each_table(mrt, net) \
- list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)
+ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
+ lockdep_rtnl_is_held())
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
struct mr_table *mrt)
@@ -1690,7 +1691,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
parent = -1;
- /* fall through */
+ fallthrough;
case MRT6_ADD_MFC_PROXY:
case MRT6_DEL_MFC_PROXY:
if (optlen < sizeof(mfc))
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 79fc012dd2ca..debdaeba5d8c 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -183,9 +183,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
retv = -EBUSY;
break;
}
- } else if (sk->sk_protocol != IPPROTO_TCP)
+ } else if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_prot != &tcpv6_prot) {
+ retv = -EBUSY;
+ break;
+ }
break;
-
+ } else {
+ break;
+ }
if (sk->sk_state != TCP_ESTABLISHED) {
retv = -ENOTCONN;
break;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 53caf59c591e..6ffa153e5166 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -197,6 +197,7 @@ static inline int ndisc_is_useropt(const struct net_device *dev,
return opt->nd_opt_type == ND_OPT_RDNSS ||
opt->nd_opt_type == ND_OPT_DNSSL ||
opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
+ opt->nd_opt_type == ND_OPT_PREF64 ||
ndisc_ops_is_useropt(dev, opt->nd_opt_type);
}
@@ -1782,7 +1783,7 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
case NETDEV_CHANGEADDR:
neigh_changeaddr(&nd_tbl, dev);
fib6_run_gc(0, net, false);
- /* fallthrough */
+ fallthrough;
case NETDEV_UP:
idev = in6_dev_get(dev);
if (!idev)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c973ace208c5..e27393498ecb 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1227,7 +1227,7 @@ struct compat_ip6t_replace {
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct xt_counters * */
- struct compat_ip6t_entry entries[0];
+ struct compat_ip6t_entry entries[];
};
static int
@@ -1571,7 +1571,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
struct compat_ip6t_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ip6t_entry entrytable[0];
+ struct compat_ip6t_entry entrytable[];
};
static int
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 22b80db6d882..da64550a5707 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -248,7 +248,7 @@ static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
/* Max length: 17 "POINTER=ffffffff " */
nf_log_buf_add(m, "POINTER=%08x ",
ntohl(ic->icmp6_pointer));
- /* Fall through */
+ fallthrough;
case ICMPV6_DEST_UNREACH:
case ICMPV6_PKT_TOOBIG:
case ICMPV6_TIME_EXCEED:
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index dfe5e603ffe1..0028aa1d7869 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1076,7 +1076,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
if (optname == IPV6_CHECKSUM ||
optname == IPV6_HDRINCL)
break;
- /* fall through */
+ fallthrough;
default:
return ipv6_setsockopt(sk, level, optname, optval, optlen);
}
@@ -1099,7 +1099,7 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
if (optname == IPV6_CHECKSUM ||
optname == IPV6_HDRINCL)
break;
- /* fall through */
+ fallthrough;
default:
return compat_ipv6_setsockopt(sk, level, optname,
optval, optlen);
@@ -1161,7 +1161,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
if (optname == IPV6_CHECKSUM ||
optname == IPV6_HDRINCL)
break;
- /* fall through */
+ fallthrough;
default:
return ipv6_getsockopt(sk, level, optname, optval, optlen);
}
@@ -1184,7 +1184,7 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
if (optname == IPV6_CHECKSUM ||
optname == IPV6_HDRINCL)
break;
- /* fall through */
+ fallthrough;
default:
return compat_ipv6_getsockopt(sk, level, optname,
optval, optlen);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4fbdc60b4e07..310cbddaa533 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1062,8 +1062,6 @@ static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
flags |= DST_NOCOUNT;
if (rt->dst_nopolicy)
flags |= DST_NOPOLICY;
- if (rt->dst_host)
- flags |= DST_HOST;
return flags;
}
@@ -1349,7 +1347,6 @@ static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
ip6_rt_copy_init(rt, res);
rt->rt6i_flags |= RTF_CACHE;
- rt->dst.flags |= DST_HOST;
rt->rt6i_dst.addr = *daddr;
rt->rt6i_dst.plen = 128;
@@ -3142,7 +3139,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
goto out;
}
- rt->dst.flags |= DST_HOST;
rt->dst.input = ip6_input;
rt->dst.output = ip6_output;
rt->rt6i_gateway = fl6->daddr;
@@ -3475,7 +3471,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
!netif_carrier_ok(dev))
fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
- err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
+ err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
cfg->fc_encap_type, cfg, gfp_flags, extack);
if (err)
goto out;
@@ -3645,8 +3641,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
rt->fib6_dst.plen = cfg->fc_dst_len;
- if (rt->fib6_dst.plen == 128)
- rt->dst_host = true;
#ifdef CONFIG_IPV6_SUBTREES
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
@@ -4370,7 +4364,7 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
break;
}
- /* FALLTHROUGH */
+ fallthrough;
case IPSTATS_MIB_OUTNOROUTES:
IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
break;
@@ -5198,6 +5192,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
*/
cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
NLM_F_REPLACE);
+ cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
nhn++;
}
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
new file mode 100644
index 000000000000..dc4f20e23bf7
--- /dev/null
+++ b/net/ipv6/rpl.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+#define IPV6_PFXTAIL_LEN(x) (sizeof(struct in6_addr) - (x)) /* bytes of an IPv6 address that remain after an x-byte prefix */
+
+/* Rebuild a full address in @dst: the leading @pfx bytes are taken from
+ * @daddr, the remaining bytes from the compressed tail at @post.
+ */
+static void ipv6_rpl_addr_decompress(struct in6_addr *dst,
+				     const struct in6_addr *daddr,
+				     const void *post, unsigned char pfx)
+{
+	const size_t tail_len = IPV6_PFXTAIL_LEN(pfx);
+
+	memcpy(dst, daddr, pfx);
+	memcpy(&dst->s6_addr[pfx], post, tail_len);
+}
+
+/* Store the part of @addr that follows its first @pfx bytes into @dst. */
+static void ipv6_rpl_addr_compress(void *dst, const struct in6_addr *addr,
+				   unsigned char pfx)
+{
+	const size_t tail_len = IPV6_PFXTAIL_LEN(pfx);
+
+	memcpy(dst, &addr->s6_addr[pfx], tail_len);
+}
+
+/* Return the location of compressed segment @i inside @hdr; each slot is
+ * as wide as the tail left over by the header's cmpri prefix length.
+ */
+static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i)
+{
+	const size_t offset = i * IPV6_PFXTAIL_LEN(hdr->cmpri);
+
+	return (void *)&hdr->rpl_segdata[offset];
+}
+
+/* Number of segment-data bytes used by @n segments compressed with @cmpri
+ * followed by one segment compressed with @cmpre.
+ */
+size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri,
+			 unsigned char cmpre)
+{
+	const size_t inner_bytes = n * IPV6_PFXTAIL_LEN(cmpri);
+	const size_t last_bytes = IPV6_PFXTAIL_LEN(cmpre);
+
+	return inner_bytes + last_bytes;
+}
+
+/* Expand a compressed RPL source routing header into full addresses.
+ *
+ * @outhdr: destination header; receives n + 1 uncompressed addresses
+ * @inhdr: compressed header; its cmpri/cmpre fields give the elided lengths
+ * @daddr: address that supplies the elided prefix bytes
+ * @n: index of the last segment (the header carries n + 1 addresses)
+ *
+ * Segments 0..n-1 are expanded with cmpri, the last segment with cmpre.
+ * @outhdr is written with cmpri == cmpre == 0 and no padding.
+ */
+void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
+			     const struct ipv6_rpl_sr_hdr *inhdr,
+			     const struct in6_addr *daddr, unsigned char n)
+{
+	int i;
+
+	outhdr->nexthdr = inhdr->nexthdr;
+	outhdr->hdrlen = (((n + 1) * sizeof(struct in6_addr)) >> 3);
+	outhdr->pad = 0;
+	outhdr->type = inhdr->type;
+	outhdr->segments_left = inhdr->segments_left;
+	outhdr->cmpri = 0;
+	outhdr->cmpre = 0;
+
+	/* Stop before the last segment: it is stored with the cmpre tail,
+	 * so expanding it with cmpri may read past the compressed data.
+	 */
+	for (i = 0; i < n; i++)
+		ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr,
+					 ipv6_rpl_segdata_pos(inhdr, i),
+					 inhdr->cmpri);
+
+	ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[n], daddr,
+				 ipv6_rpl_segdata_pos(inhdr, n),
+				 inhdr->cmpre);
+}
+
+/* Length, in bytes, of the prefix that @daddr shares with every one of the
+ * n + 1 segment addresses in @inhdr.
+ *
+ * The result is capped at 15: CmprI is a 4-bit header field (RFC 6554), so
+ * a value of 16 for fully identical addresses could not be encoded.
+ */
+static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr,
+					     const struct in6_addr *daddr,
+					     unsigned char n)
+{
+	const unsigned char max_plen = sizeof(*daddr) - 1;
+	unsigned char plen;
+	int i;
+
+	for (plen = 0; plen < max_plen; plen++) {
+		for (i = 0; i <= n; i++) {
+			if (daddr->s6_addr[plen] !=
+			    inhdr->rpl_segaddr[i].s6_addr[plen])
+				return plen;
+		}
+	}
+
+	return plen;
+}
+
+/* Length, in bytes, of the prefix shared by @daddr and @last_segment.
+ *
+ * The result is capped at 15: CmprE is a 4-bit header field (RFC 6554), so
+ * a value of 16 for identical addresses could not be encoded.
+ */
+static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr,
+					     const struct in6_addr *last_segment)
+{
+	const unsigned int max_plen = sizeof(*daddr) - 1;
+	unsigned int plen;
+
+	for (plen = 0; plen < max_plen; plen++) {
+		if (daddr->s6_addr[plen] != last_segment->s6_addr[plen])
+			break;
+	}
+
+	return plen;
+}
+
+/* Compress an RPL source routing header relative to @daddr.
+ *
+ * @outhdr: destination header; receives the compressed segments
+ * @inhdr: uncompressed header holding n + 1 full addresses
+ * @daddr: address whose shared prefix is elided from every segment
+ * @n: index of the last segment (the header carries n + 1 addresses)
+ *
+ * Segments 0..n-1 are stored with cmpri prefix bytes elided, the last
+ * segment with cmpre bytes elided. hdrlen is expressed in 8-byte units and
+ * pad holds the number of bytes needed to reach that boundary.
+ */
+void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr,
+			   const struct ipv6_rpl_sr_hdr *inhdr,
+			   const struct in6_addr *daddr, unsigned char n)
+{
+	unsigned char cmpri, cmpre;
+	size_t seglen;
+	int i;
+
+	cmpri = ipv6_rpl_srh_calc_cmpri(inhdr, daddr, n);
+	cmpre = ipv6_rpl_srh_calc_cmpre(daddr, &inhdr->rpl_segaddr[n]);
+
+	outhdr->nexthdr = inhdr->nexthdr;
+	seglen = (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre);
+	outhdr->hdrlen = seglen >> 3;
+	if (seglen & 0x7) {
+		/* round up to the next 8-byte unit and record the padding */
+		outhdr->hdrlen++;
+		outhdr->pad = 8 - (seglen & 0x7);
+	} else {
+		outhdr->pad = 0;
+	}
+	outhdr->type = inhdr->type;
+	outhdr->segments_left = inhdr->segments_left;
+	outhdr->cmpri = cmpri;
+	outhdr->cmpre = cmpre;
+
+	/* Stop before the last segment: seglen only reserves a cmpre-sized
+	 * tail for it, so storing it with cmpri may write past that space.
+	 */
+	for (i = 0; i < n; i++)
+		ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i),
+				       &inhdr->rpl_segaddr[i], cmpri);
+
+	ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, n),
+			       &inhdr->rpl_segaddr[n], cmpre);
+}
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
new file mode 100644
index 000000000000..a49ddc6cd020
--- /dev/null
+++ b/net/ipv6/rpl_iptunnel.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <linux/bottom_half.h>
+#include <linux/rpl_iptunnel.h>
+
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+#include <net/lwtunnel.h>
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+struct rpl_iptunnel_encap { /* tunnel payload: the SRH bytes copied in by rpl_build_state() */
+ struct ipv6_rpl_sr_hdr srh[0]; /* variable-length SRH stored inline */
+};
+
+struct rpl_lwt { /* private state kept in lwtunnel_state->data */
+ struct dst_cache cache; /* route cache consulted by rpl_output()/rpl_input() */
+ struct rpl_iptunnel_encap tuninfo; /* keep last: the SRH bytes extend past the struct */
+};
+
+/* View the private data area of @lwt as the RPL tunnel state. */
+static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+	void *priv = lwt->data;
+
+	return priv;
+}
+
+/* Return the encapsulation info (the stored SRH) that belongs to @lwt. */
+static inline struct rpl_iptunnel_encap *
+rpl_encap_lwtunnel(struct lwtunnel_state *lwt)
+{
+	struct rpl_lwt *rlwt = rpl_lwt_lwtunnel(lwt);
+
+	return &rlwt->tuninfo;
+}
+
+static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = {
+ [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY }, /* raw SRH bytes; length and content are checked in rpl_build_state() */
+};
+
+/* Check that a user supplied, uncompressed SRH is self-consistent.
+ *
+ * @seglen is the number of bytes that follow the fixed header. The header
+ * is accepted only if hdrlen and segments_left both describe exactly that
+ * many bytes, at least one segment is present, no compression is in use,
+ * ipv6_chk_rpl_srh_loop() reports no error and the final segment is not a
+ * multicast address.
+ */
+static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh,
+			     size_t seglen)
+{
+	const size_t nsegs = srh->segments_left;
+	const struct in6_addr *last;
+
+	if ((size_t)(srh->hdrlen << 3) != seglen)
+		return false;
+
+	/* need at least one segment, and the count must match the length */
+	if (nsegs == 0)
+		return false;
+	if (nsegs * sizeof(struct in6_addr) != seglen)
+		return false;
+
+	if (srh->cmpri != 0 || srh->cmpre != 0)
+		return false;
+
+	if (ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr, srh->segments_left))
+		return false;
+
+	last = &srh->rpl_segaddr[srh->segments_left - 1];
+
+	return !(ipv6_addr_type(last) & IPV6_ADDR_MULTICAST);
+}
+
+/* lwtunnel build_state callback: parse the nested RPL attributes in @nla,
+ * validate the SRH they carry and store a copy in a newly allocated
+ * lwtunnel state returned through @ts.
+ *
+ * Returns 0 on success, -EINVAL for a non-IPv6 family or a missing,
+ * truncated or inconsistent SRH, -ENOMEM if the state cannot be allocated,
+ * or the error reported by nla_parse_nested()/dst_cache_init().
+ */
+static int rpl_build_state(struct net *net, struct nlattr *nla,
+			   unsigned int family, const void *cfg,
+			   struct lwtunnel_state **ts,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RPL_IPTUNNEL_MAX + 1];
+	struct lwtunnel_state *state;
+	struct ipv6_rpl_sr_hdr *srh;
+	struct rpl_lwt *rlwt;
+	int srh_len;
+	int ret;
+
+	if (family != AF_INET6)
+		return -EINVAL;
+
+	ret = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla,
+			       rpl_iptunnel_policy, extack);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[RPL_IPTUNNEL_SRH])
+		return -EINVAL;
+
+	srh = nla_data(tb[RPL_IPTUNNEL_SRH]);
+	srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]);
+
+	/* the attribute must hold the fixed header plus a consistent
+	 * segment list
+	 */
+	if (srh_len < sizeof(*srh) ||
+	    !rpl_validate_srh(net, srh, srh_len - sizeof(*srh)))
+		return -EINVAL;
+
+	state = lwtunnel_state_alloc(srh_len + sizeof(*rlwt));
+	if (!state)
+		return -ENOMEM;
+
+	rlwt = rpl_lwt_lwtunnel(state);
+
+	ret = dst_cache_init(&rlwt->cache, GFP_ATOMIC);
+	if (ret) {
+		kfree(state);
+		return ret;
+	}
+
+	memcpy(&rlwt->tuninfo.srh, srh, srh_len);
+
+	state->type = LWTUNNEL_ENCAP_RPL;
+	state->flags |= LWTUNNEL_STATE_INPUT_REDIRECT |
+			LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+	*ts = state;
+
+	return 0;
+}
+
+/* lwtunnel destroy_state callback: release the per-state dst cache. */
+static void rpl_destroy_state(struct lwtunnel_state *lwt)
+{
+	struct rpl_lwt *rlwt = rpl_lwt_lwtunnel(lwt);
+
+	dst_cache_destroy(&rlwt->cache);
+}
+
+/* Insert a compressed RPL source routing header after the IPv6 header.
+ *
+ * @skb: packet whose network header is the IPv6 header to extend
+ * @rlwt: tunnel state (not used by this function)
+ * @srh: uncompressed SRH configured for the route
+ *
+ * The first configured segment becomes the new IPv6 destination. The
+ * remaining segments followed by the original destination are compressed
+ * and pushed into the packet as a routing header.
+ *
+ * Returns 0 on success or a negative errno. The skb is not freed on error.
+ */
+static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt,
+			     const struct ipv6_rpl_sr_hdr *srh)
+{
+	struct ipv6_rpl_sr_hdr *isrh, *csrh;
+	struct ipv6hdr oldhdr;
+	struct ipv6hdr *hdr;
+	unsigned char *buf;
+	size_t hdrlen;
+	int err;
+
+	/* Take a private copy of the header: skb_cow_head() below may
+	 * reallocate the skb head, which would leave a pointer into the old
+	 * head dangling.
+	 */
+	memcpy(&oldhdr, ipv6_hdr(skb), sizeof(oldhdr));
+
+	/* scratch space for the intermediate and the compressed header */
+	buf = kzalloc(ipv6_rpl_srh_alloc_size(srh->segments_left - 1) * 2,
+		      GFP_ATOMIC);
+	if (!buf)
+		return -ENOMEM;
+
+	isrh = (struct ipv6_rpl_sr_hdr *)buf;
+	csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3));
+
+	/* shift the segment list by one and append the old destination */
+	memcpy(isrh, srh, sizeof(*isrh));
+	memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1],
+	       (srh->segments_left - 1) * 16);
+	isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr.daddr;
+
+	ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0],
+			      isrh->segments_left - 1);
+
+	hdrlen = ((csrh->hdrlen + 1) << 3);
+
+	err = skb_cow_head(skb, hdrlen + skb->mac_len);
+	if (unlikely(err))
+		goto out;
+
+	skb_pull(skb, sizeof(struct ipv6hdr));
+	skb_postpull_rcsum(skb, skb_network_header(skb),
+			   sizeof(struct ipv6hdr));
+
+	skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+
+	hdr = ipv6_hdr(skb);
+	memmove(hdr, &oldhdr, sizeof(*hdr));
+	isrh = (void *)hdr + sizeof(*hdr);
+	memcpy(isrh, csrh, hdrlen);
+
+	isrh->nexthdr = hdr->nexthdr;
+	hdr->nexthdr = NEXTHDR_ROUTING;
+	hdr->daddr = srh->rpl_segaddr[0];
+
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+	skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+	skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+out:
+	kfree(buf);
+
+	return err;
+}
+
+/* Add the route's SRH to @skb. Only IPv6 packets are accepted; anything
+ * else yields -EINVAL. Otherwise the result of rpl_do_srh_inline() is
+ * returned.
+ */
+static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct rpl_iptunnel_encap *tinfo;
+
+	if (skb->protocol != htons(ETH_P_IPV6))
+		return -EINVAL;
+
+	tinfo = rpl_encap_lwtunnel(dst->lwtstate);
+
+	return rpl_do_srh_inline(skb, rlwt, tinfo->srh);
+}
+
+/* lwtunnel output callback: insert the SRH, route the packet towards its
+ * new destination and hand it to dst_output().
+ *
+ * The route is taken from the per-state dst cache when available;
+ * otherwise it is looked up with ip6_route_output() and cached.
+ *
+ * Returns the result of dst_output(), or a negative errno after freeing
+ * the skb.
+ */
+static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *orig_dst = skb_dst(skb);
+	struct dst_entry *dst = NULL;
+	struct rpl_lwt *rlwt;
+	int err;
+
+	rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
+
+	err = rpl_do_srh(skb, rlwt);
+	if (unlikely(err))
+		goto drop;
+
+	/* the dst_cache helpers must run with BH disabled; disabling
+	 * preemption alone is not sufficient
+	 */
+	local_bh_disable();
+	dst = dst_cache_get(&rlwt->cache);
+	local_bh_enable();
+
+	if (unlikely(!dst)) {
+		struct ipv6hdr *hdr = ipv6_hdr(skb);
+		struct flowi6 fl6;
+
+		memset(&fl6, 0, sizeof(fl6));
+		fl6.daddr = hdr->daddr;
+		fl6.saddr = hdr->saddr;
+		fl6.flowlabel = ip6_flowinfo(hdr);
+		fl6.flowi6_mark = skb->mark;
+		fl6.flowi6_proto = hdr->nexthdr;
+
+		dst = ip6_route_output(net, NULL, &fl6);
+		if (dst->error) {
+			err = dst->error;
+			dst_release(dst);
+			goto drop;
+		}
+
+		local_bh_disable();
+		dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr);
+		local_bh_enable();
+	}
+
+	skb_dst_drop(skb);
+	skb_dst_set(skb, dst);
+
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		goto drop;
+
+	return dst_output(net, sk, skb);
+
+drop:
+	kfree_skb(skb);
+	return err;
+}
+
+/* lwtunnel input callback: insert the SRH, re-route the packet towards its
+ * new destination and hand it to dst_input().
+ *
+ * The route is taken from the per-state dst cache when available;
+ * otherwise ip6_route_input() is used and a successful result is cached.
+ *
+ * Returns the result of dst_input(), or a negative errno after freeing
+ * the skb.
+ */
+static int rpl_input(struct sk_buff *skb)
+{
+	struct dst_entry *orig_dst = skb_dst(skb);
+	struct dst_entry *dst = NULL;
+	struct rpl_lwt *rlwt;
+	int err;
+
+	rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
+
+	err = rpl_do_srh(skb, rlwt);
+	if (unlikely(err))
+		goto drop;
+
+	/* the dst_cache helpers must run with BH disabled; disabling
+	 * preemption alone is not sufficient
+	 */
+	local_bh_disable();
+	dst = dst_cache_get(&rlwt->cache);
+	local_bh_enable();
+
+	skb_dst_drop(skb);
+
+	if (!dst) {
+		ip6_route_input(skb);
+		dst = skb_dst(skb);
+		if (!dst->error) {
+			local_bh_disable();
+			dst_cache_set_ip6(&rlwt->cache, dst,
+					  &ipv6_hdr(skb)->saddr);
+			local_bh_enable();
+		}
+	} else {
+		skb_dst_set(skb, dst);
+	}
+
+	/* on failure the skb must be freed here, exactly as for the
+	 * rpl_do_srh() error above; returning without freeing leaks it
+	 */
+	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+	if (unlikely(err))
+		goto drop;
+
+	return dst_input(skb);
+
+drop:
+	kfree_skb(skb);
+	return err;
+}
+
+/* Append the stored SRH to @skb as a netlink attribute of type @attrtype.
+ * Returns 0 on success or -EMSGSIZE if the attribute cannot be reserved.
+ */
+static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype,
+			   struct rpl_iptunnel_encap *tuninfo)
+{
+	const int len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh);
+	struct nlattr *attr;
+
+	attr = nla_reserve(skb, attrtype, len);
+	if (!attr)
+		return -EMSGSIZE;
+
+	memcpy(nla_data(attr), tuninfo->srh, len);
+
+	return 0;
+}
+
+/* lwtunnel fill_encap callback: dump the SRH of @lwtstate into @skb.
+ * Any failure is reported as -EMSGSIZE.
+ */
+static int rpl_fill_encap_info(struct sk_buff *skb,
+			       struct lwtunnel_state *lwtstate)
+{
+	struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+	int err;
+
+	err = nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo);
+
+	return err ? -EMSGSIZE : 0;
+}
+
+/* lwtunnel get_encap_size callback: netlink space needed to dump the SRH
+ * attribute of @lwtstate.
+ */
+static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	struct rpl_iptunnel_encap *tuninfo;
+
+	tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+	return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh));
+}
+
+/* lwtunnel cmp_encap callback: returns 0 when both states carry SRHs of
+ * the same size with identical bytes, non-zero otherwise.
+ */
+static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct rpl_iptunnel_encap *lhs = rpl_encap_lwtunnel(a);
+	struct rpl_iptunnel_encap *rhs = rpl_encap_lwtunnel(b);
+	int lhs_len = RPL_IPTUNNEL_SRH_SIZE(lhs->srh);
+	int rhs_len = RPL_IPTUNNEL_SRH_SIZE(rhs->srh);
+
+	if (lhs_len == rhs_len)
+		return memcmp(lhs, rhs, lhs_len);
+
+	return 1;
+}
+
+static const struct lwtunnel_encap_ops rpl_ops = { /* callbacks registered for LWTUNNEL_ENCAP_RPL by rpl_init() */
+ .build_state = rpl_build_state,
+ .destroy_state = rpl_destroy_state,
+ .output = rpl_output,
+ .input = rpl_input,
+ .fill_encap = rpl_fill_encap_info,
+ .get_encap_size = rpl_encap_nlsize,
+ .cmp_encap = rpl_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+/* Register the RPL encapsulation ops. Returns 0 on success or the error
+ * reported by lwtunnel_encap_add_ops().
+ */
+int __init rpl_init(void)
+{
+	int err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+
+	if (!err)
+		pr_info("RPL Segment Routing with IPv6\n");
+
+	return err;
+}
+
+void rpl_exit(void) /* unregister the encap ops that rpl_init() registered */
+{
+ lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+}
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index ab7f124ff5d7..c7cbfeae94f5 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -29,7 +29,7 @@
struct seg6_lwt {
struct dst_cache cache;
- struct seg6_iptunnel_encap tuninfo[0];
+ struct seg6_iptunnel_encap tuninfo[];
};
static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
@@ -268,7 +268,7 @@ static int seg6_do_srh(struct sk_buff *skb)
skb_mac_header_rebuild(skb);
skb_push(skb, skb->mac_len);
- err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE);
+ err = seg6_do_srh_encap(skb, tinfo->srh, IPPROTO_ETHERNET);
if (err)
return err;
@@ -376,7 +376,7 @@ drop:
return err;
}
-static int seg6_build_state(struct nlattr *nla,
+static int seg6_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 7cbc19731997..52493423f329 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -282,7 +282,7 @@ static int input_action_end_dx2(struct sk_buff *skb,
struct net_device *odev;
struct ethhdr *eth;
- if (!decap_and_validate(skb, NEXTHDR_NONE))
+ if (!decap_and_validate(skb, IPPROTO_ETHERNET))
goto drop;
if (!pskb_may_pull(skb, ETH_HLEN))
@@ -970,8 +970,9 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
return 0;
}
-static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
- const void *cfg, struct lwtunnel_state **ts,
+static int seg6_local_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[SEG6_LOCAL_MAX + 1];
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index ec8fcfc60a27..63b657aa8d29 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -203,29 +203,16 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
struct ctl_table *ipv6_table;
struct ctl_table *ipv6_route_table;
struct ctl_table *ipv6_icmp_table;
- int err;
+ int err, i;
err = -ENOMEM;
ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template),
GFP_KERNEL);
if (!ipv6_table)
goto out;
- ipv6_table[0].data = &net->ipv6.sysctl.bindv6only;
- ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply;
- ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency;
- ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels;
- ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect;
- ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
- ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
- ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
- ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind;
- ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect;
- ipv6_table[10].data = &net->ipv6.sysctl.max_dst_opts_cnt;
- ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt;
- ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
- ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
- ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
- ipv6_table[15].data = &net->ipv6.sysctl.seg6_flowlabel;
+ /* Update the variables to point into the current struct net */
+ for (i = 0; i < ARRAY_SIZE(ipv6_table_template) - 1; i++)
+ ipv6_table[i].data += (void *)net - (void *)&init_net;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index eaf09e6b7844..413b3425ac66 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1742,7 +1742,7 @@ do_time_wait:
}
}
/* to ACK */
- /* fall through */
+ fallthrough;
case TCP_TW_ACK:
tcp_v6_timewait_ack(sk, skb);
break;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5dc439a391fe..7d4151747340 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -843,6 +843,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
struct net *net = dev_net(skb->dev);
struct udphdr *uh;
struct sock *sk;
+ bool refcounted;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -879,7 +880,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
goto csum_error;
/* Check if the socket is already available, e.g. due to early demux */
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -888,12 +889,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
udp6_sk_rx_dst_set(sk, dst);
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
goto report_csum_error;
}
ret = udp6_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index e11bdb0aaa15..25b7ebda2fab 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -78,7 +78,7 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const
hlist_for_each_entry_rcu(x6spi,
&xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)],
- list_byaddr) {
+ list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) {
if (xfrm6_addr_equal(&x6spi->addr, saddr))
return x6spi;
}
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 370da2f80e3c..25c1007f1098 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -261,7 +261,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
aggregate_strp_stats(&knet->aggregate_strp_stats,
&strp_stats);
- list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) {
+ list_for_each_entry(mux, &knet->mux_list, kcm_mux_list) {
spin_lock_bh(&mux->lock);
aggregate_mux_stats(&mux->stats, &mux_stats);
aggregate_psock_stats(&mux->aggregate_psock_stats,
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index ea9e73428ed9..56fac24a627a 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -380,9 +380,7 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
struct bpf_prog *prog = psock->bpf_prog;
int res;
- preempt_disable();
- res = BPF_PROG_RUN(prog, skb);
- preempt_enable();
+ res = bpf_prog_run_pin_on_cpu(prog, skb);
return res;
}
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 2db3d50d10a4..10cf7c3dcbb3 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -116,7 +116,7 @@ struct l2tp_session {
void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len);
void (*session_close)(struct l2tp_session *session);
void (*show)(struct seq_file *m, void *priv);
- uint8_t priv[0]; /* private data */
+ u8 priv[]; /* private data */
};
/* Describes the tunnel. It contains info to track all the associated
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 2922d4150d88..54fb8d452a7b 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -47,7 +47,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
#if 0
#define dprintk(args...) printk(KERN_DEBUG args)
#else
-#define dprintk(args...)
+#define dprintk(args...) do {} while (0)
#endif
/* Maybe we'll add some more in the future. */
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index f3a36c16a5e7..a4eccb98220a 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -56,7 +56,7 @@ found:
return sk;
}
-static void *llc_seq_start(struct seq_file *seq, loff_t *pos)
+static void *llc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU)
{
loff_t l = *pos;
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index 57748cab0e28..b31f1021ad9c 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -26,12 +26,20 @@ void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
{
SHASH_DESC_ON_STACK(desc, tfm);
u8 out[AES_BLOCK_SIZE];
+ const __le16 *fc;
desc->tfm = tfm;
crypto_shash_init(desc);
crypto_shash_update(desc, aad, AAD_LEN);
- crypto_shash_update(desc, data, data_len - CMAC_TLEN);
+ fc = (const __le16 *)aad;
+ if (ieee80211_is_beacon(*fc)) {
+ /* mask Timestamp field to zero */
+ crypto_shash_update(desc, zero, 8);
+ crypto_shash_update(desc, data + 8, data_len - 8 - CMAC_TLEN);
+ } else {
+ crypto_shash_update(desc, data, data_len - CMAC_TLEN);
+ }
crypto_shash_finup(desc, zero, CMAC_TLEN, out);
memcpy(mic, out, CMAC_TLEN);
@@ -41,12 +49,21 @@ void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
const u8 *data, size_t data_len, u8 *mic)
{
SHASH_DESC_ON_STACK(desc, tfm);
+ const __le16 *fc;
desc->tfm = tfm;
crypto_shash_init(desc);
crypto_shash_update(desc, aad, AAD_LEN);
- crypto_shash_update(desc, data, data_len - CMAC_TLEN_256);
+ fc = (const __le16 *)aad;
+ if (ieee80211_is_beacon(*fc)) {
+ /* mask Timestamp field to zero */
+ crypto_shash_update(desc, zero, 8);
+ crypto_shash_update(desc, data + 8,
+ data_len - 8 - CMAC_TLEN_256);
+ } else {
+ crypto_shash_update(desc, data, data_len - CMAC_TLEN_256);
+ }
crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic);
}
diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c
index 363ad1c1dc0c..16ba09cb5def 100644
--- a/net/mac80211/aes_gmac.c
+++ b/net/mac80211/aes_gmac.c
@@ -17,10 +17,11 @@
int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
const u8 *data, size_t data_len, u8 *mic)
{
- struct scatterlist sg[4];
+ struct scatterlist sg[5];
u8 *zero, *__aad, iv[AES_BLOCK_SIZE];
struct aead_request *aead_req;
int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
+ const __le16 *fc;
if (data_len < GMAC_MIC_LEN)
return -EINVAL;
@@ -33,11 +34,22 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
__aad = zero + GMAC_MIC_LEN;
memcpy(__aad, aad, GMAC_AAD_LEN);
- sg_init_table(sg, 4);
- sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN);
- sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN);
- sg_set_buf(&sg[2], zero, GMAC_MIC_LEN);
- sg_set_buf(&sg[3], mic, GMAC_MIC_LEN);
+ fc = (const __le16 *)aad;
+ if (ieee80211_is_beacon(*fc)) {
+ /* mask Timestamp field to zero */
+ sg_init_table(sg, 5);
+ sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN);
+ sg_set_buf(&sg[1], zero, 8);
+ sg_set_buf(&sg[2], data + 8, data_len - 8 - GMAC_MIC_LEN);
+ sg_set_buf(&sg[3], zero, GMAC_MIC_LEN);
+ sg_set_buf(&sg[4], mic, GMAC_MIC_LEN);
+ } else {
+ sg_init_table(sg, 4);
+ sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN);
+ sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN);
+ sg_set_buf(&sg[2], zero, GMAC_MIC_LEN);
+ sg_set_buf(&sg[3], mic, GMAC_MIC_LEN);
+ }
memcpy(iv, nonce, GMAC_NONCE_LEN);
memset(iv + GMAC_NONCE_LEN, 0, sizeof(iv) - GMAC_NONCE_LEN);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 000c742d0527..0f72813fed53 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -5,8 +5,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2019 Intel Corporation
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#include <linux/ieee80211.h>
@@ -568,7 +567,8 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
if (pairwise && key_idx < NUM_DEFAULT_KEYS)
key = rcu_dereference(sta->ptk[key_idx]);
else if (!pairwise &&
- key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
+ key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS +
+ NUM_DEFAULT_BEACON_KEYS)
key = rcu_dereference(sta->gtk[key_idx]);
} else
key = rcu_dereference(sdata->keys[key_idx]);
@@ -680,6 +680,17 @@ static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy,
return 0;
}
+/* cfg80211 set_default_beacon_key handler: select @key_idx as the default
+ * beacon key of the interface behind @dev. Always returns 0.
+ */
+static int ieee80211_config_default_beacon_key(struct wiphy *wiphy,
+					       struct net_device *dev,
+					       u8 key_idx)
+{
+	ieee80211_set_default_beacon_key(IEEE80211_DEV_TO_SUB_IF(dev),
+					 key_idx);
+
+	return 0;
+}
+
void sta_set_rate_info_tx(struct sta_info *sta,
const struct ieee80211_tx_rate *rate,
struct rate_info *rinfo)
@@ -981,7 +992,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
BSS_CHANGED_P2P_PS |
BSS_CHANGED_TXPOWER |
BSS_CHANGED_TWT |
- BSS_CHANGED_HE_OBSS_PD;
+ BSS_CHANGED_HE_OBSS_PD |
+ BSS_CHANGED_HE_BSS_COLOR;
int err;
int prev_beacon_int;
@@ -989,28 +1001,25 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
if (old)
return -EALREADY;
- switch (params->smps_mode) {
- case NL80211_SMPS_OFF:
- sdata->smps_mode = IEEE80211_SMPS_OFF;
- break;
- case NL80211_SMPS_STATIC:
- sdata->smps_mode = IEEE80211_SMPS_STATIC;
- break;
- case NL80211_SMPS_DYNAMIC:
- sdata->smps_mode = IEEE80211_SMPS_DYNAMIC;
- break;
- default:
- return -EINVAL;
- }
- sdata->u.ap.req_smps = sdata->smps_mode;
+ if (params->smps_mode != NL80211_SMPS_OFF)
+ return -ENOTSUPP;
+
+ sdata->smps_mode = IEEE80211_SMPS_OFF;
sdata->needed_rx_chains = sdata->local->rx_chains;
prev_beacon_int = sdata->vif.bss_conf.beacon_int;
sdata->vif.bss_conf.beacon_int = params->beacon_interval;
- if (params->he_cap)
+ if (params->he_cap && params->he_oper) {
sdata->vif.bss_conf.he_support = true;
+ sdata->vif.bss_conf.htc_trig_based_pkt_ext =
+ le32_get_bits(params->he_oper->he_oper_params,
+ IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK);
+ sdata->vif.bss_conf.frame_time_rts_th =
+ le32_get_bits(params->he_oper->he_oper_params,
+ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK);
+ }
mutex_lock(&local->mtx);
err = ieee80211_vif_use_channel(sdata, &params->chandef,
@@ -1031,6 +1040,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt;
sdata->control_port_over_nl80211 =
params->crypto.control_port_over_nl80211;
+ sdata->control_port_no_preauth =
+ params->crypto.control_port_no_preauth;
sdata->encrypt_headroom = ieee80211_cs_headroom(sdata->local,
&params->crypto,
sdata->vif.type);
@@ -1042,6 +1053,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
params->crypto.control_port_no_encrypt;
vlan->control_port_over_nl80211 =
params->crypto.control_port_over_nl80211;
+ vlan->control_port_no_preauth =
+ params->crypto.control_port_no_preauth;
vlan->encrypt_headroom =
ieee80211_cs_headroom(sdata->local,
&params->crypto,
@@ -1054,6 +1067,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->vif.bss_conf.twt_responder = params->twt_responder;
memcpy(&sdata->vif.bss_conf.he_obss_pd, &params->he_obss_pd,
sizeof(struct ieee80211_he_obss_pd));
+ memcpy(&sdata->vif.bss_conf.he_bss_color, &params->he_bss_color,
+ sizeof(struct ieee80211_he_bss_color));
sdata->vif.bss_conf.ssid_len = params->ssid_len;
if (params->ssid_len)
@@ -1166,7 +1181,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
kfree_rcu(old_beacon, rcu_head);
if (old_probe_resp)
kfree_rcu(old_probe_resp, rcu_head);
- sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF;
kfree(sdata->vif.bss_conf.ftmr_params);
sdata->vif.bss_conf.ftmr_params = NULL;
@@ -1691,20 +1705,6 @@ static int ieee80211_change_station(struct wiphy *wiphy,
mutex_unlock(&local->sta_mtx);
- if ((sdata->vif.type == NL80211_IFTYPE_AP ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
- sta->known_smps_mode != sta->sdata->bss->req_smps &&
- test_sta_flag(sta, WLAN_STA_AUTHORIZED) &&
- sta_info_tx_streams(sta) != 1) {
- ht_dbg(sta->sdata,
- "%pM just authorized and MIMO capable - update SMPS\n",
- sta->sta.addr);
- ieee80211_send_smps_action(sta->sdata,
- sta->sdata->bss->req_smps,
- sta->sta.addr,
- sta->sdata->vif.bss_conf.bssid);
- }
-
if (sdata->vif.type == NL80211_IFTYPE_STATION &&
params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
ieee80211_recalc_ps(local);
@@ -2636,74 +2636,6 @@ static int ieee80211_testmode_dump(struct wiphy *wiphy,
}
#endif
-int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
- enum ieee80211_smps_mode smps_mode)
-{
- struct sta_info *sta;
- enum ieee80211_smps_mode old_req;
-
- if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP))
- return -EINVAL;
-
- if (sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
- return 0;
-
- old_req = sdata->u.ap.req_smps;
- sdata->u.ap.req_smps = smps_mode;
-
- /* AUTOMATIC doesn't mean much for AP - don't allow it */
- if (old_req == smps_mode ||
- smps_mode == IEEE80211_SMPS_AUTOMATIC)
- return 0;
-
- ht_dbg(sdata,
- "SMPS %d requested in AP mode, sending Action frame to %d stations\n",
- smps_mode, atomic_read(&sdata->u.ap.num_mcast_sta));
-
- mutex_lock(&sdata->local->sta_mtx);
- list_for_each_entry(sta, &sdata->local->sta_list, list) {
- /*
- * Only stations associated to our AP and
- * associated VLANs
- */
- if (sta->sdata->bss != &sdata->u.ap)
- continue;
-
- /* This station doesn't support MIMO - skip it */
- if (sta_info_tx_streams(sta) == 1)
- continue;
-
- /*
- * Don't wake up a STA just to send the action frame
- * unless we are getting more restrictive.
- */
- if (test_sta_flag(sta, WLAN_STA_PS_STA) &&
- !ieee80211_smps_is_restrictive(sta->known_smps_mode,
- smps_mode)) {
- ht_dbg(sdata, "Won't send SMPS to sleeping STA %pM\n",
- sta->sta.addr);
- continue;
- }
-
- /*
- * If the STA is not authorized, wait until it gets
- * authorized and the action frame will be sent then.
- */
- if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
- continue;
-
- ht_dbg(sdata, "Sending SMPS to %pM\n", sta->sta.addr);
- ieee80211_send_smps_action(sdata, smps_mode, sta->sta.addr,
- sdata->vif.bss_conf.bssid);
- }
- mutex_unlock(&sdata->local->sta_mtx);
-
- sdata->smps_mode = smps_mode;
- ieee80211_queue_work(&sdata->local->hw, &sdata->recalc_smps);
-
- return 0;
-}
-
int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
enum ieee80211_smps_mode smps_mode)
{
@@ -3450,7 +3382,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
spin_lock_irqsave(&local->ack_status_lock, spin_flags);
id = idr_alloc(&local->ack_status_frames, ack_skb,
- 1, 0x40, GFP_ATOMIC);
+ 1, 0x2000, GFP_ATOMIC);
spin_unlock_irqrestore(&local->ack_status_lock, spin_flags);
if (id < 0) {
@@ -3964,6 +3896,60 @@ ieee80211_abort_pmsr(struct wiphy *wiphy, struct wireless_dev *dev,
return drv_abort_pmsr(local, sdata, request);
}
+static int ieee80211_set_tid_config(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_tid_config *tid_conf)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct sta_info *sta;
+ int ret;
+
+ if (!sdata->local->ops->set_tid_config)
+ return -EOPNOTSUPP;
+
+ if (!tid_conf->peer)
+ return drv_set_tid_config(sdata->local, sdata, NULL, tid_conf);
+
+ mutex_lock(&sdata->local->sta_mtx);
+ sta = sta_info_get_bss(sdata, tid_conf->peer);
+ if (!sta) {
+ mutex_unlock(&sdata->local->sta_mtx);
+ return -ENOENT;
+ }
+
+ ret = drv_set_tid_config(sdata->local, sdata, &sta->sta, tid_conf);
+ mutex_unlock(&sdata->local->sta_mtx);
+
+ return ret;
+}
+
+static int ieee80211_reset_tid_config(struct wiphy *wiphy,
+ struct net_device *dev,
+ const u8 *peer, u8 tid)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct sta_info *sta;
+ int ret;
+
+ if (!sdata->local->ops->reset_tid_config)
+ return -EOPNOTSUPP;
+
+ if (!peer)
+ return drv_reset_tid_config(sdata->local, sdata, NULL, tid);
+
+ mutex_lock(&sdata->local->sta_mtx);
+ sta = sta_info_get_bss(sdata, peer);
+ if (!sta) {
+ mutex_unlock(&sdata->local->sta_mtx);
+ return -ENOENT;
+ }
+
+ ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tid);
+ mutex_unlock(&sdata->local->sta_mtx);
+
+ return ret;
+}
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -3975,6 +3961,7 @@ const struct cfg80211_ops mac80211_config_ops = {
.get_key = ieee80211_get_key,
.set_default_key = ieee80211_config_default_key,
.set_default_mgmt_key = ieee80211_config_default_mgmt_key,
+ .set_default_beacon_key = ieee80211_config_default_beacon_key,
.start_ap = ieee80211_start_ap,
.change_beacon = ieee80211_change_beacon,
.stop_ap = ieee80211_stop_ap,
@@ -4063,4 +4050,6 @@ const struct cfg80211_ops mac80211_config_ops = {
.start_pmsr = ieee80211_start_pmsr,
.abort_pmsr = ieee80211_abort_pmsr,
.probe_mesh_link = ieee80211_probe_mesh_link,
+ .set_tid_config = ieee80211_set_tid_config,
+ .reset_tid_config = ieee80211_reset_tid_config,
};
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index ad41d74530c6..54080290d6e2 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -150,6 +150,59 @@ static const struct file_operations aqm_ops = {
.llseek = default_llseek,
};
+static ssize_t airtime_flags_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[128] = {}, *pos, *end;
+
+ pos = buf;
+ end = pos + sizeof(buf) - 1;
+
+ if (local->airtime_flags & AIRTIME_USE_TX)
+ pos += scnprintf(pos, end - pos, "AIRTIME_TX\t(%lx)\n",
+ AIRTIME_USE_TX);
+ if (local->airtime_flags & AIRTIME_USE_RX)
+ pos += scnprintf(pos, end - pos, "AIRTIME_RX\t(%lx)\n",
+ AIRTIME_USE_RX);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf,
+ strlen(buf));
+}
+
+static ssize_t airtime_flags_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[16];
+ size_t len;
+
+ if (count > sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ buf[sizeof(buf) - 1] = 0;
+ len = strlen(buf);
+ if (len > 0 && buf[len - 1] == '\n')
+ buf[len - 1] = 0;
+
+ if (kstrtou16(buf, 0, &local->airtime_flags))
+ return -EINVAL;
+
+ return count;
+}
+
+static const struct file_operations airtime_flags_ops = {
+ .write = airtime_flags_write,
+ .read = airtime_flags_read,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
static ssize_t aql_txq_limit_read(struct file *file,
char __user *user_buf,
size_t count,
@@ -522,8 +575,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
if (local->ops->wake_tx_queue)
DEBUGFS_ADD_MODE(aqm, 0600);
- debugfs_create_u16("airtime_flags", 0600,
- phyd, &local->airtime_flags);
+ DEBUGFS_ADD_MODE(airtime_flags, 0600);
DEBUGFS_ADD(aql_txq_limit);
debugfs_create_u32("aql_threshold", 0600,
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 7b8735ced2a1..98a713475e0f 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -433,6 +433,37 @@ void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sda
sdata->debugfs.default_mgmt_key = NULL;
}
+void
+ieee80211_debugfs_key_add_beacon_default(struct ieee80211_sub_if_data *sdata)
+{
+ char buf[50];
+ struct ieee80211_key *key;
+
+ if (!sdata->vif.debugfs_dir)
+ return;
+
+ key = key_mtx_dereference(sdata->local,
+ sdata->default_beacon_key);
+ if (key) {
+ sprintf(buf, "../keys/%d", key->debugfs.cnt);
+ sdata->debugfs.default_beacon_key =
+ debugfs_create_symlink("default_beacon_key",
+ sdata->vif.debugfs_dir, buf);
+ } else {
+ ieee80211_debugfs_key_remove_beacon_default(sdata);
+ }
+}
+
+void
+ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata)
+{
+ if (!sdata)
+ return;
+
+ debugfs_remove(sdata->debugfs.default_beacon_key);
+ sdata->debugfs.default_beacon_key = NULL;
+}
+
void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
struct sta_info *sta)
{
diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h
index 1cd7b8bff56c..af7cf495f8d1 100644
--- a/net/mac80211/debugfs_key.h
+++ b/net/mac80211/debugfs_key.h
@@ -10,6 +10,10 @@ void ieee80211_debugfs_key_add_mgmt_default(
struct ieee80211_sub_if_data *sdata);
void ieee80211_debugfs_key_remove_mgmt_default(
struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_key_add_beacon_default(
+ struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_key_remove_beacon_default(
+ struct ieee80211_sub_if_data *sdata);
void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
struct sta_info *sta);
#else
@@ -26,6 +30,12 @@ static inline void ieee80211_debugfs_key_add_mgmt_default(
static inline void ieee80211_debugfs_key_remove_mgmt_default(
struct ieee80211_sub_if_data *sdata)
{}
+static inline void ieee80211_debugfs_key_add_beacon_default(
+ struct ieee80211_sub_if_data *sdata)
+{}
+static inline void ieee80211_debugfs_key_remove_beacon_default(
+ struct ieee80211_sub_if_data *sdata)
+{}
static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
struct sta_info *sta)
{}
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 64b544ae9966..3dbe7c5cefd1 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -2,6 +2,7 @@
/*
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
+ * Copyright (C) 2020 Intel Corporation
*/
#include <linux/kernel.h>
@@ -254,15 +255,11 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
smps_mode == IEEE80211_SMPS_AUTOMATIC))
return -EINVAL;
- if (sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_AP)
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EOPNOTSUPP;
sdata_lock(sdata);
- if (sdata->vif.type == NL80211_IFTYPE_STATION)
- err = __ieee80211_request_smps_mgd(sdata, smps_mode);
- else
- err = __ieee80211_request_smps_ap(sdata, smps_mode);
+ err = __ieee80211_request_smps_mgd(sdata, smps_mode);
sdata_unlock(sdata);
return err;
@@ -282,10 +279,6 @@ static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata,
return snprintf(buf, buflen, "request: %s\nused: %s\n",
smps_modes[sdata->u.mgd.req_smps],
smps_modes[sdata->smps_mode]);
- if (sdata->vif.type == NL80211_IFTYPE_AP)
- return snprintf(buf, buflen, "request: %s\nused: %s\n",
- smps_modes[sdata->u.ap.req_smps],
- smps_modes[sdata->smps_mode]);
return -EINVAL;
}
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index c80b1e163ea4..829dcad69c2c 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -5,7 +5,7 @@
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2016 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2019 Intel Corporation
+ * Copyright (C) 2018 - 2020 Intel Corporation
*/
#include <linux/debugfs.h>
@@ -78,6 +78,7 @@ static const char * const sta_flag_names[] = {
FLAG(MPSP_OWNER),
FLAG(MPSP_RECIPIENT),
FLAG(PS_DELIVER),
+ FLAG(USES_ENCRYPTION),
#undef FLAG
};
@@ -1025,12 +1026,10 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta)
DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments);
DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered);
- if (local->ops->wake_tx_queue)
+ if (local->ops->wake_tx_queue) {
DEBUGFS_ADD(aqm);
-
- if (wiphy_ext_feature_isset(local->hw.wiphy,
- NL80211_EXT_FEATURE_AIRTIME_FAIRNESS))
DEBUGFS_ADD(airtime);
+ }
if (wiphy_ext_feature_isset(local->hw.wiphy,
NL80211_EXT_FEATURE_AQL))
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 2c9b3eb8b652..3877710e3b48 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1358,4 +1358,31 @@ static inline void drv_del_nan_func(struct ieee80211_local *local,
trace_drv_return_void(local);
}
+static inline int drv_set_tid_config(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ struct cfg80211_tid_config *tid_conf)
+{
+ int ret;
+
+ might_sleep();
+ ret = local->ops->set_tid_config(&local->hw, &sdata->vif, sta,
+ tid_conf);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
+static inline int drv_reset_tid_config(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta, u8 tid)
+{
+ int ret;
+
+ might_sleep();
+ ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tid);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
#endif /* __MAC80211_DRIVER_OPS */
diff --git a/net/mac80211/he.c b/net/mac80211/he.c
index 736da0035135..1087f715338b 100644
--- a/net/mac80211/he.c
+++ b/net/mac80211/he.c
@@ -3,6 +3,7 @@
* HE handling
*
* Copyright(c) 2017 Intel Deutschland GmbH
+ * Copyright(c) 2019 - 2020 Intel Corporation
*/
#include "ieee80211_i.h"
@@ -49,6 +50,9 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata,
he_ppe_size);
he_cap->has_he = true;
+
+ sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta);
+ sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
}
void
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index a2e4d6b8fd98..e32906202575 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -9,6 +9,7 @@
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2007-2010, Intel Corporation
* Copyright 2017 Intel Deutschland GmbH
+ * Copyright(c) 2020 Intel Corporation
*/
#include <linux/ieee80211.h>
@@ -144,7 +145,6 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
int i, max_tx_streams;
bool changed;
enum ieee80211_sta_rx_bandwidth bw;
- enum ieee80211_smps_mode smps_mode;
memset(&ht_cap, 0, sizeof(ht_cap));
@@ -270,24 +270,30 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20;
- switch ((ht_cap.cap & IEEE80211_HT_CAP_SM_PS)
- >> IEEE80211_HT_CAP_SM_PS_SHIFT) {
- case WLAN_HT_CAP_SM_PS_INVALID:
- case WLAN_HT_CAP_SM_PS_STATIC:
- smps_mode = IEEE80211_SMPS_STATIC;
- break;
- case WLAN_HT_CAP_SM_PS_DYNAMIC:
- smps_mode = IEEE80211_SMPS_DYNAMIC;
- break;
- case WLAN_HT_CAP_SM_PS_DISABLED:
- smps_mode = IEEE80211_SMPS_OFF;
- break;
- }
-
- if (smps_mode != sta->sta.smps_mode)
- changed = true;
- sta->sta.smps_mode = smps_mode;
+ if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
+ sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
+ enum ieee80211_smps_mode smps_mode;
+
+ switch ((ht_cap.cap & IEEE80211_HT_CAP_SM_PS)
+ >> IEEE80211_HT_CAP_SM_PS_SHIFT) {
+ case WLAN_HT_CAP_SM_PS_INVALID:
+ case WLAN_HT_CAP_SM_PS_STATIC:
+ smps_mode = IEEE80211_SMPS_STATIC;
+ break;
+ case WLAN_HT_CAP_SM_PS_DYNAMIC:
+ smps_mode = IEEE80211_SMPS_DYNAMIC;
+ break;
+ case WLAN_HT_CAP_SM_PS_DISABLED:
+ smps_mode = IEEE80211_SMPS_OFF;
+ break;
+ }
+ if (smps_mode != sta->sta.smps_mode)
+ changed = true;
+ sta->sta.smps_mode = smps_mode;
+ } else {
+ sta->sta.smps_mode = IEEE80211_SMPS_OFF;
+ }
return changed;
}
@@ -544,19 +550,6 @@ void ieee80211_request_smps_mgd_work(struct work_struct *work)
sdata_unlock(sdata);
}
-void ieee80211_request_smps_ap_work(struct work_struct *work)
-{
- struct ieee80211_sub_if_data *sdata =
- container_of(work, struct ieee80211_sub_if_data,
- u.ap.request_smps_work);
-
- sdata_lock(sdata);
- if (sdata_dereference(sdata->u.ap.beacon, sdata))
- __ieee80211_request_smps_ap(sdata,
- sdata->u.ap.driver_smps_mode);
- sdata_unlock(sdata);
-}
-
void ieee80211_request_smps(struct ieee80211_vif *vif,
enum ieee80211_smps_mode smps_mode)
{
@@ -572,15 +565,6 @@ void ieee80211_request_smps(struct ieee80211_vif *vif,
sdata->u.mgd.driver_smps_mode = smps_mode;
ieee80211_queue_work(&sdata->local->hw,
&sdata->u.mgd.request_smps_work);
- } else {
- /* AUTOMATIC is meaningless in AP mode */
- if (WARN_ON_ONCE(smps_mode == IEEE80211_SMPS_AUTOMATIC))
- return;
- if (sdata->u.ap.driver_smps_mode == smps_mode)
- return;
- sdata->u.ap.driver_smps_mode = smps_mode;
- ieee80211_queue_work(&sdata->local->hw,
- &sdata->u.ap.request_smps_work);
}
}
/* this might change ... don't want non-open drivers using it */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index ad15b3be8bb3..f8ed4f621f7f 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -5,7 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#ifndef IEEE80211_I_H
@@ -292,10 +292,7 @@ struct ieee80211_if_ap {
struct ps_data ps;
atomic_t num_mcast_sta; /* number of stations receiving multicast */
- enum ieee80211_smps_mode req_smps, /* requested smps mode */
- driver_smps_mode; /* smps mode request */
- struct work_struct request_smps_work;
bool multicast_to_unicast;
};
@@ -904,14 +901,18 @@ struct ieee80211_sub_if_data {
/* bit field of ACM bits (BIT(802.1D tag)) */
u8 wmm_acm;
- struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
+ struct ieee80211_key __rcu *keys[NUM_DEFAULT_KEYS +
+ NUM_DEFAULT_MGMT_KEYS +
+ NUM_DEFAULT_BEACON_KEYS];
struct ieee80211_key __rcu *default_unicast_key;
struct ieee80211_key __rcu *default_multicast_key;
struct ieee80211_key __rcu *default_mgmt_key;
+ struct ieee80211_key __rcu *default_beacon_key;
u16 sequence_number;
__be16 control_port_protocol;
bool control_port_no_encrypt;
+ bool control_port_no_preauth;
bool control_port_over_nl80211;
int encrypt_headroom;
@@ -981,9 +982,12 @@ struct ieee80211_sub_if_data {
struct dentry *default_unicast_key;
struct dentry *default_multicast_key;
struct dentry *default_mgmt_key;
+ struct dentry *default_beacon_key;
} debugfs;
#endif
+ bool hw_80211_encap;
+
/* must be last, dynamically sized area in this! */
struct ieee80211_vif vif;
};
@@ -1473,6 +1477,7 @@ struct ieee802_11_elems {
const struct ieee80211_tim_ie *tim;
const u8 *challenge;
const u8 *rsn;
+ const u8 *rsnx;
const u8 *erp_info;
const u8 *ext_supp_rates;
const u8 *wmm_info;
@@ -1520,6 +1525,7 @@ struct ieee802_11_elems {
u8 tim_len;
u8 challenge_len;
u8 rsn_len;
+ u8 rsnx_len;
u8 ext_supp_rates_len;
u8 wmm_info_len;
u8 wmm_param_len;
@@ -1727,6 +1733,13 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
struct cfg80211_csa_settings *params);
/* interface handling */
+#define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
+ NETIF_F_HW_CSUM | NETIF_F_SG | \
+ NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE)
+#define MAC80211_SUPPORTED_FEATURES_RX (NETIF_F_RXCSUM)
+#define MAC80211_SUPPORTED_FEATURES (MAC80211_SUPPORTED_FEATURES_TX | \
+ MAC80211_SUPPORTED_FEATURES_RX)
+
int ieee80211_iface_init(void);
void ieee80211_iface_exit(void);
int ieee80211_if_add(struct ieee80211_local *local, const char *name,
@@ -1762,6 +1775,8 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
struct net_device *dev);
netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
struct net_device *dev);
+netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
+ struct net_device *dev);
void __ieee80211_subif_start_xmit(struct sk_buff *skb,
struct net_device *dev,
u32 info_flags,
@@ -1948,6 +1963,11 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
enum nl80211_band band, u32 txdata_flags);
+/* sta_out needs to be checked for ERR_PTR() before using */
+int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct sta_info **sta_out);
+
static inline void
ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid,
@@ -2132,8 +2152,6 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata,
enum nl80211_band band, u32 *basic_rates);
int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
enum ieee80211_smps_mode smps_mode);
-int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
- enum ieee80211_smps_mode smps_mode);
void ieee80211_recalc_smps(struct ieee80211_sub_if_data *sdata);
void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index af8b09214786..d069825705d6 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -8,7 +8,7 @@
* Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (c) 2016 Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -519,6 +519,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
master->control_port_no_encrypt;
sdata->control_port_over_nl80211 =
master->control_port_over_nl80211;
+ sdata->control_port_no_preauth =
+ master->control_port_no_preauth;
sdata->vif.cab_queue = master->vif.cab_queue;
memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
sizeof(sdata->vif.hw_queue));
@@ -824,9 +826,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_ADHOC:
ieee80211_ibss_stop(sdata);
break;
- case NL80211_IFTYPE_AP:
- cancel_work_sync(&sdata->u.ap.request_smps_work);
- break;
case NL80211_IFTYPE_MONITOR:
if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)
break;
@@ -1205,6 +1204,72 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
.ndo_get_stats64 = ieee80211_get_stats64,
};
+static const struct net_device_ops ieee80211_dataif_8023_ops = {
+ .ndo_open = ieee80211_open,
+ .ndo_stop = ieee80211_stop,
+ .ndo_uninit = ieee80211_uninit,
+ .ndo_start_xmit = ieee80211_subif_start_xmit_8023,
+ .ndo_set_rx_mode = ieee80211_set_multicast_list,
+ .ndo_set_mac_address = ieee80211_change_mac,
+ .ndo_select_queue = ieee80211_netdev_select_queue,
+ .ndo_get_stats64 = ieee80211_get_stats64,
+};
+
+static void __ieee80211_set_hw_80211_encap(struct ieee80211_sub_if_data *sdata,
+ bool enable)
+{
+ sdata->dev->netdev_ops = enable ? &ieee80211_dataif_8023_ops :
+ &ieee80211_dataif_ops;
+ sdata->hw_80211_encap = enable;
+}
+
+bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_sub_if_data *iter;
+ struct ieee80211_key *key;
+
+ mutex_lock(&local->iflist_mtx);
+ list_for_each_entry(iter, &local->interfaces, list) {
+ struct ieee80211_sub_if_data *disable = NULL;
+
+ if (vif->type == NL80211_IFTYPE_MONITOR) {
+ disable = iter;
+ __ieee80211_set_hw_80211_encap(iter, false);
+ } else if (iter->vif.type == NL80211_IFTYPE_MONITOR) {
+ disable = sdata;
+ enable = false;
+ }
+ if (disable)
+ sdata_dbg(disable,
+ "disable hw 80211 encap due to mon co-exist\n");
+ }
+ mutex_unlock(&local->iflist_mtx);
+
+ if (enable == sdata->hw_80211_encap)
+ return enable;
+
+ if (!sdata->dev)
+ return false;
+
+ if (!ieee80211_hw_check(&local->hw, SUPPORTS_TX_FRAG) &&
+ (local->hw.wiphy->frag_threshold != (u32)-1))
+ enable = false;
+
+ mutex_lock(&sdata->local->key_mtx);
+ list_for_each_entry(key, &sdata->key_list, list) {
+ if (key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)
+ enable = false;
+ }
+ mutex_unlock(&sdata->local->key_mtx);
+
+ __ieee80211_set_hw_80211_encap(sdata, enable);
+
+ return enable;
+}
+EXPORT_SYMBOL(ieee80211_set_hw_80211_encap);
+
static void ieee80211_if_free(struct net_device *dev)
{
free_percpu(dev->tstats);
@@ -1400,10 +1465,14 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE);
sdata->control_port_no_encrypt = false;
+ sdata->control_port_over_nl80211 = false;
+ sdata->control_port_no_preauth = false;
sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM;
sdata->vif.bss_conf.idle = true;
+ sdata->vif.bss_conf.txpower = INT_MIN; /* unset */
sdata->noack_map = 0;
+ sdata->hw_80211_encap = false;
/* only monitor/p2p-device differ */
if (sdata->dev) {
@@ -1427,10 +1496,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_AP:
skb_queue_head_init(&sdata->u.ap.ps.bc_buf);
INIT_LIST_HEAD(&sdata->u.ap.vlans);
- INIT_WORK(&sdata->u.ap.request_smps_work,
- ieee80211_request_smps_ap_work);
sdata->vif.bss_conf.bssid = sdata->vif.addr;
- sdata->u.ap.req_smps = IEEE80211_SMPS_OFF;
break;
case NL80211_IFTYPE_P2P_CLIENT:
type = NL80211_IFTYPE_STATION;
@@ -1772,6 +1838,10 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
if_setup, txqs, 1);
if (!ndev)
return -ENOMEM;
+
+ if (!local->ops->wake_tx_queue && local->hw.wiphy->tx_queue_len)
+ ndev->tx_queue_len = local->hw.wiphy->tx_queue_len;
+
dev_net_set(ndev, wiphy_net(local->hw.wiphy));
ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
@@ -1871,6 +1941,8 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
sdata->u.mgd.use_4addr = params->use_4addr;
ndev->features |= local->hw.netdev_features;
+ ndev->hw_features |= ndev->features &
+ MAC80211_SUPPORTED_FEATURES_TX;
netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 0f889b919b06..8f403c1bb908 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -6,7 +6,7 @@
* Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
- * Copyright 2018-2019 Intel Corporation
+ * Copyright 2018-2020 Intel Corporation
*/
#include <linux/if_ether.h>
@@ -177,6 +177,13 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
}
}
+ /* TKIP countermeasures don't work in encap offload mode */
+ if (key->conf.cipher == WLAN_CIPHER_SUITE_TKIP &&
+ sdata->hw_80211_encap) {
+ sdata_dbg(sdata, "TKIP is not allowed in hw 80211 encap mode\n");
+ return -EINVAL;
+ }
+
ret = drv_set_key(key->local, SET_KEY, sdata,
sta ? &sta->sta : NULL, &key->conf);
@@ -210,12 +217,20 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
case WLAN_CIPHER_SUITE_TKIP:
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
+ case WLAN_CIPHER_SUITE_GCMP:
+ case WLAN_CIPHER_SUITE_GCMP_256:
+ /* We cannot do software crypto of data frames with
+ * encapsulation offload enabled. However for 802.11w to
+ * function properly we need cmac/gmac keys.
+ */
+ if (sdata->hw_80211_encap)
+ return -EINVAL;
+ /* Fall through */
+
case WLAN_CIPHER_SUITE_AES_CMAC:
case WLAN_CIPHER_SUITE_BIP_CMAC_256:
case WLAN_CIPHER_SUITE_BIP_GMAC_128:
case WLAN_CIPHER_SUITE_BIP_GMAC_256:
- case WLAN_CIPHER_SUITE_GCMP:
- case WLAN_CIPHER_SUITE_GCMP_256:
/* all of these we can do in software - if driver can */
if (ret == 1)
return 0;
@@ -262,22 +277,29 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
sta ? sta->sta.addr : bcast_addr, ret);
}
-int ieee80211_set_tx_key(struct ieee80211_key *key)
+static int _ieee80211_set_tx_key(struct ieee80211_key *key, bool force)
{
struct sta_info *sta = key->sta;
struct ieee80211_local *local = key->local;
assert_key_lock(local);
+ set_sta_flag(sta, WLAN_STA_USES_ENCRYPTION);
+
sta->ptk_idx = key->conf.keyidx;
- if (!ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT))
+ if (force || !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT))
clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
ieee80211_check_fast_xmit(sta);
return 0;
}
+int ieee80211_set_tx_key(struct ieee80211_key *key)
+{
+ return _ieee80211_set_tx_key(key, false);
+}
+
static void ieee80211_pairwise_rekey(struct ieee80211_key *old,
struct ieee80211_key *new)
{
@@ -392,6 +414,31 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
mutex_unlock(&sdata->local->key_mtx);
}
+static void
+__ieee80211_set_default_beacon_key(struct ieee80211_sub_if_data *sdata, int idx)
+{
+ struct ieee80211_key *key = NULL;
+
+ assert_key_lock(sdata->local);
+
+ if (idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS &&
+ idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS +
+ NUM_DEFAULT_BEACON_KEYS)
+ key = key_mtx_dereference(sdata->local, sdata->keys[idx]);
+
+ rcu_assign_pointer(sdata->default_beacon_key, key);
+
+ ieee80211_debugfs_key_update_default(sdata);
+}
+
+void ieee80211_set_default_beacon_key(struct ieee80211_sub_if_data *sdata,
+ int idx)
+{
+ mutex_lock(&sdata->local->key_mtx);
+ __ieee80211_set_default_beacon_key(sdata, idx);
+ mutex_unlock(&sdata->local->key_mtx);
+}
+
static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta,
bool pairwise,
@@ -400,7 +447,7 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
{
int idx;
int ret = 0;
- bool defunikey, defmultikey, defmgmtkey;
+ bool defunikey, defmultikey, defmgmtkey, defbeaconkey;
/* caller must provide at least one old/new */
if (WARN_ON(!new && !old))
@@ -441,11 +488,8 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
if (pairwise) {
rcu_assign_pointer(sta->ptk[idx], new);
if (new &&
- !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) {
- sta->ptk_idx = idx;
- clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
- ieee80211_check_fast_xmit(sta);
- }
+ !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX))
+ _ieee80211_set_tx_key(new, true);
} else {
rcu_assign_pointer(sta->gtk[idx], new);
}
@@ -465,6 +509,9 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
defmgmtkey = old &&
old == key_mtx_dereference(sdata->local,
sdata->default_mgmt_key);
+ defbeaconkey = old &&
+ old == key_mtx_dereference(sdata->local,
+ sdata->default_beacon_key);
if (defunikey && !new)
__ieee80211_set_default_key(sdata, -1, true, false);
@@ -472,6 +519,8 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
__ieee80211_set_default_key(sdata, -1, false, true);
if (defmgmtkey && !new)
__ieee80211_set_default_mgmt_key(sdata, -1);
+ if (defbeaconkey && !new)
+ __ieee80211_set_default_beacon_key(sdata, -1);
rcu_assign_pointer(sdata->keys[idx], new);
if (defunikey && new)
@@ -483,6 +532,9 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
if (defmgmtkey && new)
__ieee80211_set_default_mgmt_key(sdata,
new->conf.keyidx);
+ if (defbeaconkey && new)
+ __ieee80211_set_default_beacon_key(sdata,
+ new->conf.keyidx);
}
if (old)
@@ -500,7 +552,9 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len,
struct ieee80211_key *key;
int i, j, err;
- if (WARN_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS))
+ if (WARN_ON(idx < 0 ||
+ idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS +
+ NUM_DEFAULT_BEACON_KEYS))
return ERR_PTR(-EINVAL);
key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL);
@@ -963,6 +1017,7 @@ static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata,
sdata->crypto_tx_tailroom_pending_dec = 0;
ieee80211_debugfs_key_remove_mgmt_default(sdata);
+ ieee80211_debugfs_key_remove_beacon_default(sdata);
list_for_each_entry_safe(key, tmp, &sdata->key_list, list) {
ieee80211_key_replace(key->sdata, key->sta,
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index d6d6e89cf7dd..7ad72e9b4991 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -17,6 +17,7 @@
#define NUM_DEFAULT_KEYS 4
#define NUM_DEFAULT_MGMT_KEYS 2
+#define NUM_DEFAULT_BEACON_KEYS 2
#define INVALID_PTK_KEYIDX 2 /* Keyidx always pointing to a NULL key for PTK */
struct ieee80211_local;
@@ -153,6 +154,8 @@ void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx,
bool uni, bool multi);
void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata,
int idx);
+void ieee80211_set_default_beacon_key(struct ieee80211_sub_if_data *sdata,
+ int idx);
void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata,
bool force_synchronize);
void ieee80211_free_sta_keys(struct ieee80211_local *local,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 4c2b5ba3ac09..8345926193de 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -146,6 +146,8 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local)
continue;
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
continue;
+ if (sdata->vif.bss_conf.txpower == INT_MIN)
+ continue;
power = min(power, sdata->vif.bss_conf.txpower);
}
rcu_read_unlock();
@@ -416,7 +418,20 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = {
},
[NL80211_IFTYPE_STATION] = {
.tx = 0xffff,
+ /*
+ * To support Pre Association Security Negotiation (PASN) while
+ * already associated to one AP, allow user space to register to
+ * Rx authentication frames, so that the user space logic would
+ * be able to receive/handle authentication frames from a
+ * different AP as part of PASN.
+ * It is expected that user space would intelligently register
+ * for Rx authentication frames, i.e., only when PASN is used
+ * and configure a match filter only for PASN authentication
+ * algorithm, as otherwise the MLME functionality of mac80211
+ * would be broken.
+ */
.rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4) |
BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
},
[NL80211_IFTYPE_AP] = {
@@ -561,7 +576,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
WIPHY_FLAG_REPORTS_OBSS |
WIPHY_FLAG_OFFCHAN_TX;
- if (ops->remain_on_channel)
+ if (!use_chanctx || ops->remain_on_channel)
wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL;
wiphy->features |= NL80211_FEATURE_SK_TX_STATUS |
@@ -574,6 +589,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_STA);
wiphy_ext_feature_set(wiphy,
NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211);
+ wiphy_ext_feature_set(wiphy,
+ NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH);
if (!ops->hw_scan) {
wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
@@ -872,7 +889,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
enum nl80211_band band;
int channels, max_bitrates;
bool supp_ht, supp_vht, supp_he;
- netdev_features_t feature_whitelist;
struct cfg80211_chan_def dflt_chandef = {};
if (ieee80211_hw_check(hw, QUEUE_CONTROL) &&
@@ -931,10 +947,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
}
/* Only HW csum features are currently compatible with mac80211 */
- feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
- NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA |
- NETIF_F_GSO_SOFTWARE | NETIF_F_RXCSUM;
- if (WARN_ON(hw->netdev_features & ~feature_whitelist))
+ if (WARN_ON(hw->netdev_features & ~MAC80211_SUPPORTED_FEATURES))
return -EINVAL;
if (hw->max_report_rates == 0)
@@ -981,6 +994,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
if (!supp_he)
supp_he = !!ieee80211_get_he_sta_cap(sband);
+ /* HT, VHT, HE require QoS, thus >= 4 queues */
+ if (WARN_ON(local->hw.queues < IEEE80211_NUM_ACS &&
+ (supp_ht || supp_vht || supp_he)))
+ return -EINVAL;
+
if (!sband->ht_cap.ht_supported)
continue;
@@ -1065,6 +1083,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
NL80211_EXT_FEATURE_EXT_KEY_ID);
}
+ if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_ADHOC))
+ wiphy_ext_feature_set(local->hw.wiphy,
+ NL80211_EXT_FEATURE_DEL_IBSS_STA);
+
/*
* Calculate scan IE length -- we need this to alloc
* memory and to subtract from the driver limit. It
@@ -1184,10 +1206,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
if (!local->hw.weight_multiplier)
local->hw.weight_multiplier = 1;
- result = ieee80211_wep_init(local);
- if (result < 0)
- wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n",
- result);
+ ieee80211_wep_init(local);
local->hw.conf.flags = IEEE80211_CONF_IDLE;
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index d69983370381..38a0383dfbcf 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -1152,7 +1152,8 @@ int mesh_nexthop_resolve(struct ieee80211_sub_if_data *sdata,
}
}
- if (!(mpath->flags & MESH_PATH_RESOLVING))
+ if (!(mpath->flags & MESH_PATH_RESOLVING) &&
+ mesh_path_sel_is_hwmp(sdata))
mesh_queue_preq(mpath, PREQ_Q_F_START);
if (skb_queue_len(&mpath->frame_queue) >= MESH_FRAME_QUEUE_LEN)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 5fa13176036f..16d75da0996a 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -8,7 +8,7 @@
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2019 Intel Corporation
+ * Copyright (C) 2018 - 2020 Intel Corporation
*/
#include <linux/delay.h>
@@ -164,7 +164,9 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
chandef->center_freq1 = channel->center_freq;
if (!ht_oper || !sta_ht_cap.ht_supported) {
- ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
+ ret = IEEE80211_STA_DISABLE_HT |
+ IEEE80211_STA_DISABLE_VHT |
+ IEEE80211_STA_DISABLE_HE;
goto out;
}
@@ -185,7 +187,9 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
"Wrong control channel: center-freq: %d ht-cfreq: %d ht->primary_chan: %d band: %d - Disabling HT\n",
channel->center_freq, ht_cfreq,
ht_oper->primary_chan, channel->band);
- ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
+ ret = IEEE80211_STA_DISABLE_HT |
+ IEEE80211_STA_DISABLE_VHT |
+ IEEE80211_STA_DISABLE_HE;
goto out;
}
@@ -301,13 +305,18 @@ out:
IEEE80211_CHAN_DISABLED)) {
if (WARN_ON(chandef->width == NL80211_CHAN_WIDTH_20_NOHT)) {
ret = IEEE80211_STA_DISABLE_HT |
- IEEE80211_STA_DISABLE_VHT;
+ IEEE80211_STA_DISABLE_VHT |
+ IEEE80211_STA_DISABLE_HE;
break;
}
ret |= ieee80211_chandef_downgrade(chandef);
}
+ if (!he_oper || !cfg80211_chandef_usable(sdata->wdev.wiphy, chandef,
+ IEEE80211_CHAN_NO_HE))
+ ret |= IEEE80211_STA_DISABLE_HE;
+
if (chandef->width != vht_chandef.width && !tracking)
sdata_info(sdata,
"capabilities/regulatory prevented using AP HT/VHT configuration, downgraded\n");
@@ -393,6 +402,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata,
if (flags != (ifmgd->flags & (IEEE80211_STA_DISABLE_HT |
IEEE80211_STA_DISABLE_VHT |
+ IEEE80211_STA_DISABLE_HE |
IEEE80211_STA_DISABLE_40MHZ |
IEEE80211_STA_DISABLE_80P80MHZ |
IEEE80211_STA_DISABLE_160MHZ)) ||
@@ -616,10 +626,21 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata,
{
u8 *pos;
const struct ieee80211_sta_he_cap *he_cap = NULL;
+ struct ieee80211_chanctx_conf *chanctx_conf;
u8 he_cap_size;
+ bool reg_cap = false;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ if (!WARN_ON_ONCE(!chanctx_conf))
+ reg_cap = cfg80211_chandef_usable(sdata->wdev.wiphy,
+ &chanctx_conf->def,
+ IEEE80211_CHAN_NO_HE);
+
+ rcu_read_unlock();
he_cap = ieee80211_get_he_sta_cap(sband);
- if (!he_cap)
+ if (!he_cap || !reg_cap)
return;
/*
@@ -650,6 +671,13 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
struct ieee80211_chanctx_conf *chanctx_conf;
struct ieee80211_channel *chan;
u32 rates = 0;
+ struct element *ext_capa = NULL;
+
+ /* we know it's writable, cast away the const */
+ if (assoc_data->ie_len)
+ ext_capa = (void *)cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY,
+ assoc_data->ie,
+ assoc_data->ie_len);
sdata_assert_lock(sdata);
@@ -800,7 +828,15 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
*pos++ = ieee80211_chandef_max_power(&chanctx_conf->def);
}
- if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT) {
+ /*
+ * Per spec, we shouldn't include the list of channels if we advertise
+ * support for extended channel switching, but we've always done that;
+ * (for now?) apply this restriction only on the (new) 6 GHz band.
+ */
+ if (capab & WLAN_CAPABILITY_SPECTRUM_MGMT &&
+ (sband->band != NL80211_BAND_6GHZ ||
+ !ext_capa || ext_capa->datalen < 1 ||
+ !(ext_capa->data[0] & WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING))) {
/* TODO: get this in reg domain format */
pos = skb_put(skb, 2 * sband->n_channels + 2);
*pos++ = WLAN_EID_SUPPORTED_CHANNELS;
@@ -814,18 +850,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
/* Set MBSSID support for HE AP if needed */
if (ieee80211_hw_check(&local->hw, SUPPORTS_ONLY_HE_MULTI_BSSID) &&
- !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && assoc_data->ie_len) {
- struct element *elem;
-
- /* we know it's writable, cast away the const */
- elem = (void *)cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY,
- assoc_data->ie,
- assoc_data->ie_len);
-
- /* We can probably assume both always true */
- if (elem && elem->datalen >= 3)
- elem->data[2] |= WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT;
- }
+ !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && assoc_data->ie_len &&
+ ext_capa && ext_capa->datalen >= 3)
+ ext_capa->data[2] |= WLAN_EXT_CAPA3_MULTI_BSSID_SUPPORT;
/* if present, add any custom IEs that go before HT */
if (assoc_data->ie_len) {
@@ -1311,7 +1338,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
if (!res) {
ch_switch.timestamp = timestamp;
ch_switch.device_timestamp = device_timestamp;
- ch_switch.block_tx = beacon ? csa_ie.mode : 0;
+ ch_switch.block_tx = csa_ie.mode;
ch_switch.chandef = csa_ie.chandef;
ch_switch.count = csa_ie.count;
ch_switch.delay = csa_ie.max_switch_time;
@@ -1404,7 +1431,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
sdata->vif.csa_active = true;
sdata->csa_chandef = csa_ie.chandef;
- sdata->csa_block_tx = ch_switch.block_tx;
+ sdata->csa_block_tx = csa_ie.mode;
ifmgd->csa_ignored_same_chan = false;
if (sdata->csa_block_tx)
@@ -1438,7 +1465,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
* reset when the disconnection worker runs.
*/
sdata->vif.csa_active = true;
- sdata->csa_block_tx = ch_switch.block_tx;
+ sdata->csa_block_tx = csa_ie.mode;
ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work);
mutex_unlock(&local->chanctx_mtx);
@@ -2460,7 +2487,7 @@ void ieee80211_sta_tx_notify(struct ieee80211_sub_if_data *sdata,
if (!ieee80211_is_data(hdr->frame_control))
return;
- if (ieee80211_is_nullfunc(hdr->frame_control) &&
+ if (ieee80211_is_any_nullfunc(hdr->frame_control) &&
sdata->u.mgd.probe_send_count > 0) {
if (ack)
ieee80211_sta_reset_conn_monitor(sdata);
@@ -2959,7 +2986,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
(auth_transaction == 2 &&
ifmgd->auth_data->expected_transaction == 2)) {
if (!ieee80211_mark_sta_auth(sdata, bssid))
- goto out_err;
+ return; /* ignore frame -- wait for timeout */
} else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
auth_transaction == 2) {
sdata_info(sdata, "SAE peer confirmed\n");
@@ -2967,10 +2994,6 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
}
cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
- return;
- out_err:
- mutex_unlock(&sdata->local->sta_mtx);
- /* ignore frame -- wait for timeout */
}
#define case_WLAN(type) \
@@ -3368,9 +3391,16 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
}
if (bss_conf->he_support) {
- bss_conf->bss_color =
+ bss_conf->he_bss_color.color =
le32_get_bits(elems->he_operation->he_oper_params,
IEEE80211_HE_OPERATION_BSS_COLOR_MASK);
+ bss_conf->he_bss_color.partial =
+ le32_get_bits(elems->he_operation->he_oper_params,
+ IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR);
+ bss_conf->he_bss_color.disabled =
+ le32_get_bits(elems->he_operation->he_oper_params,
+ IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED);
+ changed |= BSS_CHANGED_HE_BSS_COLOR;
bss_conf->htc_trig_based_pkt_ext =
le32_get_bits(elems->he_operation->he_oper_params,
@@ -3649,13 +3679,28 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt = (void *)skb->data;
struct ieee80211_if_managed *ifmgd;
struct ieee80211_rx_status *rx_status = (void *) skb->cb;
+ struct ieee80211_channel *channel;
size_t baselen, len = skb->len;
ifmgd = &sdata->u.mgd;
sdata_assert_lock(sdata);
- if (!ether_addr_equal(mgmt->da, sdata->vif.addr))
+ /*
+ * According to Draft P802.11ax D6.0 clause 26.17.2.3.2:
+ * "If a 6 GHz AP receives a Probe Request frame and responds with
+ * a Probe Response frame [..], the Address 1 field of the Probe
+ * Response frame shall be set to the broadcast address [..]"
+ * So, on 6GHz band we should also accept broadcast responses.
+ */
+ channel = ieee80211_get_channel(sdata->local->hw.wiphy,
+ rx_status->freq);
+ if (!channel)
+ return;
+
+ if (!ether_addr_equal(mgmt->da, sdata->vif.addr) &&
+ (channel->band != NL80211_BAND_6GHZ ||
+ !is_broadcast_ether_addr(mgmt->da)))
return; /* ignore ProbeResp to foreign address */
baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
@@ -4753,10 +4798,22 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
IEEE80211_STA_DISABLE_80P80MHZ |
IEEE80211_STA_DISABLE_160MHZ);
+ /* disable HT/VHT/HE if we don't support them */
+ if (!sband->ht_cap.ht_supported) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+ }
+
+ if (!sband->vht_cap.vht_supported)
+ ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+
+ if (!ieee80211_get_he_sta_cap(sband))
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+
rcu_read_lock();
- if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) &&
- sband->ht_cap.ht_supported) {
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
const u8 *ht_oper_ie, *ht_cap_ie;
ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION);
@@ -4773,8 +4830,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
}
}
- if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) &&
- sband->vht_cap.vht_supported) {
+ if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) {
const u8 *vht_oper_ie, *vht_cap;
vht_oper_ie = ieee80211_bss_get_ie(cbss,
@@ -4784,9 +4840,10 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
if (vht_oper && !ht_oper) {
vht_oper = NULL;
sdata_info(sdata,
- "AP advertised VHT without HT, disabling both\n");
+ "AP advertised VHT without HT, disabling HT/VHT/HE\n");
ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
}
vht_cap = ieee80211_bss_get_ie(cbss, WLAN_EID_VHT_CAPABILITY);
@@ -4796,9 +4853,6 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
}
}
- if (!ieee80211_get_he_sta_cap(sband))
- ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
-
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) {
const struct cfg80211_bss_ies *ies;
const u8 *he_oper_ie;
@@ -5297,27 +5351,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
}
}
- /* Also disable HT if we don't support it or the AP doesn't use WMM */
sband = local->hw.wiphy->bands[req->bss->channel->band];
- if (!sband->ht_cap.ht_supported ||
- local->hw.queues < IEEE80211_NUM_ACS || !bss->wmm_used ||
- ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
- ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
- if (!bss->wmm_used &&
- !(ifmgd->flags & IEEE80211_STA_DISABLE_WMM))
- netdev_info(sdata->dev,
- "disabling HT as WMM/QoS is not supported by the AP\n");
- }
- /* disable VHT if we don't support it or the AP doesn't use WMM */
- if (!sband->vht_cap.vht_supported ||
- local->hw.queues < IEEE80211_NUM_ACS || !bss->wmm_used ||
- ifmgd->flags & IEEE80211_STA_DISABLE_WMM) {
+ /* also disable HT/VHT/HE if the AP doesn't use WMM */
+ if (!bss->wmm_used) {
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
- if (!bss->wmm_used &&
- !(ifmgd->flags & IEEE80211_STA_DISABLE_WMM))
- netdev_info(sdata->dev,
- "disabling VHT as WMM/QoS is not supported by the AP\n");
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
+ netdev_info(sdata->dev,
+ "disabling HT/VHT/HE as WMM/QoS is not supported by the AP\n");
}
memcpy(&ifmgd->ht_capa, &req->ht_capa, sizeof(ifmgd->ht_capa));
@@ -5416,6 +5458,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt;
sdata->control_port_over_nl80211 =
req->crypto.control_port_over_nl80211;
+ sdata->control_port_no_preauth = req->crypto.control_port_no_preauth;
sdata->encrypt_headroom = ieee80211_cs_headroom(local, &req->crypto,
sdata->vif.type);
@@ -5449,6 +5492,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
if (req->flags & ASSOC_REQ_DISABLE_HT) {
ifmgd->flags |= IEEE80211_STA_DISABLE_HT;
ifmgd->flags |= IEEE80211_STA_DISABLE_VHT;
+ ifmgd->flags |= IEEE80211_STA_DISABLE_HE;
}
if (req->flags & ASSOC_REQ_DISABLE_VHT)
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0e05ff037672..91a13aee4378 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -6,7 +6,7 @@
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#include <linux/jiffies.h>
@@ -983,7 +983,8 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb)
if (skb->len < 24 + sizeof(*mmie) || !is_multicast_ether_addr(hdr->da))
return -1;
- if (!ieee80211_is_robust_mgmt_frame(skb))
+ if (!ieee80211_is_robust_mgmt_frame(skb) &&
+ !ieee80211_is_beacon(hdr->frame_control))
return -1; /* not a robust management frame */
mmie = (struct ieee80211_mmie *)
@@ -1450,8 +1451,7 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
return RX_CONTINUE;
if (ieee80211_is_ctl(hdr->frame_control) ||
- ieee80211_is_nullfunc(hdr->frame_control) ||
- ieee80211_is_qos_nullfunc(hdr->frame_control) ||
+ ieee80211_is_any_nullfunc(hdr->frame_control) ||
is_multicast_ether_addr(hdr->addr1))
return RX_CONTINUE;
@@ -1838,8 +1838,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
* Drop (qos-)data::nullfunc frames silently, since they
* are used only to control station power saving mode.
*/
- if (ieee80211_is_nullfunc(hdr->frame_control) ||
- ieee80211_is_qos_nullfunc(hdr->frame_control)) {
+ if (ieee80211_is_any_nullfunc(hdr->frame_control)) {
I802_DEBUG_INC(rx->local->rx_handlers_drop_nullfunc);
/*
@@ -1870,6 +1869,41 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
return RX_CONTINUE;
} /* ieee80211_rx_h_sta_process */
+static struct ieee80211_key *
+ieee80211_rx_get_bigtk(struct ieee80211_rx_data *rx, int idx)
+{
+ struct ieee80211_key *key = NULL;
+ struct ieee80211_sub_if_data *sdata = rx->sdata;
+ int idx2;
+
+ /* Make sure key gets set if either BIGTK key index is set so that
+ * ieee80211_drop_unencrypted_mgmt() can properly drop both unprotected
+ * Beacon frames and Beacon frames that claim to use another BIGTK key
+ * index (i.e., a key that we do not have).
+ */
+
+ if (idx < 0) {
+ idx = NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS;
+ idx2 = idx + 1;
+ } else {
+ if (idx == NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
+ idx2 = idx + 1;
+ else
+ idx2 = idx - 1;
+ }
+
+ if (rx->sta)
+ key = rcu_dereference(rx->sta->gtk[idx]);
+ if (!key)
+ key = rcu_dereference(sdata->keys[idx]);
+ if (!key && rx->sta)
+ key = rcu_dereference(rx->sta->gtk[idx2]);
+ if (!key)
+ key = rcu_dereference(sdata->keys[idx2]);
+
+ return key;
+}
+
static ieee80211_rx_result debug_noinline
ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
{
@@ -1887,17 +1921,18 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
/*
* Key selection 101
*
- * There are four types of keys:
+ * There are five types of keys:
* - GTK (group keys)
* - IGTK (group keys for management frames)
+ * - BIGTK (group keys for Beacon frames)
* - PTK (pairwise keys)
* - STK (station-to-station pairwise keys)
*
* When selecting a key, we have to distinguish between multicast
* (including broadcast) and unicast frames, the latter can only
- * use PTKs and STKs while the former always use GTKs and IGTKs.
- * Unless, of course, actual WEP keys ("pre-RSNA") are used, then
- * unicast frames can also use key indices like GTKs. Hence, if we
+ * use PTKs and STKs while the former always use GTKs, IGTKs, and
+ * BIGTKs. Unless, of course, actual WEP keys ("pre-RSNA") are used,
+ * then unicast frames can also use key indices like GTKs. Hence, if we
* don't have a PTK/STK we check the key index for a WEP key.
*
* Note that in a regular BSS, multicast frames are sent by the
@@ -1941,6 +1976,20 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
/* Skip decryption if the frame is not protected. */
if (!ieee80211_has_protected(fc))
return RX_CONTINUE;
+ } else if (mmie_keyidx >= 0 && ieee80211_is_beacon(fc)) {
+ /* Broadcast/multicast robust management frame / BIP */
+ if ((status->flag & RX_FLAG_DECRYPTED) &&
+ (status->flag & RX_FLAG_IV_STRIPPED))
+ return RX_CONTINUE;
+
+ if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS ||
+ mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS +
+ NUM_DEFAULT_BEACON_KEYS)
+ return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+
+ rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx);
+ if (!rx->key)
+ return RX_CONTINUE; /* Beacon protection not in use */
} else if (mmie_keyidx >= 0) {
/* Broadcast/multicast robust management frame / BIP */
if ((status->flag & RX_FLAG_DECRYPTED) &&
@@ -1970,11 +2019,12 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
struct ieee80211_sub_if_data *sdata = rx->sdata;
int i;
- if (ieee80211_is_mgmt(fc) &&
- is_multicast_ether_addr(hdr->addr1) &&
- (key = rcu_dereference(rx->sdata->default_mgmt_key)))
- rx->key = key;
- else {
+ if (ieee80211_is_beacon(fc)) {
+ key = ieee80211_rx_get_bigtk(rx, -1);
+ } else if (ieee80211_is_mgmt(fc) &&
+ is_multicast_ether_addr(hdr->addr1)) {
+ key = rcu_dereference(rx->sdata->default_mgmt_key);
+ } else {
if (rx->sta) {
for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
key = rcu_dereference(rx->sta->gtk[i]);
@@ -1989,9 +2039,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
break;
}
}
- if (key)
- rx->key = key;
}
+ if (key)
+ rx->key = key;
return RX_CONTINUE;
} else {
/*
@@ -2319,7 +2369,7 @@ static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc)
/* Drop unencrypted frames if key is set. */
if (unlikely(!ieee80211_has_protected(fc) &&
- !ieee80211_is_nullfunc(fc) &&
+ !ieee80211_is_any_nullfunc(fc) &&
ieee80211_is_data(fc) && rx->key))
return -EACCES;
@@ -2360,6 +2410,9 @@ static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx)
rx->skb->len);
return -EACCES;
}
+ if (unlikely(ieee80211_is_beacon(fc) && rx->key &&
+ ieee80211_get_mmie_keyidx(rx->skb) < 0))
+ return -EACCES;
/*
* When using MFP, Action frames are not allowed prior to
* having configured keys.
@@ -2444,7 +2497,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
struct net_device *dev = sdata->dev;
if (unlikely((skb->protocol == sdata->control_port_protocol ||
- skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) &&
+ (skb->protocol == cpu_to_be16(ETH_P_PREAUTH) &&
+ !sdata->control_port_no_preauth)) &&
sdata->control_port_over_nl80211)) {
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
bool noencrypt = !(status->flag & RX_FLAG_DECRYPTED);
@@ -3084,6 +3138,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
enum ieee80211_smps_mode smps_mode;
struct sta_opmode_info sta_opmode = {};
+ if (sdata->vif.type != NL80211_IFTYPE_AP &&
+ sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
+ goto handled;
+
/* convert to HT capability */
switch (mgmt->u.action.u.ht_smps.smps_control) {
case WLAN_HT_SMPS_CONTROL_DISABLED:
@@ -4114,7 +4172,7 @@ void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
lockdep_assert_held(&local->sta_mtx);
- list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ list_for_each_entry(sta, &local->sta_list, list) {
if (sdata != sta->sdata &&
(!sta->sdata->bss || sta->sdata->bss != sdata->bss))
continue;
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 4d31d9688dc2..fdac8192a519 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -201,8 +201,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
mgmt->bssid, cbss->bssid);
/* In case the signal is invalid update the status */
- signal_valid = abs(channel->center_freq - cbss->channel->center_freq)
- <= local->hw.wiphy->max_adj_channel_rssi_comp;
+ signal_valid = channel == cbss->channel;
if (!signal_valid)
rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 0f5f40678885..f8d5c2515829 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -4,7 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#include <linux/module.h>
@@ -1049,6 +1049,11 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
might_sleep();
lockdep_assert_held(&local->sta_mtx);
+ while (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
+ ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
+ WARN_ON_ONCE(ret);
+ }
+
/* now keys can no longer be reached */
ieee80211_free_sta_keys(local, sta);
@@ -1351,20 +1356,6 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
atomic_dec(&ps->num_sta_ps);
- /* This station just woke up and isn't aware of our SMPS state */
- if (!ieee80211_vif_is_mesh(&sdata->vif) &&
- !ieee80211_smps_is_restrictive(sta->known_smps_mode,
- sdata->smps_mode) &&
- sta->known_smps_mode != sdata->bss->req_smps &&
- sta_info_tx_streams(sta) != 1) {
- ht_dbg(sdata,
- "%pM just woke up and MIMO capable - update SMPS\n",
- sta->sta.addr);
- ieee80211_send_smps_action(sdata, sdata->bss->req_smps,
- sta->sta.addr,
- sdata->vif.bss_conf.bssid);
- }
-
local->total_ps_buffered -= buffered;
sta_info_recalc_tim(sta);
@@ -2164,19 +2155,41 @@ static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
return 0;
}
+static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
+ int tid)
+{
+ unsigned int start;
+ u64 value;
+
+ do {
+ start = u64_stats_fetch_begin(&rxstats->syncp);
+ value = rxstats->msdu[tid];
+ } while (u64_stats_fetch_retry(&rxstats->syncp, start));
+
+ return value;
+}
+
static void sta_set_tidstats(struct sta_info *sta,
struct cfg80211_tid_stats *tidstats,
int tid)
{
struct ieee80211_local *local = sta->local;
+ int cpu;
if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) {
- unsigned int start;
+ if (!ieee80211_hw_check(&local->hw, USES_RSS))
+ tidstats->rx_msdu +=
+ sta_get_tidstats_msdu(&sta->rx_stats, tid);
+
+ if (sta->pcpu_rx_stats) {
+ for_each_possible_cpu(cpu) {
+ struct ieee80211_sta_rx_stats *cpurxs;
- do {
- start = u64_stats_fetch_begin(&sta->rx_stats.syncp);
- tidstats->rx_msdu = sta->rx_stats.msdu[tid];
- } while (u64_stats_fetch_retry(&sta->rx_stats.syncp, start));
+ cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu);
+ tidstats->rx_msdu +=
+ sta_get_tidstats_msdu(cpurxs, tid);
+ }
+ }
tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU);
}
@@ -2280,7 +2293,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) |
BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) {
- sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats);
+ if (!ieee80211_hw_check(&local->hw, USES_RSS))
+ sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats);
if (sta->pcpu_rx_stats) {
for_each_possible_cpu(cpu) {
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index c00e28585f9d..36f1abaab9ff 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -98,6 +98,7 @@ enum ieee80211_sta_info_flags {
WLAN_STA_MPSP_OWNER,
WLAN_STA_MPSP_RECIPIENT,
WLAN_STA_PS_DELIVER,
+ WLAN_STA_USES_ENCRYPTION,
NUM_WLAN_STA_FLAGS,
};
@@ -533,7 +534,9 @@ struct sta_info {
u8 addr[ETH_ALEN];
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
- struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS];
+ struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS +
+ NUM_DEFAULT_MGMT_KEYS +
+ NUM_DEFAULT_BEACON_KEYS];
struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS];
u8 ptk_idx;
struct rate_control_ref *rate_ctrl;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index b720feaf9a74..22512805eafb 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -643,8 +643,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
rcu_read_lock();
sdata = ieee80211_sdata_from_skb(local, skb);
if (sdata) {
- if (ieee80211_is_nullfunc(hdr->frame_control) ||
- ieee80211_is_qos_nullfunc(hdr->frame_control))
+ if (ieee80211_is_any_nullfunc(hdr->frame_control))
cfg80211_probe_status(sdata->dev, hdr->addr1,
cookie, acked,
info->status.ack_signal,
@@ -888,6 +887,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
int rates_idx;
bool send_to_cooked;
bool acked;
+ bool noack_success;
struct ieee80211_bar *bar;
int shift = 0;
int tid = IEEE80211_NUM_TIDS;
@@ -906,6 +906,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
clear_sta_flag(sta, WLAN_STA_SP);
acked = !!(info->flags & IEEE80211_TX_STAT_ACK);
+ noack_success = !!(info->flags &
+ IEEE80211_TX_STAT_NOACK_TRANSMITTED);
/* mesh Peer Service Period support */
if (ieee80211_vif_is_mesh(&sta->sdata->vif) &&
@@ -970,12 +972,12 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
ieee80211_handle_filtered_frame(local, sta, skb);
return;
} else {
- if (!acked)
+ if (!acked && !noack_success)
sta->status_stats.retry_failed++;
sta->status_stats.retry_count += retry_count;
if (ieee80211_is_data_present(fc)) {
- if (!acked)
+ if (!acked && !noack_success)
sta->status_stats.msdu_failed[tid]++;
sta->status_stats.msdu_retries[tid] +=
@@ -1013,7 +1015,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
}
if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
- if (info->flags & IEEE80211_TX_STAT_ACK) {
+ if (acked) {
if (sta->status_stats.lost_packets)
sta->status_stats.lost_packets = 0;
@@ -1021,6 +1023,8 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
sta->status_stats.last_tdls_pkt_time =
jiffies;
+ } else if (noack_success) {
+ /* nothing to do here, do not account as lost */
} else {
ieee80211_lost_packet(sta, info);
}
@@ -1056,7 +1060,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw,
I802_DEBUG_INC(local->dot11FailedCount);
}
- if ((ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)) &&
+ if (ieee80211_is_any_nullfunc(fc) &&
ieee80211_has_pm(fc) &&
ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) &&
!(info->flags & IEEE80211_TX_CTL_INJECTED) &&
@@ -1141,7 +1145,7 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
sta = container_of(pubsta, struct sta_info, sta);
- if (!acked)
+ if (!acked && !noack_success)
sta->status_stats.retry_failed++;
sta->status_stats.retry_count += retry_count;
@@ -1156,6 +1160,8 @@ void ieee80211_tx_status_ext(struct ieee80211_hw *hw,
sta->status_stats.last_tdls_pkt_time = jiffies;
} else if (test_sta_flag(sta, WLAN_STA_PS_STA)) {
return;
+ } else if (noack_success) {
+ /* nothing to do here, do not account as lost */
} else {
ieee80211_lost_packet(sta, info);
}
@@ -1198,6 +1204,77 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL(ieee80211_tx_rate_update);
+void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct sk_buff *skb)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct sta_info *sta;
+ int retry_count;
+ int rates_idx;
+ bool acked;
+
+ sdata = vif_to_sdata(vif);
+
+ acked = info->flags & IEEE80211_TX_STAT_ACK;
+ rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count);
+
+ rcu_read_lock();
+
+ if (ieee80211_lookup_ra_sta(sdata, skb, &sta))
+ goto counters_update;
+
+ if (IS_ERR(sta))
+ goto counters_update;
+
+ if (!acked)
+ sta->status_stats.retry_failed++;
+
+ if (rates_idx != -1)
+ sta->tx_stats.last_rate = info->status.rates[rates_idx];
+
+ sta->status_stats.retry_count += retry_count;
+
+ if (ieee80211_hw_check(hw, REPORTS_TX_ACK_STATUS)) {
+ if (acked && vif->type == NL80211_IFTYPE_STATION)
+ ieee80211_sta_reset_conn_monitor(sdata);
+
+ sta->status_stats.last_ack = jiffies;
+ if (info->flags & IEEE80211_TX_STAT_ACK) {
+ if (sta->status_stats.lost_packets)
+ sta->status_stats.lost_packets = 0;
+
+ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH))
+ sta->status_stats.last_tdls_pkt_time = jiffies;
+ } else {
+ ieee80211_lost_packet(sta, info);
+ }
+ }
+
+counters_update:
+ rcu_read_unlock();
+ ieee80211_led_tx(local);
+
+ if (!(info->flags & IEEE80211_TX_STAT_ACK) &&
+ !(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED))
+ goto skip_stats_update;
+
+ I802_DEBUG_INC(local->dot11TransmittedFrameCount);
+ if (is_multicast_ether_addr(skb->data))
+ I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount);
+ if (retry_count > 0)
+ I802_DEBUG_INC(local->dot11RetryCount);
+ if (retry_count > 1)
+ I802_DEBUG_INC(local->dot11MultipleRetryCount);
+
+skip_stats_update:
+ ieee80211_report_used_skb(local, skb, false);
+ dev_kfree_skb(skb);
+}
+EXPORT_SYMBOL(ieee80211_tx_status_8023);
+
void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets)
{
struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 4bd1faf4f779..82846aca86d9 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5,7 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*
* Transmit and frame generation functions.
*/
@@ -297,7 +297,7 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
if (unlikely(test_bit(SCAN_SW_SCANNING, &tx->local->scanning)) &&
test_bit(SDATA_STATE_OFFCHANNEL, &tx->sdata->state) &&
!ieee80211_is_probe_req(hdr->frame_control) &&
- !ieee80211_is_nullfunc(hdr->frame_control))
+ !ieee80211_is_any_nullfunc(hdr->frame_control))
/*
* When software scanning only nullfunc frames (to notify
* the sleep state to the AP) and probe requests (for the
@@ -590,10 +590,13 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
- if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
+ if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) {
tx->key = NULL;
- else if (tx->sta &&
- (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
+ return TX_CONTINUE;
+ }
+
+ if (tx->sta &&
+ (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
tx->key = key;
else if (ieee80211_is_group_privacy_action(tx->skb) &&
(key = rcu_dereference(tx->sdata->default_multicast_key)))
@@ -654,6 +657,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
if (!skip_hw && tx->key &&
tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
info->control.hw_key = &tx->key->conf;
+ } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta &&
+ test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) {
+ return TX_DROP;
}
return TX_CONTINUE;
@@ -1250,7 +1256,8 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
(info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
return NULL;
- if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
+ if (!(info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) &&
+ unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
if ((!ieee80211_is_mgmt(hdr->frame_control) ||
ieee80211_is_bufferable_mmpdu(hdr->frame_control) ||
vif->type == NL80211_IFTYPE_STATION) &&
@@ -2360,9 +2367,9 @@ static inline bool ieee80211_is_tdls_setup(struct sk_buff *skb)
skb->data[14] == WLAN_TDLS_SNAP_RFTYPE;
}
-static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
- struct sk_buff *skb,
- struct sta_info **sta_out)
+int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb,
+ struct sta_info **sta_out)
{
struct sta_info *sta;
@@ -2442,7 +2449,7 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local,
spin_lock_irqsave(&local->ack_status_lock, flags);
id = idr_alloc(&local->ack_status_frames, ack_skb,
- 1, 0x40, GFP_ATOMIC);
+ 1, 0x2000, GFP_ATOMIC);
spin_unlock_irqrestore(&local->ack_status_lock, flags);
if (id >= 0) {
@@ -3598,8 +3605,26 @@ begin:
tx.skb = skb;
tx.sdata = vif_to_sdata(info->control.vif);
- if (txq->sta)
+ if (txq->sta) {
tx.sta = container_of(txq->sta, struct sta_info, sta);
+ /*
+ * Drop unicast frames to unauthorised stations unless they are
+ * EAPOL frames from the local station.
+ */
+ if (unlikely(ieee80211_is_data(hdr->frame_control) &&
+ !ieee80211_vif_is_mesh(&tx.sdata->vif) &&
+ tx.sdata->vif.type != NL80211_IFTYPE_OCB &&
+ !is_multicast_ether_addr(hdr->addr1) &&
+ !test_sta_flag(tx.sta, WLAN_STA_AUTHORIZED) &&
+ (!(info->control.flags &
+ IEEE80211_TX_CTRL_PORT_CTRL_PROTO) ||
+ !ether_addr_equal(tx.sdata->vif.addr,
+ hdr->addr2)))) {
+ I802_DEBUG_INC(local->tx_handlers_drop_unauth_port);
+ ieee80211_free_txskb(&local->hw, skb);
+ goto begin;
+ }
+ }
/*
* The key can be removed while the packet was queued, so need to call
@@ -3616,6 +3641,9 @@ begin:
else
info->flags &= ~IEEE80211_TX_CTL_AMPDU;
+ if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP)
+ goto encap_out;
+
if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) {
struct sta_info *sta = container_of(txq->sta, struct sta_info,
sta);
@@ -3675,9 +3703,11 @@ begin:
break;
}
+encap_out:
IEEE80211_SKB_CB(skb)->control.vif = vif;
- if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
+ if (vif &&
+ wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) {
u32 airtime;
airtime = ieee80211_calc_expected_tx_airtime(hw, vif, txq->sta,
@@ -4103,6 +4133,153 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
}
+/*
+ * Hand an 802.3 frame to the driver, or park it if that is not possible yet.
+ *
+ * The frame is first offered to ieee80211_queue_skb(); if that takes it we
+ * are done. Otherwise, when the hardware queue is stopped (or, for a fresh
+ * frame, older frames are still pending on it) the skb is put on the pending
+ * list: at the head when @txpending is set, at the tail otherwise.
+ *
+ * Returns false if the skb was placed on the pending list, true otherwise.
+ */
+static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata,
+			      struct sk_buff *skb, int led_len,
+			      struct sta_info *sta,
+			      bool txpending)
+{
+	struct ieee80211_local *local = sdata->local;
+	int hwq = IEEE80211_SKB_CB(skb)->hw_queue;
+	struct ieee80211_tx_control control = {};
+	struct sk_buff_head *pending;
+	unsigned long irqflags;
+	bool deferred;
+
+	if (ieee80211_queue_skb(local, sdata, sta, skb))
+		return true;
+
+	pending = &local->pending[hwq];
+
+	spin_lock_irqsave(&local->queue_stop_reason_lock, irqflags);
+
+	deferred = local->queue_stop_reasons[hwq] ||
+		   (!txpending && !skb_queue_empty(pending));
+	if (deferred) {
+		if (txpending)
+			skb_queue_head(pending, skb);
+		else
+			skb_queue_tail(pending, skb);
+	}
+
+	spin_unlock_irqrestore(&local->queue_stop_reason_lock, irqflags);
+
+	if (deferred)
+		return false;
+
+	/* only pass the station along if it is marked as uploaded */
+	control.sta = (sta && sta->uploaded) ? &sta->sta : NULL;
+
+	drv_tx(local, &control, skb);
+
+	return true;
+}
+
+/*
+ * Transmit an 802.3 framed skb on an interface using hardware 802.11
+ * encapsulation.
+ *
+ * @sdata: interface the frame was submitted on
+ * @dev:   net_device used for the TX statistics
+ * @sta:   result of the RA station lookup; may be NULL or an ERR_PTR
+ * @skb:   frame to send, starting with its Ethernet header
+ *
+ * The frame is dropped (freed) when the receiver address is invalid, when a
+ * unicast frame targets a station that is not authorized (unless it is a
+ * control port frame sent from our own address), when a multicast frame is
+ * sent on an AP with no multicast stations, or while off-channel during a
+ * software scan. Otherwise the TX info is filled in and the frame is passed
+ * to ieee80211_tx_8023().
+ */
+static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata,
+				struct net_device *dev, struct sta_info *sta,
+				struct sk_buff *skb)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+	struct ethhdr *ehdr = (struct ethhdr *)skb->data;
+	struct ieee80211_local *local = sdata->local;
+	bool authorized = false;
+	bool multicast;
+	unsigned char *ra = ehdr->h_dest;
+
+	/* lookup errors and stations not marked uploaded count as "no station" */
+	if (IS_ERR(sta) || (sta && !sta->uploaded))
+		sta = NULL;
+
+	/* as a (non-TDLS) client everything is addressed to the BSSID */
+	if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+	    (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER)))
+		ra = sdata->u.mgd.bssid;
+
+	if (!is_valid_ether_addr(ra))
+		goto out_free;
+
+	multicast = is_multicast_ether_addr(ra);
+
+	if (sta)
+		authorized = test_sta_flag(sta, WLAN_STA_AUTHORIZED);
+
+	if (!multicast && !authorized &&
+	    (ehdr->h_proto != sdata->control_port_protocol ||
+	     !ether_addr_equal(sdata->vif.addr, ehdr->h_source)))
+		goto out_free;
+
+	if (multicast && sdata->vif.type == NL80211_IFTYPE_AP &&
+	    !atomic_read(&sdata->u.ap.num_mcast_sta))
+		goto out_free;
+
+	if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) &&
+	    test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))
+		goto out_free;
+
+	/*
+	 * Clear the TX info before requesting TX status: the ack-skb helper
+	 * sets a flag in info->flags and returns the id under which the
+	 * status skb was stored. Clearing afterwards (and dropping the
+	 * returned id) would lose both, so the stored skb could never be
+	 * matched to a TX status report.
+	 */
+	memset(info, 0, sizeof(*info));
+
+	if (unlikely(!multicast && skb->sk &&
+		     skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS))
+		info->ack_frame_id = ieee80211_store_ack_skb(local, skb,
+							     &info->flags);
+
+	if (unlikely(sdata->control_port_protocol == ehdr->h_proto)) {
+		if (sdata->control_port_no_encrypt)
+			info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+		info->control.flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO;
+	}
+
+	if (multicast)
+		info->flags |= IEEE80211_TX_CTL_NO_ACK;
+
+	info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)];
+
+	ieee80211_tx_stats(dev, skb->len);
+
+	if (sta) {
+		sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len;
+		sta->tx_stats.packets[skb_get_queue_mapping(skb)]++;
+	}
+
+	/* AP_VLAN frames are sent through the AP interface they belong to */
+	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+		sdata = container_of(sdata->bss,
+				     struct ieee80211_sub_if_data, u.ap);
+
+	info->control.flags |= IEEE80211_TX_CTRL_HW_80211_ENCAP;
+	info->control.vif = &sdata->vif;
+
+	ieee80211_tx_8023(sdata, skb, skb->len, sta, false);
+
+	return;
+
+out_free:
+	kfree_skb(skb);
+}
+
+/*
+ * Transmit entry point for interfaces that use hardware 802.11 encapsulation.
+ *
+ * Frames are freed without being sent when the interface is not in
+ * hw-encap mode (which also triggers a warning), when the skb is shorter
+ * than an Ethernet header, or when the RA station lookup fails. Always
+ * returns NETDEV_TX_OK; the skb is consumed in every case.
+ */
+netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
+					    struct net_device *dev)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct sta_info *sta;
+
+	if (WARN_ON(!sdata->hw_80211_encap))
+		goto drop;
+
+	if (unlikely(skb->len < ETH_HLEN))
+		goto drop;
+
+	rcu_read_lock();
+
+	if (!ieee80211_lookup_ra_sta(sdata, skb, &sta))
+		ieee80211_8023_xmit(sdata, dev, sta, skb);
+	else
+		kfree_skb(skb);
+
+	rcu_read_unlock();
+
+	return NETDEV_TX_OK;
+
+drop:
+	kfree_skb(skb);
+	return NETDEV_TX_OK;
+}
+
struct sk_buff *
ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, u32 info_flags)
@@ -4181,6 +4358,16 @@ static bool ieee80211_tx_pending_skb(struct ieee80211_local *local,
}
info->band = chanctx_conf->def.chan->band;
result = ieee80211_tx(sdata, NULL, skb, true, 0);
+ } else if (info->control.flags & IEEE80211_TX_CTRL_HW_80211_ENCAP) {
+ if (ieee80211_lookup_ra_sta(sdata, skb, &sta)) {
+ dev_kfree_skb(skb);
+ return true;
+ }
+
+ if (IS_ERR(sta) || (sta && !sta->uploaded))
+ sta = NULL;
+
+ result = ieee80211_tx_8023(sdata, skb, skb->len, sta, true);
} else {
struct sk_buff_head skbs;
@@ -4501,6 +4688,32 @@ bool ieee80211_csa_is_complete(struct ieee80211_vif *vif)
}
EXPORT_SYMBOL(ieee80211_csa_is_complete);
+/*
+ * Run the TX encryption handler on a beacon skb using the interface's
+ * default beacon key.
+ *
+ * Returns 0 when no beacon key is configured (the skb is left untouched) or
+ * when the encrypt handler returned TX_CONTINUE, -EINVAL (with a one-time
+ * warning) otherwise.
+ */
+static int ieee80211_beacon_protect(struct sk_buff *skb,
+				    struct ieee80211_local *local,
+				    struct ieee80211_sub_if_data *sdata)
+{
+	struct ieee80211_tx_data tx;
+	struct sk_buff *dequeued;
+	ieee80211_tx_result result;
+
+	memset(&tx, 0, sizeof(tx));
+
+	tx.key = rcu_dereference(sdata->default_beacon_key);
+	if (!tx.key)
+		return 0;
+
+	tx.sdata = sdata;
+	tx.local = local;
+
+	/* the encrypt handler works on a queue, so wrap the single skb */
+	__skb_queue_head_init(&tx.skbs);
+	__skb_queue_tail(&tx.skbs, skb);
+
+	result = ieee80211_tx_h_encrypt(&tx);
+
+	dequeued = __skb_dequeue(&tx.skbs);
+	/* we may crash after this, but it'd be a bug in crypto */
+	WARN_ON(dequeued != skb);
+
+	return WARN_ON_ONCE(result != TX_CONTINUE) ? -EINVAL : 0;
+}
+
static struct sk_buff *
__ieee80211_beacon_get(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
@@ -4568,6 +4781,9 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
if (beacon->tail)
skb_put_data(skb, beacon->tail,
beacon->tail_len);
+
+ if (ieee80211_beacon_protect(skb, local, sdata) < 0)
+ goto out;
} else
goto out;
} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
@@ -5126,6 +5342,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ethhdr *ehdr;
+ u32 ctrl_flags = 0;
u32 flags;
/* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE
@@ -5135,6 +5352,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
proto != cpu_to_be16(ETH_P_PREAUTH))
return -EINVAL;
+ if (proto == sdata->control_port_protocol)
+ ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO;
+
if (unencrypted)
flags = IEEE80211_TX_INTFL_DONT_ENCRYPT;
else
@@ -5160,7 +5380,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
skb_reset_mac_header(skb);
local_bh_disable();
- __ieee80211_subif_start_xmit(skb, skb->dev, flags, 0);
+ __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags);
local_bh_enable();
return 0;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 32a7a53833c0..20436c86b9bf 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -6,7 +6,7 @@
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*
* utilities for mac80211
*/
@@ -39,7 +39,6 @@ const void *const mac80211_wiphy_privid = &mac80211_wiphy_privid;
struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy)
{
struct ieee80211_local *local;
- BUG_ON(!wiphy);
local = wiphy_priv(wiphy);
return &local->hw;
@@ -891,6 +890,55 @@ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL(ieee80211_queue_delayed_work);
+/*
+ * Parse one extension element (an element whose first data byte is the
+ * extension ID) and record a pointer to its payload in @elems.
+ *
+ * @crc:   running CRC to update for selected elements, or NULL when no CRC
+ *         is being calculated
+ * @elem:  the element to parse
+ * @elems: parse result to fill in
+ *
+ * Elements that are empty, unknown, or fail their length check are ignored.
+ */
+static void ieee80211_parse_extension_element(u32 *crc,
+					      const struct element *elem,
+					      struct ieee802_11_elems *elems)
+{
+	const void *data;
+	u8 len;
+
+	/*
+	 * An empty element has no extension ID byte: reading data[0] would
+	 * go past the element and "datalen - 1" would wrap around to 255.
+	 */
+	if (!elem->datalen)
+		return;
+
+	/* payload follows the one-byte extension ID */
+	data = elem->data + 1;
+	len = elem->datalen - 1;
+
+	switch (elem->data[0]) {
+	case WLAN_EID_EXT_HE_MU_EDCA:
+		if (len == sizeof(*elems->mu_edca_param_set)) {
+			elems->mu_edca_param_set = data;
+			/* CRC covers the two header bytes plus all data */
+			if (crc)
+				*crc = crc32_be(*crc, (void *)elem,
+						elem->datalen + 2);
+		}
+		break;
+	case WLAN_EID_EXT_HE_CAPABILITY:
+		elems->he_cap = data;
+		elems->he_cap_len = len;
+		break;
+	case WLAN_EID_EXT_HE_OPERATION:
+		if (len >= sizeof(*elems->he_operation) &&
+		    len == ieee80211_he_oper_size(data) - 1) {
+			if (crc)
+				*crc = crc32_be(*crc, (void *)elem,
+						elem->datalen + 2);
+			elems->he_operation = data;
+		}
+		break;
+	case WLAN_EID_EXT_UORA:
+		if (len == 1)
+			elems->uora_element = data;
+		break;
+	case WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME:
+		if (len == 3)
+			elems->max_channel_switch_time = data;
+		break;
+	case WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION:
+		if (len == sizeof(*elems->mbssid_config_ie))
+			elems->mbssid_config_ie = data;
+		break;
+	case WLAN_EID_EXT_HE_SPR:
+		if (len >= sizeof(*elems->he_spr) &&
+		    len >= ieee80211_he_spr_size(data))
+			elems->he_spr = data;
+		break;
+	}
+}
+
static u32
_ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
struct ieee802_11_elems *elems,
@@ -950,6 +998,7 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
case WLAN_EID_CHAN_SWITCH_TIMING:
case WLAN_EID_LINK_ID:
case WLAN_EID_BSS_MAX_IDLE_PERIOD:
+ case WLAN_EID_RSNX:
/*
* not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible
* that if the content gets bigger it might be needed more than once
@@ -1063,16 +1112,22 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
elem_parse_failed = true;
break;
case WLAN_EID_VHT_OPERATION:
- if (elen >= sizeof(struct ieee80211_vht_operation))
+ if (elen >= sizeof(struct ieee80211_vht_operation)) {
elems->vht_operation = (void *)pos;
- else
- elem_parse_failed = true;
+ if (calc_crc)
+ crc = crc32_be(crc, pos - 2, elen + 2);
+ break;
+ }
+ elem_parse_failed = true;
break;
case WLAN_EID_OPMODE_NOTIF:
- if (elen > 0)
+ if (elen > 0) {
elems->opmode_notif = pos;
- else
- elem_parse_failed = true;
+ if (calc_crc)
+ crc = crc32_be(crc, pos - 2, elen + 2);
+ break;
+ }
+ elem_parse_failed = true;
break;
case WLAN_EID_MESH_ID:
elems->mesh_id = pos;
@@ -1220,34 +1275,14 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
if (elen >= sizeof(*elems->max_idle_period_ie))
elems->max_idle_period_ie = (void *)pos;
break;
+ case WLAN_EID_RSNX:
+ elems->rsnx = pos;
+ elems->rsnx_len = elen;
+ break;
case WLAN_EID_EXTENSION:
- if (pos[0] == WLAN_EID_EXT_HE_MU_EDCA &&
- elen >= (sizeof(*elems->mu_edca_param_set) + 1)) {
- elems->mu_edca_param_set = (void *)&pos[1];
- if (calc_crc)
- crc = crc32_be(crc, pos - 2, elen + 2);
- } else if (pos[0] == WLAN_EID_EXT_HE_CAPABILITY) {
- elems->he_cap = (void *)&pos[1];
- elems->he_cap_len = elen - 1;
- } else if (pos[0] == WLAN_EID_EXT_HE_OPERATION &&
- elen >= sizeof(*elems->he_operation) &&
- elen >= ieee80211_he_oper_size(&pos[1])) {
- elems->he_operation = (void *)&pos[1];
- } else if (pos[0] == WLAN_EID_EXT_UORA && elen >= 1) {
- elems->uora_element = (void *)&pos[1];
- } else if (pos[0] ==
- WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME &&
- elen == 4) {
- elems->max_channel_switch_time = pos + 1;
- } else if (pos[0] ==
- WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION &&
- elen == 3) {
- elems->mbssid_config_ie = (void *)&pos[1];
- } else if (pos[0] == WLAN_EID_EXT_HE_SPR &&
- elen >= sizeof(*elems->he_spr) &&
- elen >= ieee80211_he_spr_size(&pos[1])) {
- elems->he_spr = (void *)&pos[1];
- }
+ ieee80211_parse_extension_element(calc_crc ?
+ &crc : NULL,
+ elem, elems);
break;
default:
break;
@@ -2987,10 +3022,22 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw,
int cf0, cf1;
int ccfs0, ccfs1, ccfs2;
int ccf0, ccf1;
+ u32 vht_cap;
+ bool support_80_80 = false;
+ bool support_160 = false;
if (!oper || !htop)
return false;
+ vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap;
+ support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK |
+ IEEE80211_VHT_CAP_EXT_NSS_BW_MASK));
+ support_80_80 = ((vht_cap &
+ IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) ||
+ (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ &&
+ vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) ||
+ ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >>
+ IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1));
ccfs0 = oper->center_freq_seg0_idx;
ccfs1 = oper->center_freq_seg1_idx;
ccfs2 = (le16_to_cpu(htop->operation_mode) &
@@ -3018,10 +3065,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw,
unsigned int diff;
diff = abs(ccf1 - ccf0);
- if (diff == 8) {
+ if ((diff == 8) && support_160) {
new.width = NL80211_CHAN_WIDTH_160;
new.center_freq1 = cf1;
- } else if (diff > 8) {
+ } else if ((diff > 8) && support_80_80) {
new.width = NL80211_CHAN_WIDTH_80P80;
new.center_freq2 = cf1;
}
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index ccdcb9ad9ac7..632f07401850 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -333,11 +333,33 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
}
}
+/* FIXME: move this to some better location - parses HE now */
enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
{
struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap;
+ struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap;
u32 cap_width;
+ if (he_cap->has_he) {
+ u8 info = he_cap->he_cap_elem.phy_cap_info[0];
+
+ if (sta->sdata->vif.bss_conf.chandef.chan->band ==
+ NL80211_BAND_2GHZ) {
+ if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G)
+ return IEEE80211_STA_RX_BW_40;
+ else
+ return IEEE80211_STA_RX_BW_20;
+ }
+
+ if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G ||
+ info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G)
+ return IEEE80211_STA_RX_BW_160;
+ else if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G)
+ return IEEE80211_STA_RX_BW_80;
+
+ return IEEE80211_STA_RX_BW_20;
+ }
+
if (!vht_cap->vht_supported)
return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
IEEE80211_STA_RX_BW_40 :
@@ -433,6 +455,7 @@ ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width)
}
}
+/* FIXME: rename/move - this deals with everything not just VHT */
enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
@@ -458,12 +481,40 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta)
void ieee80211_sta_set_rx_nss(struct sta_info *sta)
{
- u8 ht_rx_nss = 0, vht_rx_nss = 0;
+ u8 ht_rx_nss = 0, vht_rx_nss = 0, he_rx_nss = 0, rx_nss;
/* if we received a notification already don't overwrite it */
if (sta->sta.rx_nss)
return;
+ if (sta->sta.he_cap.has_he) {
+ int i;
+ u8 rx_mcs_80 = 0, rx_mcs_160 = 0;
+ const struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap;
+ u16 mcs_160_map =
+ le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160);
+ u16 mcs_80_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80);
+
+ for (i = 7; i >= 0; i--) {
+ u8 mcs_160 = (mcs_160_map >> (2 * i)) & 3;
+
+ if (mcs_160 != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ rx_mcs_160 = i + 1;
+ break;
+ }
+ }
+ for (i = 7; i >= 0; i--) {
+ u8 mcs_80 = (mcs_80_map >> (2 * i)) & 3;
+
+ if (mcs_80 != IEEE80211_VHT_MCS_NOT_SUPPORTED) {
+ rx_mcs_80 = i + 1;
+ break;
+ }
+ }
+
+ he_rx_nss = min(rx_mcs_80, rx_mcs_160);
+ }
+
if (sta->sta.ht_cap.ht_supported) {
if (sta->sta.ht_cap.mcs.rx_mask[0])
ht_rx_nss++;
@@ -493,8 +544,9 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
/* FIXME: consider rx_highest? */
}
- ht_rx_nss = max(ht_rx_nss, vht_rx_nss);
- sta->sta.rx_nss = max_t(u8, 1, ht_rx_nss);
+ rx_nss = max(vht_rx_nss, ht_rx_nss);
+ rx_nss = max(he_rx_nss, rx_nss);
+ sta->sta.rx_nss = max_t(u8, 1, rx_nss);
}
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index b75c2c54e665..9a6e11d7b4db 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -22,12 +22,10 @@
#include "wep.h"
-int ieee80211_wep_init(struct ieee80211_local *local)
+void ieee80211_wep_init(struct ieee80211_local *local)
{
/* start WEP IV from a random value */
get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN);
-
- return 0;
}
static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen)
diff --git a/net/mac80211/wep.h b/net/mac80211/wep.h
index 997a034233c2..4ffe83554c67 100644
--- a/net/mac80211/wep.h
+++ b/net/mac80211/wep.h
@@ -13,7 +13,7 @@
#include "ieee80211_i.h"
#include "key.h"
-int ieee80211_wep_init(struct ieee80211_local *local);
+void ieee80211_wep_init(struct ieee80211_local *local);
int ieee80211_wep_encrypt_data(struct arc4_ctx *ctx, u8 *rc4key,
size_t klen, u8 *data, size_t data_len);
int ieee80211_wep_encrypt(struct ieee80211_local *local,
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 768a302879b4..0e9aa94adc07 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -98,7 +98,7 @@ struct mpls_nh { /* next hop label forwarding entry */
u8 nh_via_table;
u8 nh_reserved1;
- u32 nh_label[0];
+ u32 nh_label[];
};
/* offset of via from beginning of mpls_nh */
@@ -154,7 +154,7 @@ struct mpls_route { /* next hop label forwarding entry */
u8 rt_nh_size;
u8 rt_via_offset;
u8 rt_reserved1;
- struct mpls_nh rt_nh[0];
+ struct mpls_nh rt_nh[];
};
#define for_nexthops(rt) { \
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 44b675016393..2def85718d94 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -162,7 +162,7 @@ drop:
return -EINVAL;
}
-static int mpls_build_state(struct nlattr *nla,
+static int mpls_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
index 49f6054e7f4e..a9ed3bf1d93f 100644
--- a/net/mptcp/Kconfig
+++ b/net/mptcp/Kconfig
@@ -4,6 +4,7 @@ config MPTCP
depends on INET
select SKB_EXTENSIONS
select CRYPTO_LIB_SHA256
+ select CRYPTO
help
Multipath TCP (MPTCP) connections send and receive data over multiple
subflows in order to utilize multiple network paths. Each subflow
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index 4e98d9edfd0a..baa0640527c7 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_MPTCP) += mptcp.o
-mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o
+mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
+ mib.o pm_netlink.o
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index 40d1bb18fd60..c151628bd416 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
*idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6]));
}
-void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
- void *hmac)
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
{
u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
__be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
@@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
u8 key2be[8];
int i;
+ if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
+ len = SHA256_DIGEST_SIZE;
+
put_unaligned_be64(key1, key1be);
put_unaligned_be64(key2, key2be);
@@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
for (i = 0; i < 8; i++)
input[i + 8] ^= key2be[i];
- put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]);
- put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]);
+ memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
sha256_init(&state);
- sha256_update(&state, input, SHA256_BLOCK_SIZE + 8);
+ sha256_update(&state, input, SHA256_BLOCK_SIZE + len);
/* emit sha256(K1 || msg) on the second input block, so we can
* reuse 'input' for the last hashing
@@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void)
char hmac[20], hmac_hex[41];
u32 nonce1, nonce2;
u64 key1, key2;
+ u8 msg[8];
int i, j;
for (i = 0; i < ARRAY_SIZE(tests); ++i) {
@@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void)
nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
- mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac);
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
for (j = 0; j < 20; ++j)
sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
hmac_hex[40] = 0;
diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
new file mode 100644
index 000000000000..a536586742f2
--- /dev/null
+++ b/net/mptcp/diag.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP socket monitoring support
+ *
+ * Copyright (c) 2019 Red Hat
+ *
+ * Author: Davide Caratti <dcaratti@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet_diag.h>
+#include <net/netlink.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+
+/*
+ * Dump the MPTCP subflow context of @sk into @skb as a nested
+ * INET_ULP_INFO_MPTCP attribute.
+ *
+ * Returns 0 on success, and also when the socket has no subflow context (in
+ * which case the nest is cancelled and nothing is emitted). Returns
+ * -EMSGSIZE when the nest or any attribute does not fit into @skb.
+ */
+static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
+{
+	struct mptcp_subflow_context *sf;
+	struct nlattr *nest;
+	u32 flags = 0;
+	int err = 0;
+
+	nest = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
+	if (!nest)
+		return -EMSGSIZE;
+
+	rcu_read_lock();
+	sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
+	if (!sf)
+		goto cancel;	/* nothing to report, err stays 0 */
+
+	/* fold the per-subflow state bits into one flags word */
+	if (sf->mp_capable)
+		flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM;
+	if (sf->request_mptcp)
+		flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC;
+	if (sf->mp_join)
+		flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM;
+	if (sf->request_join)
+		flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC;
+	if (sf->backup)
+		flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM;
+	if (sf->request_bkup)
+		flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC;
+	if (sf->fully_established)
+		flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED;
+	if (sf->conn_finished)
+		flags |= MPTCP_SUBFLOW_FLAG_CONNECTED;
+	if (sf->map_valid)
+		flags |= MPTCP_SUBFLOW_FLAG_MAPVALID;
+
+	err = -EMSGSIZE;
+	if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) ||
+	    nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) ||
+	    nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
+			sf->rel_write_seq) ||
+	    nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq,
+			      MPTCP_SUBFLOW_ATTR_PAD) ||
+	    nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
+			sf->map_subflow_seq) ||
+	    nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) ||
+	    nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
+			sf->map_data_len) ||
+	    nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) ||
+	    nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) ||
+	    nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id))
+		goto cancel;
+
+	rcu_read_unlock();
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	rcu_read_unlock();
+	nla_nest_cancel(skb, nest);
+	return err;
+}
+
+/*
+ * Upper bound on the netlink space subflow_get_info() needs.
+ *
+ * Every entry mirrors one attribute emitted by subflow_get_info() and must
+ * use the same payload width as the matching nla_put_*() call there.
+ */
+static size_t subflow_get_info_size(const struct sock *sk)
+{
+	size_t size = 0;
+
+	size += nla_total_size(0) +	/* INET_ULP_INFO_MPTCP */
+		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_TOKEN_REM */
+		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */
+		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
+		nla_total_size_64bit(8) +	/* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
+		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
+		/* emitted with nla_put_u32(), so 4 bytes, not 2 */
+		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
+		nla_total_size(2) +	/* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
+		nla_total_size(4) +	/* MPTCP_SUBFLOW_ATTR_FLAGS */
+		nla_total_size(1) +	/* MPTCP_SUBFLOW_ATTR_ID_REM */
+		nla_total_size(1) +	/* MPTCP_SUBFLOW_ATTR_ID_LOC */
+		0;
+	return size;
+}
+
+/* Install the subflow info callbacks (size estimate and dump) on @ops. */
+void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops)
+{
+	ops->get_info_size = subflow_get_info_size;
+	ops->get_info = subflow_get_info;
+}
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
new file mode 100644
index 000000000000..0a6a15f3456d
--- /dev/null
+++ b/net/mptcp/mib.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/mptcp.h>
+#include <net/snmp.h>
+#include <net/net_namespace.h>
+
+#include "mib.h"
+
+/*
+ * Table pairing each exported counter name with its MPTCP_MIB_* index.
+ * mptcp_seq_show() walks it in order until it reaches an entry without a
+ * name (presumably the one provided by SNMP_MIB_SENTINEL - confirm against
+ * the macro definition), so the order here is the order of the output.
+ */
+static const struct snmp_mib mptcp_snmp_list[] = {
+ SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
+ SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
+ SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
+ SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+ SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
+ SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
+ SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
+ SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
+ SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
+ SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+ SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
+ SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+ SNMP_MIB_SENTINEL
+};
+
+/* mptcp_mib_alloc - make sure the per-netns MPTCP counters exist
+ *
+ * The percpu counters are set up lazily, when the first mptcp socket is
+ * created, so that no percpu memory is spent while mptcp is unused.
+ *
+ * Returns false only if the percpu allocation fails; true otherwise,
+ * including when the counters were already installed.
+ */
+bool mptcp_mib_alloc(struct net *net)
+{
+	struct mptcp_mib __percpu *fresh;
+
+	fresh = alloc_percpu(struct mptcp_mib);
+	if (!fresh)
+		return false;
+
+	/* counters were already installed: keep those, drop our copy */
+	if (cmpxchg(&net->mib.mptcp_statistics, NULL, fresh) != NULL)
+		free_percpu(fresh);
+
+	return true;
+}
+
+/*
+ * Print the MPTCP counters of the netns attached to @seq as two
+ * "MPTcpExt:" lines: the counter names, then the matching values.
+ *
+ * When the counters have not been allocated yet, every value is printed as
+ * 0. Both forms end the value line with a newline so the output stays
+ * line-oriented.
+ */
+void mptcp_seq_show(struct seq_file *seq)
+{
+	struct net *net = seq->private;
+	int i;
+
+	seq_puts(seq, "MPTcpExt:");
+	for (i = 0; mptcp_snmp_list[i].name; i++)
+		seq_printf(seq, " %s", mptcp_snmp_list[i].name);
+
+	seq_puts(seq, "\nMPTcpExt:");
+
+	if (!net->mib.mptcp_statistics) {
+		for (i = 0; mptcp_snmp_list[i].name; i++)
+			seq_puts(seq, " 0");
+
+		/* terminate the line here too, as the normal path does */
+		seq_putc(seq, '\n');
+		return;
+	}
+
+	for (i = 0; mptcp_snmp_list[i].name; i++)
+		seq_printf(seq, " %lu",
+			   snmp_fold_field(net->mib.mptcp_statistics,
+					   mptcp_snmp_list[i].entry));
+	seq_putc(seq, '\n');
+}
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
new file mode 100644
index 000000000000..d7de340fc997
--- /dev/null
+++ b/net/mptcp/mib.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/*
+ * Indices into struct mptcp_mib::mibs[], one per MPTCP counter.
+ * MPTCP_MIB_NUM (index 0) has no entry in mptcp_snmp_list and is therefore
+ * not printed; __MPTCP_MIB_MAX is the number of slots, not a counter.
+ */
+enum linux_mptcp_mib_field {
+ MPTCP_MIB_NUM = 0,
+ MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
+ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
+ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
+ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
+ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
+ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
+ __MPTCP_MIB_MAX
+};
+
+/* Number of slots in the counter array: one per linux_mptcp_mib_field value. */
+#define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX
+/* MPTCP counter block, indexed by enum linux_mptcp_mib_field. */
+struct mptcp_mib {
+ unsigned long mibs[LINUX_MIB_MPTCP_MAX];
+};
+
+/*
+ * Increment MPTCP counter @field for @net via SNMP_INC_STATS().
+ * Does nothing while the per-netns counters have not been allocated.
+ */
+static inline void MPTCP_INC_STATS(struct net *net,
+				   enum linux_mptcp_mib_field field)
+{
+	if (unlikely(!net->mib.mptcp_statistics))
+		return;
+
+	SNMP_INC_STATS(net->mib.mptcp_statistics, field);
+}
+
+/*
+ * Same as MPTCP_INC_STATS() but built on __SNMP_INC_STATS().
+ * Does nothing while the per-netns counters have not been allocated.
+ */
+static inline void __MPTCP_INC_STATS(struct net *net,
+				     enum linux_mptcp_mib_field field)
+{
+	if (unlikely(!net->mib.mptcp_statistics))
+		return;
+
+	__SNMP_INC_STATS(net->mib.mptcp_statistics, field);
+}
+
+bool mptcp_mib_alloc(struct net *net);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 45acd877bef3..bd220ee4aac9 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -96,6 +96,38 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
mp_opt->rcvr_key, mp_opt->data_len);
break;
+ case MPTCPOPT_MP_JOIN:
+ mp_opt->mp_join = 1;
+ if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
+ mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
+ mp_opt->join_id = *ptr++;
+ mp_opt->token = get_unaligned_be32(ptr);
+ ptr += 4;
+ mp_opt->nonce = get_unaligned_be32(ptr);
+ ptr += 4;
+ pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
+ mp_opt->backup, mp_opt->join_id,
+ mp_opt->token, mp_opt->nonce);
+ } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
+ mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
+ mp_opt->join_id = *ptr++;
+ mp_opt->thmac = get_unaligned_be64(ptr);
+ ptr += 8;
+ mp_opt->nonce = get_unaligned_be32(ptr);
+ ptr += 4;
+ pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ mp_opt->backup, mp_opt->join_id,
+ mp_opt->thmac, mp_opt->nonce);
+ } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
+ ptr += 2;
+ memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
+ pr_debug("MP_JOIN hmac");
+ } else {
+ pr_warn("MP_JOIN bad option size");
+ mp_opt->mp_join = 0;
+ }
+ break;
+
case MPTCPOPT_DSS:
pr_debug("DSS");
ptr++;
@@ -178,6 +210,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
break;
+ case MPTCPOPT_ADD_ADDR:
+ mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
+ if (!mp_opt->echo) {
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_4;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_6;
+#endif
+ else
+ break;
+ } else {
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_4;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_6;
+#endif
+ else
+ break;
+ }
+
+ mp_opt->add_addr = 1;
+ mp_opt->port = 0;
+ mp_opt->addr_id = *ptr++;
+ pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id);
+ if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
+ memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
+ ptr += 4;
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
+ mp_opt->port = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else {
+ memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
+ ptr += 16;
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
+ mp_opt->port = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ }
+#endif
+ if (!mp_opt->echo) {
+ mp_opt->ahmac = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ break;
+
+ case MPTCPOPT_RM_ADDR:
+ if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
+ break;
+
+ mp_opt->rm_addr = 1;
+ mp_opt->rm_id = *ptr++;
+ pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
+ break;
+
default:
break;
}
@@ -231,6 +328,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
opts->sndr_key = subflow->local_key;
*size = TCPOLEN_MPTCP_MPC_SYN;
return true;
+ } else if (subflow->request_join) {
+ pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+ subflow->local_nonce);
+ opts->suboptions = OPTION_MPTCP_MPJ_SYN;
+ opts->join_id = subflow->local_id;
+ opts->token = subflow->remote_token;
+ opts->nonce = subflow->local_nonce;
+ opts->backup = subflow->request_bkup;
+ *size = TCPOLEN_MPTCP_MPJ_SYN;
+ return true;
}
return false;
}
@@ -240,16 +347,55 @@ void mptcp_rcv_synsent(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct tcp_sock *tp = tcp_sk(sk);
- pr_debug("subflow=%p", subflow);
if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
subflow->mp_capable = 1;
subflow->can_ack = 1;
subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
- } else {
+ pr_debug("subflow=%p, remote_key=%llu", subflow,
+ subflow->remote_key);
+ } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) {
+ subflow->mp_join = 1;
+ subflow->thmac = tp->rx_opt.mptcp.thmac;
+ subflow->remote_nonce = tp->rx_opt.mptcp.nonce;
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
+ subflow->thmac, subflow->remote_nonce);
+ } else if (subflow->request_mptcp) {
tcp_sk(sk)->is_mptcp = 0;
}
}
+/* MP_JOIN client subflow must wait for 4th ack before sending any data:
+ * TCP can't schedule delack timer before the subflow is fully established.
+ * MPTCP uses the delack timer to do 3rd ack retransmissions
+ *
+ * Arm the delayed-ack timer of @sk and flag an ack as scheduled, so that
+ * the 3rd ack is sent again if nothing clears the timer first (see
+ * clear_3rdack_retransmission()).
+ */
+static void schedule_3rdack_retransmission(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long timeout;
+
+	/* reschedule with a timeout above RTT, as we must look only for drop.
+	 * srtt_us is the smoothed RTT in usec, stored left-shifted by 3:
+	 * shifting right by (3 - 1) gives twice the RTT in usec, which is
+	 * then converted to jiffies.
+	 */
+	if (tp->srtt_us)
+		timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1));
+	else
+		timeout = TCP_TIMEOUT_INIT;
+
+	/* the timer helpers take an absolute expiry, not a relative delay */
+	timeout += jiffies;
+
+	WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
+	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+	icsk->icsk_ack.timeout = timeout;
+	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+/* Undo schedule_3rdack_retransmission(): stop the delayed-ack timer of
+ * @sk and reset the related ack bookkeeping.
+ */
+static void clear_3rdack_retransmission(struct sock *sk)
+{
+	struct inet_connection_sock *csk = inet_csk(sk);
+
+	sk_stop_timer(sk, &csk->icsk_delack_timer);
+
+	/* no ack is pending any more: wipe timeout, ato and the flags */
+	csk->icsk_ack.timeout = 0;
+	csk->icsk_ack.ato = 0;
+	csk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
+}
+
static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
unsigned int *size,
unsigned int remaining,
@@ -259,17 +405,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
struct mptcp_ext *mpext;
unsigned int data_len;
- pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow,
- subflow->fourth_ack, subflow->snd_isn,
- skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
+ /* When skb is not available, we better over-estimate the emitted
+ * options len. A full DSS option (28 bytes) is longer than
+ * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
+ * tell the caller to defer the estimate to
+ * mptcp_established_options_dss(), which will reserve enough space.
+ */
+ if (!skb)
+ return false;
- if (subflow->mp_capable && !subflow->fourth_ack && skb &&
- subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
- /* When skb is not available, we better over-estimate the
- * emitted options len. A full DSS option is longer than
- * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
- * that.
- */
+ /* MPC/MPJ needed only on 3rd ack packet */
+ if (subflow->fully_established ||
+ subflow->snd_isn != TCP_SKB_CB(skb)->seq)
+ return false;
+
+ if (subflow->mp_capable) {
mpext = mptcp_get_ext(skb);
data_len = mpext ? mpext->data_len : 0;
@@ -297,6 +447,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
data_len);
return true;
+ } else if (subflow->mp_join) {
+ opts->suboptions = OPTION_MPTCP_MPJ_ACK;
+ memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
+ *size = TCPOLEN_MPTCP_MPJ_ACK;
+ pr_debug("subflow=%p", subflow);
+
+ schedule_3rdack_retransmission(sk);
+ return true;
}
return false;
}
@@ -304,21 +462,22 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
struct mptcp_ext *ext)
{
- ext->data_fin = 1;
-
if (!ext->use_map) {
/* RFC6824 requires a DSS mapping with specific values
* if DATA_FIN is set but no data payload is mapped
*/
+ ext->data_fin = 1;
ext->use_map = 1;
ext->dsn64 = 1;
- ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
+ ext->data_seq = subflow->data_fin_tx_seq;
ext->subflow_seq = 0;
ext->data_len = 1;
- } else {
- /* If there's an existing DSS mapping, DATA_FIN consumes
- * 1 additional byte of mapping space.
+ } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) {
+ /* If there's an existing DSS mapping and it is the
+ * final mapping, DATA_FIN consumes 1 additional byte of
+ * mapping space.
*/
+ ext->data_fin = 1;
ext->data_len++;
}
}
@@ -354,15 +513,17 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
if (mpext)
opts->ext_copy = *mpext;
- if (skb && tcp_fin &&
- subflow->conn->sk_state != TCP_ESTABLISHED)
+ if (skb && tcp_fin && subflow->data_fin_tx_enable)
mptcp_write_data_fin(subflow, &opts->ext_copy);
ret = true;
}
+ /* passive sockets msk will set the 'can_ack' after accept(), even
+ * if the first subflow may already have the remote key handy
+ */
opts->ext_copy.use_ack = 0;
msk = mptcp_sk(subflow->conn);
- if (!msk || !READ_ONCE(msk->can_ack)) {
+ if (!READ_ONCE(msk->can_ack)) {
*size = ALIGN(dss_size, 4);
return ret;
}
@@ -383,6 +544,83 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return true;
}
+/* Compute the ADD_ADDR HMAC for an IPv4 address.
+ *
+ * The hashed message is 7 bytes long: the address id, the 4 address
+ * bytes and two trailing zero bytes. The returned value is the first 8
+ * bytes of the digest, read as big endian.
+ */
+static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
+				  struct in_addr *addr)
+{
+	u8 digest[MPTCP_ADDR_HMAC_LEN];
+	u8 buf[7];
+
+	buf[0] = addr_id;
+	memcpy(buf + 1, &addr->s_addr, 4);
+	buf[5] = 0;
+	buf[6] = 0;
+
+	mptcp_crypto_hmac_sha(key1, key2, buf, sizeof(buf), digest);
+
+	return get_unaligned_be64(digest);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+/* IPv6 flavour of add_addr_generate_hmac().
+ *
+ * The hashed message is 19 bytes long: the address id, the 16 address
+ * bytes and two trailing zero bytes. The returned value is the first 8
+ * bytes of the digest, read as big endian.
+ */
+static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
+				   struct in6_addr *addr)
+{
+	u8 digest[MPTCP_ADDR_HMAC_LEN];
+	u8 buf[19];
+
+	buf[0] = addr_id;
+	memcpy(buf + 1, &addr->s6_addr, 16);
+	buf[17] = 0;
+	buf[18] = 0;
+
+	mptcp_crypto_hmac_sha(key1, key2, buf, sizeof(buf), digest);
+
+	return get_unaligned_be64(digest);
+}
+#endif
+
+/* Fill @opts with an ADD_ADDR option when the path manager has an
+ * address to signal and it fits in @remaining bytes.
+ *
+ * On success *@size is set to the option length, the matching
+ * OPTION_MPTCP_ADD_ADDR{,6} bit is OR-ed into opts->suboptions and true
+ * is returned; otherwise false is returned.
+ */
+static bool mptcp_established_options_addr(struct sock *sk,
+					   unsigned int *size,
+					   unsigned int remaining,
+					   struct mptcp_out_options *opts)
+{
+	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
+	struct mptcp_sock *msk = mptcp_sk(ctx->conn);
+	struct mptcp_addr_info announce;
+	int opt_len;
+
+	/* lockless hint first, then fetch the address from the PM */
+	if (!mptcp_pm_should_signal(msk))
+		return false;
+	if (!mptcp_pm_addr_signal(msk, remaining, &announce))
+		return false;
+
+	opt_len = mptcp_add_addr_len(announce.family);
+	if (remaining < opt_len)
+		return false;
+
+	*size = opt_len;
+	opts->addr_id = announce.id;
+
+	switch (announce.family) {
+	case AF_INET:
+		opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
+		opts->addr = announce.addr;
+		opts->ahmac = add_addr_generate_hmac(msk->local_key,
+						     msk->remote_key,
+						     opts->addr_id,
+						     &opts->addr);
+		break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	case AF_INET6:
+		opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
+		opts->addr6 = announce.addr6;
+		opts->ahmac = add_addr6_generate_hmac(msk->local_key,
+						      msk->remote_key,
+						      opts->addr_id,
+						      &opts->addr6);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac);
+
+	return true;
+}
+
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts)
@@ -390,6 +628,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int opt_size = 0;
bool ret = false;
+ opts->suboptions = 0;
+
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -404,6 +644,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
*size += opt_size;
remaining -= opt_size;
+ if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ }
return ret;
}
@@ -420,54 +665,194 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
pr_debug("subflow_req=%p, local_key=%llu",
subflow_req, subflow_req->local_key);
return true;
+ } else if (subflow_req->mp_join) {
+ opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
+ opts->backup = subflow_req->backup;
+ opts->join_id = subflow_req->local_id;
+ opts->thmac = subflow_req->thmac;
+ opts->nonce = subflow_req->local_nonce;
+ pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ subflow_req, opts->backup, opts->join_id,
+ opts->thmac, opts->nonce);
+ *size = TCPOLEN_MPTCP_MPJ_SYNACK;
+ return true;
}
return false;
}
-static bool check_fourth_ack(struct mptcp_subflow_context *subflow,
- struct sk_buff *skb,
- struct mptcp_options_received *mp_opt)
+static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
+ struct mptcp_subflow_context *subflow,
+ struct sk_buff *skb,
+ struct mptcp_options_received *mp_opt)
{
/* here we can process OoO, in-window pkts, only in-sequence 4th ack
- * are relevant
+ * will make the subflow fully established
*/
- if (likely(subflow->fourth_ack ||
- TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
- return true;
+ if (likely(subflow->fully_established)) {
+ /* on passive sockets, check for 3rd ack retransmission
+ * note that msk is always set by subflow_syn_recv_sock()
+ * for mp_join subflows
+ */
+ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
+ subflow->mp_join && mp_opt->mp_join &&
+ READ_ONCE(msk->pm.server_side))
+ tcp_send_ack(sk);
+ goto fully_established;
+ }
- if (mp_opt->use_ack)
- subflow->fourth_ack = 1;
+ /* we should process OoO packets before the first subflow is fully
+ * established, but not expected for MP_JOIN subflows
+ */
+ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
+ return subflow->mp_capable;
- if (subflow->can_ack)
- return true;
+ if (mp_opt->use_ack) {
+ /* subflows are fully established as soon as we get any
+ * additional ack.
+ */
+ subflow->fully_established = 1;
+ goto fully_established;
+ }
+
+ WARN_ON_ONCE(subflow->can_ack);
/* If the first established packet does not contain MP_CAPABLE + data
* then fallback to TCP
*/
if (!mp_opt->mp_capable) {
subflow->mp_capable = 0;
- tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0;
+ tcp_sk(sk)->is_mptcp = 0;
return false;
}
+
+ subflow->fully_established = 1;
subflow->remote_key = mp_opt->sndr_key;
subflow->can_ack = 1;
+
+fully_established:
+ if (likely(subflow->pm_notified))
+ return true;
+
+ subflow->pm_notified = 1;
+ if (subflow->mp_join) {
+ clear_3rdack_retransmission(sk);
+ mptcp_pm_subflow_established(msk, subflow);
+ } else {
+ mptcp_pm_fully_established(msk);
+ }
return true;
}
+/* Expand a received data ack to 64 bits.
+ *
+ * With @use_64bit the value is already complete and is returned as is.
+ * Otherwise the low 32 bits of @cur_ack are combined with the high 32
+ * bits of @old_ack; if the new low half is before the old one, the
+ * result is bumped by 2^32 to account for the wrap.
+ */
+static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
+{
+	u32 old_low, cur_low;
+	u64 expanded;
+
+	if (use_64bit)
+		return cur_ack;
+
+	old_low = (u32)old_ack;
+	cur_low = (u32)cur_ack;
+
+	expanded = (old_ack & GENMASK_ULL(63, 32)) + cur_low;
+	if (unlikely(before(cur_low, old_low)))
+		expanded += 1LL << 32;
+
+	return expanded;
+}
+
+/* Advance msk->snd_una according to the data ack carried by @mp_opt.
+ *
+ * snd_una only moves forward and never beyond msk->write_seq; the update
+ * is done with a cmpxchg loop and mptcp_data_acked() is invoked by
+ * whoever wins the update.
+ */
+static void update_una(struct mptcp_sock *msk,
+		       struct mptcp_options_received *mp_opt)
+{
+	u64 seen = atomic64_read(&msk->snd_una);
+	u64 write_seq = READ_ONCE(msk->write_seq);
+	u64 target, expected;
+
+	/* avoid ack expansion on update conflict, to reduce the risk of
+	 * wrongly expanding to a future ack sequence number, which is way
+	 * more dangerous than missing an ack
+	 */
+	target = expand_ack(seen, mp_opt->data_ack, mp_opt->ack64);
+
+	/* ACK for data not even sent yet? Ignore. */
+	if (after64(target, write_seq))
+		target = seen;
+
+	for (;;) {
+		/* somebody else already moved snd_una at or past target */
+		if (!after64(target, seen))
+			return;
+
+		expected = seen;
+		seen = atomic64_cmpxchg(&msk->snd_una, expected, target);
+		if (seen == expected) {
+			mptcp_data_acked((struct sock *)msk);
+			return;
+		}
+	}
+}
+
+/* Check the HMAC carried by a received ADD_ADDR option.
+ *
+ * Echoed options are accepted without a check. Otherwise the HMAC is
+ * recomputed with the remote key first and the local key second, and
+ * compared with the received one.
+ */
+static bool add_addr_hmac_valid(struct mptcp_sock *msk,
+				struct mptcp_options_received *mp_opt)
+{
+	u64 expected = 0;
+
+	if (mp_opt->echo)
+		return true;
+
+	if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
+		expected = add_addr_generate_hmac(msk->remote_key,
+						  msk->local_key,
+						  mp_opt->addr_id,
+						  &mp_opt->addr);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	else
+		expected = add_addr6_generate_hmac(msk->remote_key,
+						   msk->local_key,
+						   mp_opt->addr_id,
+						   &mp_opt->addr6);
+#endif
+
+	pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
+		 msk, (unsigned long long)expected,
+		 (unsigned long long)mp_opt->ahmac);
+
+	return expected == mp_opt->ahmac;
+}
+
void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
struct tcp_options_received *opt_rx)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_options_received *mp_opt;
struct mptcp_ext *mpext;
mp_opt = &opt_rx->mptcp;
- if (!check_fourth_ack(subflow, skb, mp_opt))
+ if (!check_fully_established(msk, sk, subflow, skb, mp_opt))
return;
+ if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) {
+ struct mptcp_addr_info addr;
+
+ addr.port = htons(mp_opt->port);
+ addr.id = mp_opt->addr_id;
+ if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
+ addr.family = AF_INET;
+ addr.addr = mp_opt->addr;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) {
+ addr.family = AF_INET6;
+ addr.addr6 = mp_opt->addr6;
+ }
+#endif
+ if (!mp_opt->echo)
+ mptcp_pm_add_addr_received(msk, &addr);
+ mp_opt->add_addr = 0;
+ }
+
if (!mp_opt->dss)
return;
+ /* we can't wait for recvmsg() to update the ack_seq, otherwise
+ * monodirectional flows will get stuck
+ */
+ if (mp_opt->use_ack)
+ update_una(msk, mp_opt);
+
mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext)
return;
@@ -494,12 +879,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
mpext->use_map = 1;
}
- if (mp_opt->use_ack) {
- mpext->data_ack = mp_opt->data_ack;
- mpext->use_ack = 1;
- mpext->ack64 = mp_opt->ack64;
- }
-
mpext->data_fin = mp_opt->data_fin;
}
@@ -518,10 +897,9 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
else
len = TCPOLEN_MPTCP_MPC_ACK;
- *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
- (MPTCPOPT_MP_CAPABLE << 12) |
- (MPTCP_SUPPORTED_VERSION << 8) |
- MPTCP_CAP_HMAC_SHA256);
+ *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
+ MPTCP_SUPPORTED_VERSION,
+ MPTCP_CAP_HMAC_SHA256);
if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
opts->suboptions))
@@ -543,6 +921,77 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
}
mp_capable_done:
+ if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
+ if (opts->ahmac)
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR, 0,
+ opts->addr_id);
+ else
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR_BASE,
+ MPTCP_ADDR_ECHO,
+ opts->addr_id);
+ memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
+ ptr += 1;
+ if (opts->ahmac) {
+ put_unaligned_be64(opts->ahmac, ptr);
+ ptr += 2;
+ }
+ }
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
+ if (opts->ahmac)
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR6, 0,
+ opts->addr_id);
+ else
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR6_BASE,
+ MPTCP_ADDR_ECHO,
+ opts->addr_id);
+ memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
+ ptr += 4;
+ if (opts->ahmac) {
+ put_unaligned_be64(opts->ahmac, ptr);
+ ptr += 2;
+ }
+ }
+#endif
+
+ if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
+ TCPOLEN_MPTCP_RM_ADDR_BASE,
+ 0, opts->rm_id);
+ }
+
+ if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYN,
+ opts->backup, opts->join_id);
+ put_unaligned_be32(opts->token, ptr);
+ ptr += 1;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ }
+
+ if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYNACK,
+ opts->backup, opts->join_id);
+ put_unaligned_be64(opts->thmac, ptr);
+ ptr += 2;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ }
+
+ if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
+ memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
+ ptr += 5;
+ }
+
if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
struct mptcp_ext *mpext = &opts->ext_copy;
u8 len = TCPOLEN_MPTCP_DSS_BASE;
@@ -564,10 +1013,7 @@ mp_capable_done:
flags |= MPTCP_DSS_DATA_FIN;
}
- *ptr++ = htonl((TCPOPT_MPTCP << 24) |
- (len << 16) |
- (MPTCPOPT_DSS << 12) |
- (flags));
+ *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
if (mpext->use_ack) {
put_unaligned_be64(mpext->data_ack, ptr);
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
new file mode 100644
index 000000000000..064639f72487
--- /dev/null
+++ b/net/mptcp/pm.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+static struct workqueue_struct *pm_wq;
+
+/* path manager command handlers */
+
+/* Record @addr as the local address to announce and raise the
+ * addr_signal flag. Always returns 0.
+ */
+int mptcp_pm_announce_addr(struct mptcp_sock *msk,
+			   const struct mptcp_addr_info *addr)
+{
+	struct mptcp_pm_data *pm = &msk->pm;
+
+	pr_debug("msk=%p, local_id=%d", msk, addr->id);
+
+	pm->local = *addr;
+	WRITE_ONCE(pm->addr_signal, true);
+
+	return 0;
+}
+
+/* Address removal is not implemented: always fails with -ENOTSUPP. */
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id)
+{
+	return -ENOTSUPP;
+}
+
+/* Subflow removal is not implemented: always fails with -ENOTSUPP. */
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id)
+{
+	return -ENOTSUPP;
+}
+
+/* path manager event handlers */
+
+/* New MPTCP connection event: remember which side of the connection
+ * this msk is on.
+ */
+void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
+{
+	pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
+
+	WRITE_ONCE(msk->pm.server_side, server_side);
+}
+
+/* Account for one more subflow on @msk if the limit allows it.
+ *
+ * Returns true, bumping pm->subflows, when the counter is still below
+ * subflows_max. accept_subflow is cleared once the limit is reached.
+ */
+bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
+{
+	struct mptcp_pm_data *pm = &msk->pm;
+	bool allowed = false;
+
+	pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
+		 pm->subflows_max, READ_ONCE(pm->accept_subflow));
+
+	/* try to avoid acquiring the lock below */
+	if (!READ_ONCE(pm->accept_subflow))
+		return false;
+
+	spin_lock_bh(&pm->lock);
+	if (pm->subflows < pm->subflows_max) {
+		allowed = true;
+		pm->subflows++;
+		if (pm->subflows == pm->subflows_max)
+			WRITE_ONCE(pm->accept_subflow, false);
+	}
+	spin_unlock_bh(&pm->lock);
+
+	return allowed;
+}
+
+/* return true if the new status bit is currently cleared, that is, this event
+ * can be served, possibly by an already scheduled work.
+ *
+ * A reference to the msk is taken whenever the work is actually queued;
+ * pm_worker() drops it. The callers in this file hold pm.lock.
+ */
+static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
+				   enum mptcp_pm_status new_status)
+{
+	unsigned long event = BIT(new_status);
+
+	pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
+		 event);
+
+	/* event already pending, nothing more to record */
+	if (msk->pm.status & event)
+		return false;
+
+	msk->pm.status |= event;
+	if (queue_work(pm_wq, &msk->pm.work))
+		sock_hold((struct sock *)msk);
+
+	return true;
+}
+
+/* The connection became fully established: schedule the PM worker with
+ * the MPTCP_PM_ESTABLISHED event, if the PM still has work pending.
+ */
+void mptcp_pm_fully_established(struct mptcp_sock *msk)
+{
+	pr_debug("msk=%p", msk);
+
+	/* try to avoid acquiring the lock below */
+	if (!READ_ONCE(msk->pm.work_pending))
+		return;
+
+	spin_lock_bh(&msk->pm.lock);
+
+	/* re-check under the lock before scheduling */
+	if (READ_ONCE(msk->pm.work_pending))
+		mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
+
+	spin_unlock_bh(&msk->pm.lock);
+}
+
+/* Connection closed event: currently only traced. */
+void mptcp_pm_connection_closed(struct mptcp_sock *msk)
+{
+	pr_debug("msk=%p", msk);
+}
+
+/* An additional subflow became established: schedule the PM worker with
+ * the MPTCP_PM_SUBFLOW_ESTABLISHED event, if the PM still has work
+ * pending. @subflow is currently unused.
+ */
+void mptcp_pm_subflow_established(struct mptcp_sock *msk,
+				  struct mptcp_subflow_context *subflow)
+{
+	pr_debug("msk=%p", msk);
+
+	/* lockless hint, re-checked under the PM lock below */
+	if (!READ_ONCE(msk->pm.work_pending))
+		return;
+
+	spin_lock_bh(&msk->pm.lock);
+
+	if (READ_ONCE(msk->pm.work_pending))
+		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
+
+	spin_unlock_bh(&msk->pm.lock);
+}
+
+/* Subflow closed event: currently only traced, @id is unused. */
+void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
+{
+	pr_debug("msk=%p", msk);
+}
+
+/* The peer announced @addr: if further addresses are still accepted,
+ * schedule the PM worker and store the address in pm->remote for it.
+ * The address is stored only when the event was not already pending.
+ */
+void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
+				const struct mptcp_addr_info *addr)
+{
+	struct mptcp_pm_data *pm = &msk->pm;
+
+	pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
+		 READ_ONCE(pm->accept_addr));
+
+	/* avoid acquiring the lock if there is no room for further addresses */
+	if (!READ_ONCE(pm->accept_addr))
+		return;
+
+	spin_lock_bh(&pm->lock);
+
+	/* be sure there is something to signal re-checking under PM lock */
+	if (READ_ONCE(pm->accept_addr)) {
+		if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED))
+			pm->remote = *addr;
+	}
+
+	spin_unlock_bh(&pm->lock);
+}
+
+/* path manager helpers */
+
+/* Hand the address pending announcement over to the caller.
+ *
+ * Under the PM lock, re-check that an announcement is still pending and
+ * that an ADD_ADDR option for the pending address family fits in
+ * @remaining bytes. On success the address is copied to @saddr, the
+ * addr_signal flag is cleared and true is returned; otherwise false is
+ * returned and @saddr is left untouched.
+ */
+bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+			  struct mptcp_addr_info *saddr)
+{
+	/* bool, matching both the return type and the values assigned */
+	bool ret = false;
+
+	spin_lock_bh(&msk->pm.lock);
+
+	/* double check after the lock is acquired */
+	if (!mptcp_pm_should_signal(msk))
+		goto out_unlock;
+
+	if (remaining < mptcp_add_addr_len(msk->pm.local.family))
+		goto out_unlock;
+
+	*saddr = msk->pm.local;
+	WRITE_ONCE(msk->pm.addr_signal, false);
+	ret = true;
+
+out_unlock:
+	spin_unlock_bh(&msk->pm.lock);
+	return ret;
+}
+
+/* Map the local address of @skc to an address id; the lookup is fully
+ * delegated to mptcp_pm_nl_get_local_id().
+ */
+int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
+{
+	return mptcp_pm_nl_get_local_id(msk, skc);
+}
+
+/* Work handler: serve the PM events recorded in pm->status.
+ *
+ * Runs with both the msk socket lock and the PM lock held. Each pending
+ * bit is cleared before its handler is invoked. The final sock_put()
+ * pairs with the reference taken by mptcp_pm_schedule_work().
+ */
+static void pm_worker(struct work_struct *work)
+{
+	struct mptcp_pm_data *pm_data = container_of(work,
+						     struct mptcp_pm_data,
+						     work);
+	struct mptcp_sock *msk = container_of(pm_data, struct mptcp_sock, pm);
+	struct sock *sk = (struct sock *)msk;
+
+	lock_sock(sk);
+	spin_lock_bh(&msk->pm.lock);
+
+	pr_debug("msk=%p status=%x", msk, pm_data->status);
+
+	if (pm_data->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
+		pm_data->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
+		mptcp_pm_nl_add_addr_received(msk);
+	}
+
+	if (pm_data->status & BIT(MPTCP_PM_ESTABLISHED)) {
+		pm_data->status &= ~BIT(MPTCP_PM_ESTABLISHED);
+		mptcp_pm_nl_fully_established(msk);
+	}
+
+	if (pm_data->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
+		pm_data->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
+		mptcp_pm_nl_subflow_established(msk);
+	}
+
+	spin_unlock_bh(&msk->pm.lock);
+	release_sock(sk);
+	sock_put(sk);
+}
+
+/* Reset the per-msk path manager state: counters, lockless flags and
+ * pending status are zeroed, the lock and the work item initialized,
+ * then the netlink PM gets the chance to do its own setup.
+ */
+void mptcp_pm_data_init(struct mptcp_sock *msk)
+{
+	struct mptcp_pm_data *pm = &msk->pm;
+
+	pm->add_addr_signaled = 0;
+	pm->add_addr_accepted = 0;
+	pm->local_addr_used = 0;
+	pm->subflows = 0;
+	WRITE_ONCE(pm->work_pending, false);
+	WRITE_ONCE(pm->addr_signal, false);
+	WRITE_ONCE(pm->accept_addr, false);
+	WRITE_ONCE(pm->accept_subflow, false);
+	pm->status = 0;
+
+	spin_lock_init(&pm->lock);
+	INIT_WORK(&pm->work, pm_worker);
+
+	mptcp_pm_nl_data_init(msk);
+}
+
+/* Cancel the PM work, if any. When cancel_work_sync() reports that the
+ * work was pending, drop the msk reference.
+ */
+void mptcp_pm_close(struct mptcp_sock *msk)
+{
+	if (!cancel_work_sync(&msk->pm.work))
+		return;
+
+	sock_put((struct sock *)msk);
+}
+
+/* One-time path manager setup: allocate the PM workqueue (panicking on
+ * failure) and initialize the netlink PM.
+ */
+void mptcp_pm_init(void)
+{
+	struct workqueue_struct *wq;
+
+	wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
+	pm_wq = wq;
+	if (!wq)
+		panic("Failed to allocate workqueue");
+
+	mptcp_pm_nl_init();
+}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
new file mode 100644
index 000000000000..a0ce7f324499
--- /dev/null
+++ b/net/mptcp/pm_netlink.c
@@ -0,0 +1,857 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2020, Red Hat, Inc.
+ */
+
+#include <linux/inet.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+#include <net/mptcp.h>
+#include <net/genetlink.h>
+#include <uapi/linux/mptcp.h>
+
+#include "protocol.h"
+
+/* forward declaration */
+static struct genl_family mptcp_genl_family;
+
+static int pm_nl_pernet_id;
+
+struct mptcp_pm_addr_entry { /* one configured local address */
+ struct list_head list; /* linkage in pm_nl_pernet::local_addr_list */
+ unsigned int flags; /* MPTCP_PM_ADDR_FLAG_* bits (SIGNAL and SUBFLOW are tested in this file) */
+ int ifindex; /* passed to __mptcp_subflow_connect() when creating a subflow from this address */
+ struct mptcp_addr_info addr; /* the address itself; addr.id is assigned when the entry is appended */
+ struct rcu_head rcu; /* NOTE(review): presumably for RCU-deferred freeing - not used in the visible code */
+};
+
+struct pm_nl_pernet { /* netlink PM state, looked up via net_generic() with pm_nl_pernet_id */
+ /* protects pernet updates */
+ spinlock_t lock;
+ struct list_head local_addr_list; /* mptcp_pm_addr_entry items, appended with list_add_tail_rcu() */
+ unsigned int addrs; /* number of entries in the list, capped at MPTCP_PM_ADDR_MAX */
+ unsigned int add_addr_signal_max; /* bumped for each appended entry carrying the SIGNAL flag */
+ unsigned int add_addr_accept_max; /* NOTE(review): not written in the visible code - presumably a configured limit */
+ unsigned int local_addr_max; /* bumped for each appended entry carrying the SUBFLOW flag */
+ unsigned int subflows_max; /* NOTE(review): not written in the visible code - presumably a configured limit */
+ unsigned int next_id; /* id handed to the next appended address; appending fails once above 255 */
+};
+
+#define MPTCP_PM_ADDR_MAX 8
+
+/* Compare two addresses: family and address bytes must match and, when
+ * @use_port is set, the ports as well.
+ */
+static bool addresses_equal(const struct mptcp_addr_info *a,
+			    struct mptcp_addr_info *b, bool use_port)
+{
+	bool same_addr = false;
+
+	if (a->family != b->family)
+		return false;
+
+	if (a->family == AF_INET)
+		same_addr = a->addr.s_addr == b->addr.s_addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	else
+		same_addr = !ipv6_addr_cmp(&a->addr6, &b->addr6);
+#endif
+
+	return same_addr && (!use_port || a->port == b->port);
+}
+
+/* Fill @addr with the local (receive) address of @skc; the port is
+ * always reported as 0.
+ */
+static void local_address(const struct sock_common *skc,
+			  struct mptcp_addr_info *addr)
+{
+	addr->port = 0;
+	addr->family = skc->skc_family;
+
+	switch (addr->family) {
+	case AF_INET:
+		addr->addr.s_addr = skc->skc_rcv_saddr;
+		break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	case AF_INET6:
+		addr->addr6 = skc->skc_v6_rcv_saddr;
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+/* Fill @addr with the remote address and destination port of @skc. */
+static void remote_address(const struct sock_common *skc,
+			   struct mptcp_addr_info *addr)
+{
+	addr->family = skc->skc_family;
+	addr->port = skc->skc_dport;
+
+	switch (addr->family) {
+	case AF_INET:
+		addr->addr.s_addr = skc->skc_daddr;
+		break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	case AF_INET6:
+		addr->addr6 = skc->skc_v6_daddr;
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+/* Return true if any subflow on @list has @saddr as its local address;
+ * ports are not compared.
+ */
+static bool lookup_subflow_by_saddr(const struct list_head *list,
+				    struct mptcp_addr_info *saddr)
+{
+	struct mptcp_subflow_context *ctx;
+	struct mptcp_addr_info ssk_addr;
+
+	list_for_each_entry(ctx, list, node) {
+		struct sock_common *skc;
+
+		skc = (struct sock_common *)mptcp_subflow_tcp_sock(ctx);
+		local_address(skc, &ssk_addr);
+
+		if (addresses_equal(&ssk_addr, saddr, false))
+			return true;
+	}
+
+	return false;
+}
+
+/* Pick the first SUBFLOW-flagged local address of the msk family which
+ * is not yet used by any subflow on the conn or join list.
+ *
+ * The walk is done under RCU with the join list lock held; the entry is
+ * returned after both have been released. Returns NULL if nothing fits.
+ */
+static struct mptcp_pm_addr_entry *
+select_local_address(const struct pm_nl_pernet *pernet,
+		     struct mptcp_sock *msk)
+{
+	struct mptcp_pm_addr_entry *cur, *found = NULL;
+
+	rcu_read_lock();
+	spin_lock_bh(&msk->join_list_lock);
+	list_for_each_entry_rcu(cur, &pernet->local_addr_list, list) {
+		if (!(cur->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+			continue;
+
+		if (cur->addr.family != ((struct sock *)msk)->sk_family)
+			continue;
+
+		/* avoid any address already in use by subflows and
+		 * pending join
+		 */
+		if (lookup_subflow_by_saddr(&msk->conn_list, &cur->addr))
+			continue;
+		if (lookup_subflow_by_saddr(&msk->join_list, &cur->addr))
+			continue;
+
+		found = cur;
+		break;
+	}
+	spin_unlock_bh(&msk->join_list_lock);
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Return the @pos-th (0 based) SIGNAL-flagged entry of the local
+ * address list, or NULL if there are not that many.
+ */
+static struct mptcp_pm_addr_entry *
+select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
+{
+	struct mptcp_pm_addr_entry *cur, *found = NULL;
+	int seen = 0;
+
+	rcu_read_lock();
+	/* do not keep any additional per socket state, just signal
+	 * the address list in order.
+	 * Note: removal from the local address list during the msk life-cycle
+	 * can lead to additional addresses not being announced.
+	 */
+	list_for_each_entry_rcu(cur, &pernet->local_addr_list, list) {
+		if (!(cur->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+			continue;
+
+		if (seen == pos) {
+			found = cur;
+			break;
+		}
+		seen++;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Clear pm.work_pending once every address has been signaled and no
+ * more subflows can be created, either because all the local addresses
+ * have been used or because the subflow limit has been reached.
+ */
+static void check_work_pending(struct mptcp_sock *msk)
+{
+	struct mptcp_pm_data *pm = &msk->pm;
+
+	if (pm->add_addr_signaled != pm->add_addr_signal_max)
+		return;
+
+	if (pm->local_addr_used == pm->local_addr_max ||
+	    pm->subflows == pm->subflows_max)
+		WRITE_ONCE(pm->work_pending, false);
+}
+
+static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
+{ /* announce the next signal address, then try to open one more subflow; pm_worker() calls this with pm.lock held */
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *local;
+ struct mptcp_addr_info remote;
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
+ msk->pm.local_addr_used, msk->pm.local_addr_max,
+ msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max,
+ msk->pm.subflows, msk->pm.subflows_max);
+
+ /* check first for announce: add_addr_signaled doubles as the index
+ * of the next SIGNAL entry to pick from the pernet list
+ */
+ if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) {
+ local = select_signal_address(pernet,
+ msk->pm.add_addr_signaled);
+
+ if (local) {
+ msk->pm.add_addr_signaled++;
+ mptcp_pm_announce_addr(msk, &local->addr);
+ } else {
+ /* pick failed, avoid further attempts later.
+ * NOTE(review): this overwrites local_addr_used (the
+ * subflow-address counter) with add_addr_signal_max
+ * instead of marking add_addr_signaled as exhausted -
+ * confirm this is intended.
+ */
+ msk->pm.local_addr_used = msk->pm.add_addr_signal_max;
+ }
+
+ check_work_pending(msk);
+ }
+
+ /* check if should create a new subflow */
+ if (msk->pm.local_addr_used < msk->pm.local_addr_max &&
+ msk->pm.subflows < msk->pm.subflows_max) {
+ remote_address((struct sock_common *)sk, &remote);
+
+ local = select_local_address(pernet, msk);
+ if (local) {
+ msk->pm.local_addr_used++;
+ msk->pm.subflows++;
+ check_work_pending(msk);
+ spin_unlock_bh(&msk->pm.lock); /* the PM lock is dropped around the connect and retaken right after */
+ __mptcp_subflow_connect(sk, local->ifindex,
+ &local->addr, &remote); /* NOTE(review): 'local' is dereferenced after select_local_address() left its RCU section - confirm entry lifetime */
+ spin_lock_bh(&msk->pm.lock);
+ return;
+ }
+
+ /* lookup failed, avoid further attempts later */
+ msk->pm.local_addr_used = msk->pm.local_addr_max;
+ check_work_pending(msk);
+ }
+}
+
+/* MPTCP_PM_ESTABLISHED handler, invoked by the PM worker. */
+void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
+{
+	mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+/* MPTCP_PM_SUBFLOW_ESTABLISHED handler, invoked by the PM worker. */
+void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
+{
+	mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+/* MPTCP_PM_ADD_ADDR_RECEIVED handler: account for the accepted address
+ * and the new subflow, stop accepting further addresses once a limit is
+ * hit, then connect to the announced address.
+ *
+ * The PM lock is dropped around the connect and retaken before
+ * returning.
+ */
+void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
+{
+	struct sock *sk = (struct sock *)msk;
+	struct mptcp_pm_data *pm = &msk->pm;
+	struct mptcp_addr_info peer;
+	struct mptcp_addr_info src;
+
+	pr_debug("accepted %d:%d remote family %d",
+		 pm->add_addr_accepted, pm->add_addr_accept_max,
+		 pm->remote.family);
+
+	pm->add_addr_accepted++;
+	pm->subflows++;
+	if (pm->add_addr_accepted >= pm->add_addr_accept_max ||
+	    pm->subflows >= pm->subflows_max)
+		WRITE_ONCE(pm->accept_addr, false);
+
+	/* connect to the specified remote address, using whatever
+	 * local address the routing configuration will pick.
+	 */
+	peer = pm->remote;
+	if (!peer.port)
+		peer.port = sk->sk_dport;
+
+	memset(&src, 0, sizeof(src));
+	src.family = peer.family;
+
+	spin_unlock_bh(&pm->lock);
+	__mptcp_subflow_connect(sk, 0, &src, &peer);
+	spin_lock_bh(&pm->lock);
+}
+
+/* True when @entry is flagged SIGNAL but not SUBFLOW. */
+static bool address_use_port(struct mptcp_pm_addr_entry *entry)
+{
+	const unsigned int mask = MPTCP_PM_ADDR_FLAG_SIGNAL |
+				  MPTCP_PM_ADDR_FLAG_SUBFLOW;
+
+	return (entry->flags & mask) == MPTCP_PM_ADDR_FLAG_SIGNAL;
+}
+
+/* Append @entry to the per-netns local address list and assign it the next
+ * address ID.
+ *
+ * Returns the assigned ID on success. Returns -EINVAL, leaving @entry
+ * unlinked, when the ID space (next_id > 255) or MPTCP_PM_ADDR_MAX is
+ * exhausted, or when an equal address is already listed.
+ *
+ * Takes pernet->lock itself. On failure the caller still owns @entry.
+ */
+static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_pm_addr_entry *cur;
+ int ret = -EINVAL;
+
+ spin_lock_bh(&pernet->lock);
+ /* to keep the code simple, don't do IDR-like allocation for address ID,
+ * just bail when we exceed limits
+ */
+ if (pernet->next_id > 255)
+ goto out;
+ if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
+ goto out;
+
+ /* do not insert a duplicate address; the last argument is true only
+ * when both the new and the existing entry are signal-only addresses
+ * (presumably enabling the port comparison - see addresses_equal())
+ */
+ list_for_each_entry(cur, &pernet->local_addr_list, list) {
+ if (addresses_equal(&cur->addr, &entry->addr,
+ address_use_port(entry) &&
+ address_use_port(cur)))
+ goto out;
+ }
+
+ /* per-netns maxima track how many entries carry each flag */
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ pernet->add_addr_signal_max++;
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ pernet->local_addr_max++;
+
+ /* IDs only ever grow, so the list stays sorted by increasing ID */
+ entry->addr.id = pernet->next_id++;
+ pernet->addrs++;
+ list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
+ ret = entry->addr.id;
+
+out:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+/* Map the local address of the socket @skc to an MPTCP address ID.
+ *
+ * Returns 0 when the address matches the msk's own local address, the ID
+ * of the matching entry in the per-netns list when one exists, or the ID
+ * of a freshly appended entry otherwise. Returns a negative value on
+ * failure: -1 for a NULL @msk, -ENOMEM on allocation failure, or the
+ * error from mptcp_pm_nl_append_new_local_addr().
+ */
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
+{
+	struct mptcp_pm_addr_entry *entry;
+	struct mptcp_addr_info skc_local;
+	struct mptcp_addr_info msk_local;
+	struct pm_nl_pernet *pernet;
+	int ret = -1;
+
+	if (WARN_ON_ONCE(!msk))
+		return -1;
+
+	/* The 0 ID mapping is defined by the first subflow, copied into the msk
+	 * addr
+	 */
+	local_address((struct sock_common *)msk, &msk_local);
+	/* read the address of @skc itself: reading it from the msk again
+	 * made the comparison below always true and the lookup unreachable
+	 */
+	local_address(skc, &skc_local);
+	if (addresses_equal(&msk_local, &skc_local, false))
+		return 0;
+
+	pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+		if (addresses_equal(&entry->addr, &skc_local, false)) {
+			ret = entry->addr.id;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (ret >= 0)
+		return ret;
+
+	/* address not found, add to local list */
+	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->flags = 0;
+	/* kmalloc() does not zero memory; ifindex is later reported to
+	 * user space by the get/dump commands, so it must not be garbage
+	 */
+	entry->ifindex = 0;
+	entry->addr = skc_local;
+	ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
+	if (ret < 0)
+		kfree(entry);
+
+	return ret;
+}
+
+/* Seed the per-socket PM limits of @msk from the per-netns configuration
+ * and derive the initial PM state flags from them:
+ * - work_pending: local addresses may be used for subflows, or there are
+ *   addresses to signal;
+ * - accept_addr: remote addresses may be accepted and subflows are allowed;
+ * - accept_subflow: at least one additional subflow is allowed.
+ */
+void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ struct pm_nl_pernet *pernet;
+ bool subflows;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ /* snapshot the netns limits; they are read here without pernet->lock */
+ pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max);
+ pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max);
+ pm->local_addr_max = READ_ONCE(pernet->local_addr_max);
+ pm->subflows_max = READ_ONCE(pernet->subflows_max);
+ subflows = !!pm->subflows_max;
+ WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) ||
+ !!pm->add_addr_signal_max);
+ WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows);
+ WRITE_ONCE(pm->accept_subflow, subflows);
+}
+
+/* index of the only multicast group in mptcp_pm_mcgrps[] */
+#define MPTCP_PM_CMD_GRP_OFFSET 0
+
+/* generic netlink multicast groups registered with the MPTCP PM family */
+static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
+ [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, },
+};
+
+/* validation policy for the attributes nested inside MPTCP_PM_ATTR_ADDR;
+ * the IPv6 address must be exactly sizeof(struct in6_addr) bytes long
+ */
+static const struct nla_policy
+mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
+ [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
+ [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
+ [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
+ [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr), },
+ [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
+ [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
+ [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
+};
+
+/* top-level attribute policy of the MPTCP PM family: a nested address
+ * description plus the two u32 limits handled by the limits commands
+ */
+static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
+ [MPTCP_PM_ATTR_ADDR] =
+ NLA_POLICY_NESTED(mptcp_pm_addr_policy),
+ [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
+};
+
+/* Pick the netlink attribute type carrying the address for @family:
+ * the IPv6 attribute for AF_INET6 (only when MPTCP IPv6 support is built),
+ * the IPv4 attribute for anything else.
+ */
+static int mptcp_pm_family_to_addr(int family)
+{
+	int attr_type = MPTCP_PM_ADDR_ATTR_ADDR4;
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	if (family == AF_INET6)
+		attr_type = MPTCP_PM_ADDR_ATTR_ADDR6;
+#endif
+	return attr_type;
+}
+
+/* Parse the nested address attribute @attr into @entry.
+ *
+ * @entry is zeroed once the nested attributes have been parsed, then
+ * filled from the attributes that are present. When the family attribute
+ * is missing and @require_family is false, the family and address are
+ * left zeroed and only the interface index, ID and flags are read.
+ *
+ * Returns 0 on success, the nla parsing error, or -EINVAL (with an
+ * extack message) when @attr is NULL, a required family is missing, the
+ * family is unsupported, or the address for the family is missing.
+ */
+static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
+ bool require_family,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
+ int err, addr_addr;
+
+ if (!attr) {
+ GENL_SET_ERR_MSG(info, "missing address info");
+ return -EINVAL;
+ }
+
+ /* no validation needed - was already done via nested policy */
+ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
+ mptcp_pm_addr_policy, info->extack);
+ if (err)
+ return err;
+
+ memset(entry, 0, sizeof(*entry));
+ if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) {
+ if (!require_family)
+ goto skip_family;
+
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing family");
+ return -EINVAL;
+ }
+
+ entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]);
+ /* AF_INET6 is accepted only when MPTCP IPv6 support is built in */
+ if (entry->addr.family != AF_INET
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ && entry->addr.family != AF_INET6
+#endif
+ ) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "unknown address family");
+ return -EINVAL;
+ }
+ /* the address attribute matching the family is mandatory */
+ addr_addr = mptcp_pm_family_to_addr(entry->addr.family);
+ if (!tb[addr_addr]) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing address data");
+ return -EINVAL;
+ }
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (entry->addr.family == AF_INET6)
+ entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]);
+ else
+#endif
+ entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);
+
+skip_family:
+ /* optional attributes; absent ones keep the zero from the memset */
+ if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX])
+ entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_ID])
+ entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
+ entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+
+ return 0;
+}
+
+/* return the per-netns PM data of the namespace the request @info refers to */
+static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
+{
+ return net_generic(genl_info_net(info), pm_nl_pernet_id);
+}
+
+/* MPTCP_PM_CMD_ADD_ADDR handler: parse the address attribute (family is
+ * mandatory), copy it into a newly allocated entry and append that entry
+ * to the per-netns local address list.
+ *
+ * Returns 0 on success, the parse error, -ENOMEM, or the error from
+ * mptcp_pm_nl_append_new_local_addr(). The ID assigned on success is not
+ * reported back to the caller.
+ */
+static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, true, &addr);
+ if (ret < 0)
+ return ret;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "can't allocate addr");
+ return -ENOMEM;
+ }
+
+ /* the parsed entry was zeroed first, so the copy is fully initialized */
+ *entry = addr;
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG(info, "too many addresses or duplicate one");
+ /* not linked into the list, still owned here */
+ kfree(entry);
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Find the local address entry whose ID equals @id, or NULL when there is
+ * none. Plain (non-RCU) list walk: the callers in this file hold
+ * pernet->lock around it.
+ */
+static struct mptcp_pm_addr_entry *
+__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
+{
+	struct mptcp_pm_addr_entry *cur, *found = NULL;
+
+	list_for_each_entry(cur, &pernet->local_addr_list, list) {
+		if (cur->addr.id != id)
+			continue;
+
+		found = cur;
+		break;
+	}
+
+	return found;
+}
+
+/* MPTCP_PM_CMD_DEL_ADDR handler: remove the local address whose ID matches
+ * the ID in the request. Only the parsed ID is used for the lookup; the
+ * family and address are optional and ignored.
+ *
+ * Returns 0 on success, the parse error, or -EINVAL when no entry has
+ * that ID.
+ */
+static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ ret = -EINVAL;
+ goto out;
+ }
+ /* undo the accounting done when the entry was appended */
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ pernet->add_addr_signal_max--;
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ pernet->local_addr_max--;
+
+ pernet->addrs--;
+ /* the list is also walked under rcu_read_lock(), so defer the free */
+ list_del_rcu(&entry->list);
+ kfree_rcu(entry, rcu);
+out:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+/* Unlink every entry of the local address list and free each one with
+ * kfree_rcu(). Takes no lock itself; the flush command calls it under
+ * pernet->lock, netns exit calls it without.
+ */
+static void __flush_addrs(struct pm_nl_pernet *pernet)
+{
+ while (!list_empty(&pernet->local_addr_list)) {
+ struct mptcp_pm_addr_entry *cur;
+
+ /* always pop the current head until the list is empty */
+ cur = list_entry(pernet->local_addr_list.next,
+ struct mptcp_pm_addr_entry, list);
+ list_del_rcu(&cur->list);
+ kfree_rcu(cur, rcu);
+ }
+}
+
+/* Zero the per-netns address counters and limits. subflows_max and
+ * next_id are not touched here.
+ */
+static void __reset_counters(struct pm_nl_pernet *pernet)
+{
+ pernet->add_addr_signal_max = 0;
+ pernet->add_addr_accept_max = 0;
+ pernet->local_addr_max = 0;
+ pernet->addrs = 0;
+}
+
+/* MPTCP_PM_CMD_FLUSH_ADDRS handler: drop all local addresses and reset the
+ * related counters under pernet->lock. Always returns 0.
+ * Note that __reset_counters() also clears add_addr_accept_max.
+ */
+static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+
+ spin_lock_bh(&pernet->lock);
+ __flush_addrs(pernet);
+ __reset_counters(pernet);
+ spin_unlock_bh(&pernet->lock);
+ return 0;
+}
+
+/* Serialize @entry into @skb as a nested MPTCP_PM_ATTR_ADDR attribute.
+ *
+ * Family, ID and flags are always emitted; the interface index only when
+ * non-zero; the address only for AF_INET and, when MPTCP IPv6 support is
+ * built, AF_INET6.
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb has no room left. On failure
+ * the partially built nest is cancelled, so @skb carries no fragment of it.
+ */
+static int mptcp_nl_fill_addr(struct sk_buff *skb,
+			      struct mptcp_pm_addr_entry *entry)
+{
+	struct mptcp_addr_info *addr = &entry->addr;
+	struct nlattr *attr;
+
+	attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR);
+	if (!attr)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
+		goto nla_put_failure;
+	if (entry->ifindex &&
+	    nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
+		goto nla_put_failure;
+
+	/* the address puts can run out of room like any other attribute:
+	 * check them too, instead of silently emitting an address-less nest
+	 */
+	if (addr->family == AF_INET &&
+	    nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4,
+			    addr->addr.s_addr))
+		goto nla_put_failure;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+	else if (addr->family == AF_INET6 &&
+		 nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6))
+		goto nla_put_failure;
+#endif
+	nla_nest_end(skb, attr);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, attr);
+	return -EMSGSIZE;
+}
+
+/* MPTCP_PM_CMD_GET_ADDR doit handler: look up the local address with the
+ * requested ID and send it back in a reply message.
+ *
+ * Returns the genlmsg_reply() result on success, or the parse error,
+ * -ENOMEM, -EMSGSIZE, or -EINVAL (unknown ID). The reply skb is freed on
+ * every failure path before it is sent.
+ */
+static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct sk_buff *msg;
+ void *reply;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ /* allocate and start the reply before taking the spinlock */
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply) {
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ ret = -EMSGSIZE;
+ goto fail;
+ }
+
+ /* the entry is only guaranteed to stay valid while the lock is held */
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ ret = -EINVAL;
+ goto unlock_fail;
+ }
+
+ ret = mptcp_nl_fill_addr(msg, entry);
+ if (ret)
+ goto unlock_fail;
+
+ genlmsg_end(msg, reply);
+ /* note: the reply is sent while pernet->lock is still held */
+ ret = genlmsg_reply(msg, info);
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+
+unlock_fail:
+ spin_unlock_bh(&pernet->lock);
+
+fail:
+ nlmsg_free(msg);
+ return ret;
+}
+
+/* MPTCP_PM_CMD_GET_ADDR dumpit handler: emit one message per local address.
+ *
+ * cb->args[0] carries the ID of the last address dumped so far. Entries
+ * with an ID not greater than it are skipped, so a dump that ran out of
+ * room resumes from the next address on the following call. Returns
+ * msg->len.
+ */
+static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(msg->sk);
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ int id = cb->args[0];
+ void *hdr;
+
+ pernet = net_generic(net, pm_nl_pernet_id);
+
+ spin_lock_bh(&pernet->lock);
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ /* already dumped by a previous invocation */
+ if (entry->addr.id <= id)
+ continue;
+
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &mptcp_genl_family,
+ NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
+ if (!hdr)
+ break;
+
+ /* out of room: drop the partial message and stop here */
+ if (mptcp_nl_fill_addr(msg, entry) < 0) {
+ genlmsg_cancel(msg, hdr);
+ break;
+ }
+
+ id = entry->addr.id;
+ genlmsg_end(msg, hdr);
+ }
+ spin_unlock_bh(&pernet->lock);
+
+ /* remember where to resume */
+ cb->args[0] = id;
+ return msg->len;
+}
+
+/* Read the optional u32 limit attribute @id from the request into @limit.
+ *
+ * When the attribute is absent, @limit is left untouched and 0 is returned.
+ * When present, @limit is overwritten; -EINVAL is returned (with an extack
+ * message) if the value exceeds MPTCP_PM_ADDR_MAX, 0 otherwise.
+ */
+static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
+{
+	struct nlattr *attr = info->attrs[id];
+
+	if (attr) {
+		*limit = nla_get_u32(attr);
+		if (*limit > MPTCP_PM_ADDR_MAX) {
+			GENL_SET_ERR_MSG(info, "limit greater than maximum");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* MPTCP_PM_CMD_SET_LIMITS handler: update the per-netns maxima for
+ * accepted remote addresses and for subflows.
+ *
+ * Each limit starts from its current value, so an attribute missing from
+ * the request leaves that limit unchanged. Neither limit is written
+ * unless both parse successfully. Returns 0 or -EINVAL.
+ */
+static int
+mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ unsigned int rcv_addrs, subflows;
+ int ret;
+
+ spin_lock_bh(&pernet->lock);
+ rcv_addrs = pernet->add_addr_accept_max;
+ ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
+ if (ret)
+ goto unlock;
+
+ subflows = pernet->subflows_max;
+ ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
+ if (ret)
+ goto unlock;
+
+ /* pairs with the lockless READ_ONCE() readers of these fields */
+ WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
+ WRITE_ONCE(pernet->subflows_max, subflows);
+
+unlock:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+/* MPTCP_PM_CMD_GET_LIMITS handler: reply with the current per-netns maxima
+ * for accepted remote addresses and for subflows. Both values are read
+ * locklessly.
+ *
+ * Returns the genlmsg_reply() result, -ENOMEM, or -EMSGSIZE when the
+ * reply cannot be built.
+ */
+static int
+mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct sk_buff *msg;
+ void *reply;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ MPTCP_PM_CMD_GET_LIMITS);
+ if (!reply)
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
+ READ_ONCE(pernet->add_addr_accept_max)))
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
+ READ_ONCE(pernet->subflows_max)))
+ goto fail;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+fail:
+ /* every failure after allocation is reported as lack of space */
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+/* Command table of the MPTCP PM generic netlink family. Commands that
+ * change state carry GENL_ADMIN_PERM; the two GET commands carry no flags.
+ */
+static struct genl_ops mptcp_pm_ops[] = {
+ {
+ .cmd = MPTCP_PM_CMD_ADD_ADDR,
+ .doit = mptcp_nl_cmd_add_addr,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_DEL_ADDR,
+ .doit = mptcp_nl_cmd_del_addr,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
+ .doit = mptcp_nl_cmd_flush_addrs,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ /* single lookup by ID (doit) or full list (dumpit) */
+ .cmd = MPTCP_PM_CMD_GET_ADDR,
+ .doit = mptcp_nl_cmd_get_addr,
+ .dumpit = mptcp_nl_cmd_dump_addrs,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SET_LIMITS,
+ .doit = mptcp_nl_cmd_set_limits,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_LIMITS,
+ .doit = mptcp_nl_cmd_get_limits,
+ },
+};
+
+/* Generic netlink family descriptor for the MPTCP path manager; it ties
+ * together the attribute policy, the command table and the multicast
+ * groups defined in this file, and is usable from any network namespace
+ * (.netnsok).
+ */
+static struct genl_family mptcp_genl_family __ro_after_init = {
+ .name = MPTCP_PM_NAME,
+ .version = MPTCP_PM_VER,
+ .maxattr = MPTCP_PM_ATTR_MAX,
+ .policy = mptcp_pm_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = mptcp_pm_ops,
+ .n_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .mcgrps = mptcp_pm_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
+};
+
+/* Per-netns init: start with an empty local address list, zeroed counters
+ * and the first assignable address ID set to 1 (ID 0 is the mapping
+ * defined by the first subflow). Always returns 0.
+ */
+static int __net_init pm_nl_init_net(struct net *net)
+{
+ struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id);
+
+ INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
+ __reset_counters(pernet);
+ pernet->next_id = 1;
+ spin_lock_init(&pernet->lock);
+ return 0;
+}
+
+/* Batched per-netns exit: free the local address list of every namespace
+ * on @net_list. pernet->lock is deliberately not taken, see below.
+ */
+static void __net_exit pm_nl_exit_net(struct list_head *net_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_list, exit_list) {
+ /* net is removed from namespace list, can't race with
+ * other modifiers
+ */
+ __flush_addrs(net_generic(net, pm_nl_pernet_id));
+ }
+}
+
+/* per-netns registration: one struct pm_nl_pernet per namespace, reachable
+ * through net_generic() with pm_nl_pernet_id
+ */
+static struct pernet_operations mptcp_pm_pernet_ops = {
+ .init = pm_nl_init_net,
+ .exit_batch = pm_nl_exit_net,
+ .id = &pm_nl_pernet_id,
+ .size = sizeof(struct pm_nl_pernet),
+};
+
+/* Register the per-netns PM data and the generic netlink family.
+ * Either registration failing is treated as fatal: the kernel panics.
+ */
+void mptcp_pm_nl_init(void)
+{
+ if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
+ panic("Failed to register MPTCP PM pernet subsystem.\n");
+
+ if (genl_register_family(&mptcp_genl_family))
+ panic("Failed to register MPTCP PM netlink family\n");
+}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 73780b4cb108..1833bc1f4a43 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,6 +21,7 @@
#endif
#include <net/mptcp.h>
#include "protocol.h"
+#include "mib.h"
#define MPTCP_SAME_STATE TCP_MAX_STATES
@@ -31,6 +32,14 @@ struct mptcp6_sock {
};
#endif
+/* MPTCP private data stored in skb->cb: @offset is set when an skb is moved
+ * to the msk receive queue and advanced by recvmsg on partial reads, i.e.
+ * it is the offset of the first not yet consumed byte inside the skb
+ */
+struct mptcp_skb_cb {
+ u32 offset;
+};
+
+/* view the control buffer of @__skb as a struct mptcp_skb_cb */
+#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
+
+static struct percpu_counter mptcp_sockets_allocated;
+
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
@@ -98,17 +107,195 @@ set_state:
return ssock;
}
-static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
+static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
+ struct sk_buff *skb,
+ unsigned int offset, size_t copy_len)
{
- struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
- sock_owned_by_me((const struct sock *)msk);
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+ skb_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
- mptcp_for_each_subflow(msk, subflow) {
- return mptcp_subflow_tcp_sock(subflow);
+ msk->ack_seq += copy_len;
+ MPTCP_SKB_CB(skb)->offset = offset;
+}
+
+static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
+ struct sock *ssk,
+ unsigned int *bytes)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = (struct sock *)msk;
+ unsigned int moved = 0;
+ bool more_data_avail;
+ struct tcp_sock *tp;
+ bool done = false;
+
+ if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
+ int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
+
+ if (rcvbuf > sk->sk_rcvbuf)
+ sk->sk_rcvbuf = rcvbuf;
}
- return NULL;
+ tp = tcp_sk(ssk);
+ do {
+ u32 map_remaining, offset;
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ bool fin;
+
+ /* try to move as much data as available */
+ map_remaining = subflow->map_data_len -
+ mptcp_subflow_get_map_offset(subflow);
+
+ skb = skb_peek(&ssk->sk_receive_queue);
+ if (!skb)
+ break;
+
+ offset = seq - TCP_SKB_CB(skb)->seq;
+ fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
+ if (fin) {
+ done = true;
+ seq++;
+ }
+
+ if (offset < skb->len) {
+ size_t len = skb->len - offset;
+
+ if (tp->urg_data)
+ done = true;
+
+ __mptcp_move_skb(msk, ssk, skb, offset, len);
+ seq += len;
+ moved += len;
+
+ if (WARN_ON_ONCE(map_remaining < len))
+ break;
+ } else {
+ WARN_ON_ONCE(!fin);
+ sk_eat_skb(ssk, skb);
+ done = true;
+ }
+
+ WRITE_ONCE(tp->copied_seq, seq);
+ more_data_avail = mptcp_subflow_data_available(ssk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf)) {
+ done = true;
+ break;
+ }
+ } while (more_data_avail);
+
+ *bytes = moved;
+
+ return done;
+}
+
+/* In most cases we will be able to lock the mptcp socket. If it's already
+ * owned, we need to defer to the work queue to avoid ABBA deadlock.
+ *
+ * Try to move pending skbs from the subflow @ssk to @msk without sleeping.
+ * Returns true only if some data was actually moved; false when the msk
+ * is owned, the spinlock is contended, or nothing was moved.
+ */
+static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
+{
+ struct sock *sk = (struct sock *)msk;
+ unsigned int moved = 0;
+
+ /* cheap lockless check first */
+ if (READ_ONCE(sk->sk_lock.owned))
+ return false;
+
+ /* never spin here: give up if the msk spinlock is contended */
+ if (unlikely(!spin_trylock_bh(&sk->sk_lock.slock)))
+ return false;
+
+ /* must re-check after taking the lock */
+ if (!READ_ONCE(sk->sk_lock.owned))
+ __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+
+ spin_unlock_bh(&sk->sk_lock.slock);
+
+ return moved > 0;
+}
+
+/* Called for the MPTCP socket @sk when data is available on the subflow
+ * @ssk: flag the msk as having data, try to move the data into the msk
+ * receive queue right away and wake up the reader via sk_data_ready().
+ *
+ * If the data could not be moved and the msk is not over its receive
+ * buffer limit, the TCP_DELACK_TIMER_DEFERRED bit is set so that the move
+ * is retried later; a reference on @sk is taken when the bit was newly
+ * set.
+ */
+void mptcp_data_ready(struct sock *sk, struct sock *ssk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+
+ /* fast path: room in the receive buffer and the move succeeded */
+ if (atomic_read(&sk->sk_rmem_alloc) < READ_ONCE(sk->sk_rcvbuf) &&
+ move_skbs_to_msk(msk, ssk))
+ goto wake;
+
+ /* don't schedule if mptcp sk is (still) over limit */
+ if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
+ goto wake;
+
+ /* mptcp socket is owned, release_cb should retry */
+ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
+ &sk->sk_tsq_flags)) {
+ sock_hold(sk);
+
+ /* need to try again, it's possible release_cb() has already
+ * been called after the test_and_set_bit() above.
+ */
+ move_skbs_to_msk(msk, ssk);
+ }
+wake:
+ sk->sk_data_ready(sk);
+}
+
+/* Move every subflow queued on msk->join_list to the tail of
+ * msk->conn_list. The emptiness check is done without the lock; the
+ * splice itself runs under join_list_lock.
+ */
+static void __mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+ if (likely(list_empty(&msk->join_list)))
+ return;
+
+ spin_lock_bh(&msk->join_list_lock);
+ list_splice_tail_init(&msk->join_list, &msk->conn_list);
+ spin_unlock_bh(&msk->join_list_lock);
+}
+
+/* Update the MPTCP-level timer interval of @sk.
+ *
+ * When @ssk has a pending icsk timer, the time left until it expires is
+ * used. Otherwise, or if that time is not positive, the current
+ * timer_ival is kept. A non-positive result falls back to TCP_RTO_MIN.
+ */
+static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
+{
+ long tout = ssk && inet_csk(ssk)->icsk_pending ?
+ inet_csk(ssk)->icsk_timeout - jiffies : 0;
+
+ if (tout <= 0)
+ tout = mptcp_sk(sk)->timer_ival;
+ mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
+}
+
+/* tell whether the retransmit timer of @sk (the icsk_retransmit_timer of
+ * its inet_connection_sock) is currently pending
+ */
+static bool mptcp_timer_pending(struct sock *sk)
+{
+ return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
+}
+
+/* (Re)arm the retransmit timer of @sk to fire timer_ival jiffies from now.
+ * A zero interval is unexpected: it triggers a one-time warning and
+ * TCP_RTO_MIN is used instead.
+ */
+static void mptcp_reset_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long tout;
+
+ /* should never be called with mptcp level timer cleared */
+ tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
+ if (WARN_ON_ONCE(!tout))
+ tout = TCP_RTO_MIN;
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
+}
+
+/* Restart the MPTCP-level timer of @sk and, when @sk is currently not
+ * writeable, schedule the msk worker.
+ * NOTE(review): presumably called when new data is acknowledged at the
+ * MPTCP level - confirm against the caller.
+ */
+void mptcp_data_acked(struct sock *sk)
+{
+ mptcp_reset_timer(sk);
+
+ /* take a reference on @sk only when schedule_work() returns true */
+ if (!sk_stream_is_writeable(sk) &&
+ schedule_work(&mptcp_sk(sk)->work))
+ sock_hold(sk);
+}
+
+static void mptcp_stop_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
}
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
@@ -134,41 +321,149 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
return NULL;
}
-static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
- const struct sk_buff *skb,
- const struct mptcp_ext *mpext)
+static bool mptcp_skb_can_collapse_to(u64 write_seq,
+ const struct sk_buff *skb,
+ const struct mptcp_ext *mpext)
{
if (!tcp_skb_can_collapse_to(skb))
return false;
/* can collapse only if MPTCP level sequence is in order */
- return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
+ return mpext && mpext->data_seq + mpext->data_len == write_seq;
+}
+
+/* New data can be appended to the existing data fragment @df only if @df
+ * lives in the page currently used by @pfrag and ends exactly at the
+ * current MPTCP-level write sequence. A NULL @df never matches.
+ */
+static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
+ const struct page_frag *pfrag,
+ const struct mptcp_data_frag *df)
+{
+ return df && pfrag->page == df->page &&
+ df->data_seq + df->data_len == msk->write_seq;
+}
+
+/* give back @len bytes of memory accounting to @sk and subtract them
+ * from its queued write memory
+ */
+static void dfrag_uncharge(struct sock *sk, int len)
+{
+ sk_mem_uncharge(sk, len);
+ sk_wmem_queued_add(sk, -len);
+}
+
+/* Unlink @dfrag from its list, uncharge its payload plus header overhead
+ * from @sk and drop the page reference held for it.
+ */
+static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
+{
+ int len = dfrag->data_len + dfrag->overhead;
+
+ list_del(&dfrag->list);
+ dfrag_uncharge(sk, len);
+ /* the dfrag header itself lives inside this page */
+ put_page(dfrag->page);
+}
+
+/* Release the data on the MPTCP-level retransmission queue of @sk that is
+ * covered by msk->snd_una.
+ *
+ * Fragments ending at or before snd_una are dropped entirely. If snd_una
+ * falls inside the fragment left at the head of the queue, its acked
+ * prefix is trimmed and uncharged. When anything was released, memory is
+ * reclaimed and, if a subflow has send space, writers are woken up.
+ */
+static void mptcp_clean_una(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_data_frag *dtmp, *dfrag;
+	u64 snd_una = atomic64_read(&msk->snd_una);
+	bool cleaned = false;
+
+	/* drop every fragment that does not extend past snd_una */
+	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
+		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
+			break;
+
+		dfrag_clear(sk, dfrag);
+		cleaned = true;
+	}
+
+	dfrag = mptcp_rtx_head(sk);
+	if (dfrag && after64(snd_una, dfrag->data_seq)) {
+		/* bytes of this fragment already acked: the loop above
+		 * guarantees the fragment ends after snd_una, so delta is
+		 * strictly smaller than data_len
+		 */
+		u64 delta = snd_una - dfrag->data_seq;
+
+		/* trim the acked prefix: the fragment now starts at snd_una,
+		 * both in sequence space and inside the backing page
+		 */
+		dfrag->data_seq += delta;
+		dfrag->offset += delta;
+		dfrag->data_len -= delta;
+
+		dfrag_uncharge(sk, delta);
+		cleaned = true;
+	}
+
+	if (cleaned) {
+		sk_mem_reclaim_partial(sk);
+
+		/* Only wake up writers if a subflow is ready */
+		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
+			sk_stream_write_space(sk);
+	}
+}
+
+/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
+ * data
+ *
+ * Returns true when @pfrag has room for a struct mptcp_data_frag plus 32
+ * bytes. On failure the protocol's memory-pressure handler is invoked,
+ * the send buffer of @sk is moderated and false is returned.
+ */
+static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
+ pfrag, sk->sk_allocation)))
+ return true;
+
+ sk->sk_prot->enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
+}
+
+static struct mptcp_data_frag *
+mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
+ int orig_offset)
+{
+ int offset = ALIGN(orig_offset, sizeof(long));
+ struct mptcp_data_frag *dfrag;
+
+ dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
+ dfrag->data_len = 0;
+ dfrag->data_seq = msk->write_seq;
+ dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
+ dfrag->offset = offset + sizeof(struct mptcp_data_frag);
+ dfrag->page = pfrag->page;
+
+ return dfrag;
}
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
- struct msghdr *msg, long *timeo, int *pmss_now,
+ struct msghdr *msg, struct mptcp_data_frag *dfrag,
+ long *timeo, int *pmss_now,
int *ps_goal)
{
- int mss_now, avail_size, size_goal, ret;
+ int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
+ bool dfrag_collapsed, can_collapse = false;
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_ext *mpext = NULL;
+ bool retransmission = !!dfrag;
struct sk_buff *skb, *tail;
- bool can_collapse = false;
struct page_frag *pfrag;
+ struct page *page;
+ u64 *write_seq;
size_t psize;
/* use the mptcp page cache so that we can easily move the data
* from one substream to another, but do per subflow memory accounting
+ * Note: pfrag is used only when !retransmission, but the compiler is
+ * fooled into a warning if we don't init here
*/
pfrag = sk_page_frag(sk);
- while (!sk_page_frag_refill(ssk, pfrag) ||
+ while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
!mptcp_ext_cache_refill(msk)) {
ret = sk_stream_wait_memory(ssk, timeo);
if (ret)
return ret;
+
+ /* if sk_stream_wait_memory() sleeps snd_una can change
+ * significantly, refresh the rtx queue
+ */
+ mptcp_clean_una(sk);
+
if (unlikely(__mptcp_needs_tcp_fallback(msk)))
return 0;
}
+ if (!retransmission) {
+ write_seq = &msk->write_seq;
+ page = pfrag->page;
+ } else {
+ write_seq = &dfrag->data_seq;
+ page = dfrag->page;
+ }
/* compute copy limit */
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
@@ -186,32 +481,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* SSN association set here
*/
can_collapse = (size_goal - skb->len > 0) &&
- mptcp_skb_can_collapse_to(msk, skb, mpext);
+ mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
if (!can_collapse)
TCP_SKB_CB(skb)->eor = 1;
else
avail_size = size_goal - skb->len;
}
- psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
-
- /* Copy to page */
- pr_debug("left=%zu", msg_data_left(msg));
- psize = copy_page_from_iter(pfrag->page, pfrag->offset,
- min_t(size_t, msg_data_left(msg), psize),
- &msg->msg_iter);
- pr_debug("left=%zu", msg_data_left(msg));
- if (!psize)
- return -EINVAL;
+
+ if (!retransmission) {
+ /* reuse tail pfrag, if possible, or carve a new one from the
+ * page allocator
+ */
+ dfrag = mptcp_rtx_tail(sk);
+ offset = pfrag->offset;
+ dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+ if (!dfrag_collapsed) {
+ dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
+ offset = dfrag->offset;
+ frag_truesize = dfrag->overhead;
+ }
+ psize = min_t(size_t, pfrag->size - offset, avail_size);
+
+ /* Copy to page */
+ pr_debug("left=%zu", msg_data_left(msg));
+ psize = copy_page_from_iter(pfrag->page, offset,
+ min_t(size_t, msg_data_left(msg),
+ psize),
+ &msg->msg_iter);
+ pr_debug("left=%zu", msg_data_left(msg));
+ if (!psize)
+ return -EINVAL;
+
+ if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
+ return -ENOMEM;
+ } else {
+ offset = dfrag->offset;
+ psize = min_t(size_t, dfrag->data_len, avail_size);
+ }
/* tell the TCP stack to delay the push so that we can safely
* access the skb after the sendpages call
*/
- ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
+ ret = do_tcp_sendpages(ssk, page, offset, psize,
msg->msg_flags | MSG_SENDPAGE_NOTLAST);
if (ret <= 0)
return ret;
- if (unlikely(ret < psize))
- iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ frag_truesize += ret;
+ if (!retransmission) {
+ if (unlikely(ret < psize))
+ iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ /* send successful, keep track of sent data for mptcp-level
+ * retransmission
+ */
+ dfrag->data_len += ret;
+ if (!dfrag_collapsed) {
+ get_page(dfrag->page);
+ list_add_tail(&dfrag->list, &msk->rtx_queue);
+ sk_wmem_queued_add(sk, frag_truesize);
+ } else {
+ sk_wmem_queued_add(sk, ret);
+ }
+
+ /* charge data on mptcp rtx queue to the master socket
+ * Note: we charge such data both to sk and ssk
+ */
+ sk->sk_forward_alloc -= frag_truesize;
+ }
/* if the tail skb extension is still the cached one, collapsing
* really happened. Note: we can't check for 'same skb' as the sk_buff
@@ -230,7 +567,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
msk->cached_ext = NULL;
memset(mpext, 0, sizeof(*mpext));
- mpext->data_seq = msk->write_seq;
+ mpext->data_seq = *write_seq;
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
mpext->data_len = ret;
mpext->use_map = 1;
@@ -241,13 +578,51 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext->dsn64);
out:
- pfrag->offset += ret;
- msk->write_seq += ret;
+ if (!retransmission)
+ pfrag->offset += frag_truesize;
+ *write_seq += ret;
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
return ret;
}
+/* Pick the subflow socket to transmit on.
+ *
+ * Walks the subflows of @msk in list order and returns the first
+ * non-backup subflow. If only backup subflows exist, the first of them is
+ * returned. As soon as a subflow without free send memory is met, the
+ * walk stops and NULL is returned, after arranging for a write-space
+ * notification when that subflow has a struct socket. Also returns NULL
+ * when there are no subflows. The msk socket lock must be held
+ * (sock_owned_by_me()).
+ */
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *backup = NULL;
+
+ sock_owned_by_me((const struct sock *)msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!sk_stream_memory_free(ssk)) {
+ struct socket *sock = ssk->sk_socket;
+
+ if (sock) {
+ /* clear the send-space flag before enabling the
+ * callback; the barrier orders the two bit ops
+ */
+ clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ smp_mb__after_atomic();
+
+ /* enables sk->write_space() callbacks */
+ set_bit(SOCK_NOSPACE, &sock->flags);
+ }
+
+ return NULL;
+ }
+
+ /* remember the first backup subflow, keep looking for a
+ * regular one
+ */
+ if (subflow->backup) {
+ if (!backup)
+ backup = ssk;
+
+ continue;
+ }
+
+ return ssk;
+ }
+
+ return backup;
+}
+
static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
struct socket *sock;
@@ -278,6 +653,15 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EOPNOTSUPP;
lock_sock(sk);
+
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
+ ret = sk_stream_wait_connect(sk, &timeo);
+ if (ret)
+ goto out;
+ }
+
ssock = __mptcp_tcp_fallback(msk);
if (unlikely(ssock)) {
fallback:
@@ -286,19 +670,29 @@ fallback:
return ret >= 0 ? ret + copied : (copied ? copied : ret);
}
- timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ mptcp_clean_una(sk);
- ssk = mptcp_subflow_get(msk);
- if (!ssk) {
- release_sock(sk);
- return -ENOTCONN;
+ __mptcp_flush_join_list(msk);
+ ssk = mptcp_subflow_get_send(msk);
+ while (!sk_stream_memory_free(sk) || !ssk) {
+ ret = sk_stream_wait_memory(sk, &timeo);
+ if (ret)
+ goto out;
+
+ mptcp_clean_una(sk);
+
+ ssk = mptcp_subflow_get_send(msk);
+ if (list_empty(&msk->conn_list)) {
+ ret = -ENOTCONN;
+ goto out;
+ }
}
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
while (msg_data_left(msg)) {
- ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+ ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
&size_goal);
if (ret < 0)
break;
@@ -311,73 +705,101 @@ fallback:
copied += ret;
}
+ mptcp_set_timeout(sk, ssk);
if (copied) {
ret = copied;
tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
size_goal);
+
+ /* start the timer, if it's not pending */
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
}
ssk_check_wmem(msk, ssk);
release_sock(ssk);
+out:
release_sock(sk);
return ret;
}
-int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
+static void mptcp_wait_data(struct sock *sk, long *timeo)
{
- struct mptcp_read_arg *arg = desc->arg.data;
- size_t copy_len;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct mptcp_sock *msk = mptcp_sk(sk);
- copy_len = min(desc->count, len);
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+
+ sk_wait_event(sk, timeo,
+ test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
- if (likely(arg->msg)) {
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
+ struct msghdr *msg,
+ size_t len)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct sk_buff *skb;
+ int copied = 0;
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u32 offset = MPTCP_SKB_CB(skb)->offset;
+ u32 data_len = skb->len - offset;
+ u32 count = min_t(size_t, len - copied, data_len);
int err;
- err = skb_copy_datagram_msg(skb, offset, arg->msg, copy_len);
- if (err) {
- pr_debug("error path");
- desc->error = err;
- return err;
+ err = skb_copy_datagram_msg(skb, offset, msg, count);
+ if (unlikely(err < 0)) {
+ if (!copied)
+ return err;
+ break;
}
- } else {
- pr_debug("Flushing skb payload");
- }
- desc->count -= copy_len;
+ copied += count;
+
+ if (count < data_len) {
+ MPTCP_SKB_CB(skb)->offset += count;
+ break;
+ }
- pr_debug("consumed %zu bytes, %zu left", copy_len, desc->count);
- return copy_len;
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ __kfree_skb(skb);
+
+ if (copied >= len)
+ break;
+ }
+
+ return copied;
}
-static void mptcp_wait_data(struct sock *sk, long *timeo)
+static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- struct mptcp_sock *msk = mptcp_sk(sk);
+ unsigned int moved = 0;
+ bool done;
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ do {
+ struct sock *ssk = mptcp_subflow_recv_lookup(msk);
- sk_wait_event(sk, timeo,
- test_and_clear_bit(MPTCP_DATA_READY, &msk->flags), &wait);
+ if (!ssk)
+ break;
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
+ lock_sock(ssk);
+ done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ release_sock(ssk);
+ } while (!done);
+
+ return moved > 0;
}
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int nonblock, int flags, int *addr_len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- struct mptcp_subflow_context *subflow;
- bool more_data_avail = false;
- struct mptcp_read_arg arg;
- read_descriptor_t desc;
- bool wait_data = false;
struct socket *ssock;
- struct tcp_sock *tp;
- bool done = false;
- struct sock *ssk;
int copied = 0;
int target;
long timeo;
@@ -395,65 +817,27 @@ fallback:
return copied;
}
- arg.msg = msg;
- desc.arg.data = &arg;
- desc.error = 0;
-
timeo = sock_rcvtimeo(sk, nonblock);
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ __mptcp_flush_join_list(msk);
- while (!done) {
- u32 map_remaining;
+ while (len > (size_t)copied) {
int bytes_read;
- ssk = mptcp_subflow_recv_lookup(msk);
- pr_debug("msk=%p ssk=%p", msk, ssk);
- if (!ssk)
- goto wait_for_data;
-
- subflow = mptcp_subflow_ctx(ssk);
- tp = tcp_sk(ssk);
+ bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
+ if (unlikely(bytes_read < 0)) {
+ if (!copied)
+ copied = bytes_read;
+ goto out_err;
+ }
- lock_sock(ssk);
- do {
- /* try to read as much data as available */
- map_remaining = subflow->map_data_len -
- mptcp_subflow_get_map_offset(subflow);
- desc.count = min_t(size_t, len - copied, map_remaining);
- pr_debug("reading %zu bytes, copied %d", desc.count,
- copied);
- bytes_read = tcp_read_sock(ssk, &desc,
- mptcp_read_actor);
- if (bytes_read < 0) {
- if (!copied)
- copied = bytes_read;
- done = true;
- goto next;
- }
+ copied += bytes_read;
- pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
- msk->ack_seq + bytes_read);
- msk->ack_seq += bytes_read;
- copied += bytes_read;
- if (copied >= len) {
- done = true;
- goto next;
- }
- if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
- pr_err("Urgent data present, cannot proceed");
- done = true;
- goto next;
- }
-next:
- more_data_avail = mptcp_subflow_data_available(ssk);
- } while (more_data_avail && !done);
- release_sock(ssk);
- continue;
-
-wait_for_data:
- more_data_avail = false;
+ if (skb_queue_empty(&sk->sk_receive_queue) &&
+ __mptcp_move_skbs(msk))
+ continue;
/* only the master socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
@@ -494,30 +878,92 @@ wait_for_data:
}
pr_debug("block timeout %ld", timeo);
- wait_data = true;
mptcp_wait_data(sk, &timeo);
if (unlikely(__mptcp_tcp_fallback(msk)))
goto fallback;
}
- if (more_data_avail) {
- if (!test_bit(MPTCP_DATA_READY, &msk->flags))
- set_bit(MPTCP_DATA_READY, &msk->flags);
- } else if (!wait_data) {
+ if (skb_queue_empty(&sk->sk_receive_queue)) {
+ /* entire backlog drained, clear DATA_READY. */
clear_bit(MPTCP_DATA_READY, &msk->flags);
- /* .. race-breaker: ssk might get new data after last
- * data_available() returns false.
+ /* .. race-breaker: ssk might have gotten new data
+ * after last __mptcp_move_skbs() returned false.
*/
- ssk = mptcp_subflow_recv_lookup(msk);
- if (unlikely(ssk))
+ if (unlikely(__mptcp_move_skbs(msk)))
set_bit(MPTCP_DATA_READY, &msk->flags);
+ } else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
+ /* data to read but mptcp_wait_data() cleared DATA_READY */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
}
-
+out_err:
release_sock(sk);
return copied;
}
+static void mptcp_retransmit_handler(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (atomic64_read(&msk->snd_una) == msk->write_seq) {
+ mptcp_stop_timer(sk);
+ } else {
+ set_bit(MPTCP_WORK_RTX, &msk->flags);
+ if (schedule_work(&msk->work))
+ sock_hold(sk);
+ }
+}
+
+static void mptcp_retransmit_timer(struct timer_list *t)
+{
+ struct inet_connection_sock *icsk = from_timer(icsk, t,
+ icsk_retransmit_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ mptcp_retransmit_handler(sk);
+ } else {
+ /* delegate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
+ &sk->sk_tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/* Find an idle subflow. Return NULL if there is unacked data at tcp
+ * level.
+ *
+ * A backup subflow is returned only if that is the only kind available.
+ */
+static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *backup = NULL;
+
+ sock_owned_by_me((const struct sock *)msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* still data outstanding at TCP level? Don't retransmit. */
+ if (!tcp_write_queue_empty(ssk))
+ return NULL;
+
+ if (subflow->backup) {
+ if (!backup)
+ backup = ssk;
+ continue;
+ }
+
+ return ssk;
+ }
+
+ return backup;
+}
+
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
@@ -543,27 +989,140 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
}
}
+static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+ return 0;
+}
+
+static void mptcp_worker(struct work_struct *work)
+{
+ struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
+ struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
+ int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
+ struct mptcp_data_frag *dfrag;
+ u64 orig_write_seq;
+ size_t copied = 0;
+ struct msghdr msg;
+ long timeo = 0;
+
+ lock_sock(sk);
+ mptcp_clean_una(sk);
+ __mptcp_flush_join_list(msk);
+ __mptcp_move_skbs(msk);
+
+ if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
+ goto unlock;
+
+ dfrag = mptcp_rtx_head(sk);
+ if (!dfrag)
+ goto unlock;
+
+ ssk = mptcp_subflow_get_retrans(msk);
+ if (!ssk)
+ goto reset_unlock;
+
+ lock_sock(ssk);
+
+ msg.msg_flags = MSG_DONTWAIT;
+ orig_len = dfrag->data_len;
+ orig_offset = dfrag->offset;
+ orig_write_seq = dfrag->data_seq;
+ while (dfrag->data_len > 0) {
+ ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
+ &size_goal);
+ if (ret < 0)
+ break;
+
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+ copied += ret;
+ dfrag->data_len -= ret;
+ dfrag->offset += ret;
+ }
+ if (copied)
+ tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
+ size_goal);
+
+ dfrag->data_seq = orig_write_seq;
+ dfrag->offset = orig_offset;
+ dfrag->data_len = orig_len;
+
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+
+reset_unlock:
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+
+unlock:
+ release_sock(sk);
+ sock_put(sk);
+}
+
static int __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ spin_lock_init(&msk->join_list_lock);
+
INIT_LIST_HEAD(&msk->conn_list);
+ INIT_LIST_HEAD(&msk->join_list);
+ INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
+ INIT_WORK(&msk->work, mptcp_worker);
msk->first = NULL;
+ inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
+
+ mptcp_pm_data_init(msk);
+
+ /* re-use the csk retrans timer for MPTCP-level retrans */
+ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
return 0;
}
static int mptcp_init_sock(struct sock *sk)
{
- if (!mptcp_is_enabled(sock_net(sk)))
+ struct net *net = sock_net(sk);
+ int ret;
+
+ if (!mptcp_is_enabled(net))
return -ENOPROTOOPT;
- return __mptcp_init_sock(sk);
+ if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
+ return -ENOMEM;
+
+ ret = __mptcp_init_sock(sk);
+ if (ret)
+ return ret;
+
+ sk_sockets_allocated_inc(sk);
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
+
+ return 0;
+}
+
+static void __mptcp_clear_xmit(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *dtmp, *dfrag;
+
+ sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+
+ list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
+ dfrag_clear(sk, dfrag);
}
-static void mptcp_subflow_shutdown(struct sock *ssk, int how)
+static void mptcp_cancel_work(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (cancel_work_sync(&msk->work))
+ sock_put(sk);
+}
+
+static void mptcp_subflow_shutdown(struct sock *ssk, int how,
+ bool data_fin_tx_enable, u64 data_fin_tx_seq)
{
lock_sock(ssk);
@@ -576,6 +1135,14 @@ static void mptcp_subflow_shutdown(struct sock *ssk, int how)
tcp_disconnect(ssk, O_NONBLOCK);
break;
default:
+ if (data_fin_tx_enable) {
+ struct mptcp_subflow_context *subflow;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->data_fin_tx_seq = data_fin_tx_seq;
+ subflow->data_fin_tx_enable = 1;
+ }
+
ssk->sk_shutdown |= how;
tcp_shutdown(ssk, how);
break;
@@ -592,22 +1159,36 @@ static void mptcp_close(struct sock *sk, long timeout)
struct mptcp_subflow_context *subflow, *tmp;
struct mptcp_sock *msk = mptcp_sk(sk);
LIST_HEAD(conn_list);
+ u64 data_fin_tx_seq;
lock_sock(sk);
mptcp_token_destroy(msk->token);
inet_sk_state_store(sk, TCP_CLOSE);
+ __mptcp_flush_join_list(msk);
+
list_splice_init(&msk->conn_list, &conn_list);
+ data_fin_tx_seq = msk->write_seq;
+
+ __mptcp_clear_xmit(sk);
+
release_sock(sk);
list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ subflow->data_fin_tx_seq = data_fin_tx_seq;
+ subflow->data_fin_tx_enable = 1;
__mptcp_close_ssk(sk, ssk, subflow, timeout);
}
+ mptcp_cancel_work(sk);
+ mptcp_pm_close(msk);
+
+ __skb_queue_purge(&sk->sk_receive_queue);
+
sk_common_release(sk);
}
@@ -634,6 +1215,15 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}
+static int mptcp_disconnect(struct sock *sk, int flags)
+{
+ lock_sock(sk);
+ __mptcp_clear_xmit(sk);
+ release_sock(sk);
+ mptcp_cancel_work(sk);
+ return tcp_disconnect(sk, flags);
+}
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
@@ -643,9 +1233,12 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
}
#endif
-struct sock *mptcp_sk_clone_lock(const struct sock *sk)
+struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC);
+ struct mptcp_sock *msk;
+ u64 ack_seq;
if (!nsk)
return NULL;
@@ -655,6 +1248,40 @@ struct sock *mptcp_sk_clone_lock(const struct sock *sk)
inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif
+ __mptcp_init_sock(nsk);
+
+ msk = mptcp_sk(nsk);
+ msk->local_key = subflow_req->local_key;
+ msk->token = subflow_req->token;
+ msk->subflow = NULL;
+
+ if (unlikely(mptcp_token_new_accept(subflow_req->token, nsk))) {
+ bh_unlock_sock(nsk);
+
+ /* we can't call into mptcp_close() here - possible BH context,
+ * so free the sock directly
+ */
+ nsk->sk_prot->destroy(nsk);
+ sk_free(nsk);
+ return NULL;
+ }
+
+ msk->write_seq = subflow_req->idsn + 1;
+ atomic64_set(&msk->snd_una, msk->write_seq);
+ if (subflow_req->remote_key_valid) {
+ msk->can_ack = true;
+ msk->remote_key = subflow_req->remote_key;
+ mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
+ ack_seq++;
+ msk->ack_seq = ack_seq;
+ }
+
+ /* will be fully established after successful MPC subflow creation */
+ inet_sk_state_store(nsk, TCP_SYN_RECV);
+ bh_unlock_sock(nsk);
+
+ /* keep a single reference */
+ __sock_put(nsk);
return nsk;
}
@@ -682,62 +1309,37 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
struct mptcp_subflow_context *subflow;
struct sock *new_mptcp_sock;
struct sock *ssk = newsk;
- u64 ack_seq;
subflow = mptcp_subflow_ctx(newsk);
- lock_sock(sk);
+ new_mptcp_sock = subflow->conn;
- local_bh_disable();
- new_mptcp_sock = mptcp_sk_clone_lock(sk);
- if (!new_mptcp_sock) {
- *err = -ENOBUFS;
- local_bh_enable();
- release_sock(sk);
- mptcp_subflow_shutdown(newsk, SHUT_RDWR + 1);
- tcp_close(newsk, 0);
- return NULL;
+ /* is_mptcp should be false if subflow->conn is missing, see
+ * subflow_syn_recv_sock()
+ */
+ if (WARN_ON_ONCE(!new_mptcp_sock)) {
+ tcp_sk(newsk)->is_mptcp = 0;
+ return newsk;
}
- __mptcp_init_sock(new_mptcp_sock);
+ /* acquire the 2nd reference for the owning socket */
+ sock_hold(new_mptcp_sock);
+ local_bh_disable();
+ bh_lock_sock(new_mptcp_sock);
msk = mptcp_sk(new_mptcp_sock);
- msk->local_key = subflow->local_key;
- msk->token = subflow->token;
- msk->subflow = NULL;
msk->first = newsk;
- mptcp_token_update_accept(newsk, new_mptcp_sock);
-
- msk->write_seq = subflow->idsn + 1;
- if (subflow->can_ack) {
- msk->can_ack = true;
- msk->remote_key = subflow->remote_key;
- mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
- ack_seq++;
- msk->ack_seq = ack_seq;
- }
newsk = new_mptcp_sock;
mptcp_copy_inaddrs(newsk, ssk);
list_add(&subflow->node, &msk->conn_list);
- /* will be fully established at mptcp_stream_accept()
- * completion.
- */
- inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV);
bh_unlock_sock(new_mptcp_sock);
- local_bh_enable();
- release_sock(sk);
- /* the subflow can already receive packet, avoid racing with
- * the receive path and process the pending ones
- */
- lock_sock(ssk);
- subflow->rel_write_seq = 1;
- subflow->tcp_sock = ssk;
- subflow->conn = new_mptcp_sock;
- if (unlikely(!skb_queue_empty(&ssk->sk_receive_queue)))
- mptcp_subflow_data_available(ssk);
- release_sock(ssk);
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+ local_bh_enable();
+ } else {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
}
return newsk;
@@ -749,66 +1351,92 @@ static void mptcp_destroy(struct sock *sk)
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
+
+ sk_sockets_allocated_dec(sk);
}
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- int ret = -EOPNOTSUPP;
struct socket *ssock;
- struct sock *ssk;
pr_debug("msk=%p", msk);
/* @@ the meaning of setsockopt() when the socket is connected and
- * there are multiple subflows is not defined.
+ * there are multiple subflows is not yet defined. It is up to the
+ * MPTCP-level socket to configure the subflows until the subflow
+ * is in TCP fallback, when TCP socket options are passed through
+ * to the one remaining subflow.
*/
lock_sock(sk);
- ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
- if (IS_ERR(ssock)) {
- release_sock(sk);
- return ret;
- }
+ ssock = __mptcp_tcp_fallback(msk);
+ if (ssock)
+ return tcp_setsockopt(ssock->sk, level, optname, optval,
+ optlen);
- ssk = ssock->sk;
- sock_hold(ssk);
release_sock(sk);
- ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
- sock_put(ssk);
-
- return ret;
+ return -EOPNOTSUPP;
}
static int mptcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *option)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- int ret = -EOPNOTSUPP;
struct socket *ssock;
- struct sock *ssk;
pr_debug("msk=%p", msk);
- /* @@ the meaning of getsockopt() when the socket is connected and
- * there are multiple subflows is not defined.
+ /* @@ the meaning of getsockopt() when the socket is connected and
+ * there are multiple subflows is not yet defined. It is up to the
+ * MPTCP-level socket to configure the subflows until the subflow
+ * is in TCP fallback, when socket options are passed through
+ * to the one remaining subflow.
*/
lock_sock(sk);
- ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE);
- if (IS_ERR(ssock)) {
- release_sock(sk);
- return ret;
- }
+ ssock = __mptcp_tcp_fallback(msk);
+ if (ssock)
+ return tcp_getsockopt(ssock->sk, level, optname, optval,
+ option);
- ssk = ssock->sk;
- sock_hold(ssk);
release_sock(sk);
- ret = tcp_getsockopt(ssk, level, optname, optval, option);
- sock_put(ssk);
+ return -EOPNOTSUPP;
+}
- return ret;
+#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED)
+
+/* this is very similar to tcp_release_cb(), but we must handle a
+ * different set of events, and handle them differently
+ */
+static void mptcp_release_cb(struct sock *sk)
+{
+ unsigned long flags, nflags;
+
+ do {
+ flags = sk->sk_tsq_flags;
+ if (!(flags & MPTCP_DEFERRED_ALL))
+ return;
+ nflags = flags & ~MPTCP_DEFERRED_ALL;
+ } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
+
+ sock_release_ownership(sk);
+
+ if (flags & TCPF_DELACK_TIMER_DEFERRED) {
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sock *ssk;
+
+ ssk = mptcp_subflow_recv_lookup(msk);
+ if (!ssk || !schedule_work(&msk->work))
+ __sock_put(sk);
+ }
+
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
+ mptcp_retransmit_handler(sk);
+ __sock_put(sk);
+ }
}
static int mptcp_get_port(struct sock *sk, unsigned short snum)
@@ -832,13 +1460,15 @@ void mptcp_finish_connect(struct sock *ssk)
u64 ack_seq;
subflow = mptcp_subflow_ctx(ssk);
-
- if (!subflow->mp_capable)
- return;
-
sk = subflow->conn;
msk = mptcp_sk(sk);
+ if (!subflow->mp_capable) {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+ return;
+ }
+
pr_debug("msk=%p, token=%u", sk, subflow->token);
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
@@ -856,6 +1486,9 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
+ atomic64_set(&msk->snd_una, msk->write_seq);
+
+ mptcp_pm_new_connection(msk, 0);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
@@ -867,6 +1500,46 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_unlock_bh(&sk->sk_callback_lock);
}
+bool mptcp_finish_join(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct sock *parent = (void *)msk;
+ struct socket *parent_sock;
+ bool ret;
+
+ pr_debug("msk=%p, subflow=%p", msk, subflow);
+
+ /* mptcp socket already closing? */
+ if (inet_sk_state_load(parent) != TCP_ESTABLISHED)
+ return false;
+
+ if (!msk->pm.server_side)
+ return true;
+
+ /* passive connection, attach to msk socket */
+ parent_sock = READ_ONCE(parent->sk_socket);
+ if (parent_sock && !sk->sk_socket)
+ mptcp_sock_graft(sk, parent_sock);
+
+ ret = mptcp_pm_allow_new_subflow(msk);
+ if (ret) {
+ /* active connections are already on conn_list */
+ spin_lock_bh(&msk->join_list_lock);
+ if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+ }
+ return ret;
+}
+
+bool mptcp_sk_is_subflow(const struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ return subflow->mp_join == 1;
+}
+
static bool mptcp_memory_free(const struct sock *sk, int wake)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -878,6 +1551,7 @@ static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
.init = mptcp_init_sock,
+ .disconnect = mptcp_disconnect,
.close = mptcp_close,
.accept = mptcp_accept,
.setsockopt = mptcp_setsockopt,
@@ -886,10 +1560,16 @@ static struct proto mptcp_prot = {
.destroy = mptcp_destroy,
.sendmsg = mptcp_sendmsg,
.recvmsg = mptcp_recvmsg,
+ .release_cb = mptcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = mptcp_get_port,
+ .sockets_allocated = &mptcp_sockets_allocated,
+ .memory_allocated = &tcp_memory_allocated,
+ .memory_pressure = &tcp_memory_pressure,
.stream_memory_free = mptcp_memory_free,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_mem = sysctl_tcp_mem,
.obj_size = sizeof(struct mptcp_sock),
.no_autobind = true,
};
@@ -1045,14 +1725,13 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
+ __mptcp_flush_join_list(msk);
list_for_each_entry(subflow, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
if (!ssk->sk_socket)
mptcp_sock_graft(ssk, newsock);
}
-
- inet_sk_state_store(newsock->sk, TCP_ESTABLISHED);
}
sock_put(ssock->sk);
@@ -1128,10 +1807,11 @@ static int mptcp_shutdown(struct socket *sock, int how)
sock->state = SS_CONNECTED;
}
+ __mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
- mptcp_subflow_shutdown(tcp_sk, how);
+ mptcp_subflow_shutdown(tcp_sk, how, 1, msk->write_seq);
}
out_unlock:
@@ -1178,12 +1858,18 @@ void mptcp_proto_init(void)
{
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
+ if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
+ panic("Failed to allocate MPTCP pcpu counter\n");
+
mptcp_subflow_init();
+ mptcp_pm_init();
if (proto_register(&mptcp_prot, 1) != 0)
panic("Failed to register MPTCP proto.\n");
inet_register_protosw(&mptcp_protosw);
+
+ BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 8a99a2930284..f733c5425552 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -17,6 +17,12 @@
#define OPTION_MPTCP_MPC_SYN BIT(0)
#define OPTION_MPTCP_MPC_SYNACK BIT(1)
#define OPTION_MPTCP_MPC_ACK BIT(2)
+#define OPTION_MPTCP_MPJ_SYN BIT(3)
+#define OPTION_MPTCP_MPJ_SYNACK BIT(4)
+#define OPTION_MPTCP_MPJ_ACK BIT(5)
+#define OPTION_MPTCP_ADD_ADDR BIT(6)
+#define OPTION_MPTCP_ADD_ADDR6 BIT(7)
+#define OPTION_MPTCP_RM_ADDR BIT(8)
/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0
@@ -33,12 +39,30 @@
#define TCPOLEN_MPTCP_MPC_SYNACK 12
#define TCPOLEN_MPTCP_MPC_ACK 20
#define TCPOLEN_MPTCP_MPC_ACK_DATA 22
+#define TCPOLEN_MPTCP_MPJ_SYN 12
+#define TCPOLEN_MPTCP_MPJ_SYNACK 16
+#define TCPOLEN_MPTCP_MPJ_ACK 24
#define TCPOLEN_MPTCP_DSS_BASE 4
#define TCPOLEN_MPTCP_DSS_ACK32 4
#define TCPOLEN_MPTCP_DSS_ACK64 8
#define TCPOLEN_MPTCP_DSS_MAP32 10
#define TCPOLEN_MPTCP_DSS_MAP64 14
#define TCPOLEN_MPTCP_DSS_CHECKSUM 2
+#define TCPOLEN_MPTCP_ADD_ADDR 16
+#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10
+#define TCPOLEN_MPTCP_ADD_ADDR6 28
+#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22
+#define TCPOLEN_MPTCP_PORT_LEN 2
+#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
+
+/* MPTCP MP_JOIN flags */
+#define MPTCPOPT_BACKUP BIT(0)
+#define MPTCPOPT_HMAC_LEN 20
+#define MPTCPOPT_THMAC_LEN 8
/* MPTCP MP_CAPABLE flags */
#define MPTCP_VERSION_MASK (0x0F)
@@ -55,9 +79,75 @@
#define MPTCP_DSS_HAS_ACK BIT(0)
#define MPTCP_DSS_FLAG_MASK (0x1F)
+/* MPTCP ADD_ADDR flags */
+#define MPTCP_ADDR_ECHO BIT(0)
+#define MPTCP_ADDR_HMAC_LEN 20
+#define MPTCP_ADDR_IPVERSION_4 4
+#define MPTCP_ADDR_IPVERSION_6 6
+
/* MPTCP socket flags */
-#define MPTCP_DATA_READY BIT(0)
-#define MPTCP_SEND_SPACE BIT(1)
+#define MPTCP_DATA_READY 0
+#define MPTCP_SEND_SPACE 1
+#define MPTCP_WORK_RTX 2
+
+static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
+{
+ return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
+ ((nib & 0xF) << 8) | field);
+}
+
+#define MPTCP_PM_MAX_ADDR 4
+
+struct mptcp_addr_info {
+ sa_family_t family;
+ __be16 port;
+ u8 id;
+ union {
+ struct in_addr addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ struct in6_addr addr6;
+#endif
+ };
+};
+
+enum mptcp_pm_status {
+ MPTCP_PM_ADD_ADDR_RECEIVED,
+ MPTCP_PM_ESTABLISHED,
+ MPTCP_PM_SUBFLOW_ESTABLISHED,
+};
+
+struct mptcp_pm_data {
+ struct mptcp_addr_info local;
+ struct mptcp_addr_info remote;
+
+ spinlock_t lock; /* protects the whole PM data */
+
+ bool addr_signal;
+ bool server_side;
+ bool work_pending;
+ bool accept_addr;
+ bool accept_subflow;
+ u8 add_addr_signaled;
+ u8 add_addr_accepted;
+ u8 local_addr_used;
+ u8 subflows;
+ u8 add_addr_signal_max;
+ u8 add_addr_accept_max;
+ u8 local_addr_max;
+ u8 subflows_max;
+ u8 status;
+
+ struct work_struct work;
+};
+
+struct mptcp_data_frag {
+ struct list_head list;
+ u64 data_seq;
+ int data_len;
+ int offset;
+ int overhead;
+ struct page *page;
+};
/* MPTCP connection sock */
struct mptcp_sock {
@@ -67,13 +157,20 @@ struct mptcp_sock {
u64 remote_key;
u64 write_seq;
u64 ack_seq;
+ atomic64_t snd_una;
+ unsigned long timer_ival;
u32 token;
unsigned long flags;
bool can_ack;
+ spinlock_t join_list_lock;
+ struct work_struct work;
struct list_head conn_list;
+ struct list_head rtx_queue;
+ struct list_head join_list;
struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
+ struct mptcp_pm_data pm;
};
#define mptcp_for_each_subflow(__msk, __subflow) \
@@ -84,17 +181,42 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
return (struct mptcp_sock *)sk;
}
+static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (list_empty(&msk->rtx_queue))
+ return NULL;
+
+ return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
+static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (list_empty(&msk->rtx_queue))
+ return NULL;
+
+ return list_first_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
struct mptcp_subflow_request_sock {
struct tcp_request_sock sk;
u16 mp_capable : 1,
mp_join : 1,
backup : 1,
remote_key_valid : 1;
+ u8 local_id;
+ u8 remote_id;
u64 local_key;
u64 remote_key;
u64 idsn;
u32 token;
u32 ssn_offset;
+ u64 thmac;
+ u32 local_nonce;
+ u32 remote_nonce;
};
static inline struct mptcp_subflow_request_sock *
@@ -117,14 +239,28 @@ struct mptcp_subflow_context {
u32 ssn_offset;
u32 map_data_len;
u32 request_mptcp : 1, /* send MP_CAPABLE */
+ request_join : 1, /* send MP_JOIN */
+ request_bkup : 1,
mp_capable : 1, /* remote is MPTCP capable */
- fourth_ack : 1, /* send initial DSS */
+ mp_join : 1, /* remote is JOINing */
+ fully_established : 1, /* path validated */
+ pm_notified : 1, /* PM hook called for established status */
conn_finished : 1,
map_valid : 1,
mpc_map : 1,
+ backup : 1,
data_avail : 1,
rx_eof : 1,
+ data_fin_tx_enable : 1,
can_ack : 1; /* only after processing the remote a key */
+ u64 data_fin_tx_seq;
+ u32 remote_nonce;
+ u64 thmac;
+ u32 local_nonce;
+ u32 remote_token;
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ u8 local_id;
+ u8 remote_id;
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
@@ -168,6 +304,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
int mptcp_is_enabled(struct net *net);
bool mptcp_subflow_data_available(struct sock *sk);
void mptcp_subflow_init(void);
+
+/* called with sk socket lock held */
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
@@ -190,23 +331,20 @@ void mptcp_proto_init(void);
int mptcp_proto_v6_init(void);
#endif
-struct mptcp_read_arg {
- struct msghdr *msg;
-};
-
-int mptcp_read_actor(read_descriptor_t *desc, struct sk_buff *skb,
- unsigned int offset, size_t len);
-
+struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req);
void mptcp_get_options(const struct sk_buff *skb,
struct tcp_options_received *opt_rx);
void mptcp_finish_connect(struct sock *sk);
+void mptcp_data_ready(struct sock *sk, struct sock *ssk);
+bool mptcp_finish_join(struct sock *sk);
+void mptcp_data_acked(struct sock *sk);
int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(u32 token);
int mptcp_token_new_connect(struct sock *sk);
-int mptcp_token_new_accept(u32 token);
-void mptcp_token_update_accept(struct sock *sk, struct sock *conn);
+int mptcp_token_new_accept(u32 token, struct sock *conn);
+struct mptcp_sock *mptcp_token_get_sock(u32 token);
void mptcp_token_destroy(u32 token);
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
@@ -222,8 +360,48 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
mptcp_crypto_key_sha(*key, token, idsn);
}
-void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
- void *hash_out);
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
+
+void mptcp_pm_init(void);
+void mptcp_pm_data_init(struct mptcp_sock *msk);
+void mptcp_pm_close(struct mptcp_sock *msk);
+void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side);
+void mptcp_pm_fully_established(struct mptcp_sock *msk);
+bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
+void mptcp_pm_connection_closed(struct mptcp_sock *msk);
+void mptcp_pm_subflow_established(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow);
+void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
+void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+
+int mptcp_pm_announce_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id);
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id);
+
+static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal);
+}
+
+static inline unsigned int mptcp_add_addr_len(int family)
+{
+ if (family == AF_INET)
+ return TCPOLEN_MPTCP_ADD_ADDR;
+ return TCPOLEN_MPTCP_ADD_ADDR6;
+}
+
+bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr);
+int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
+
+void mptcp_pm_nl_init(void);
+void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
+void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
+void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
+void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk);
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
{
@@ -237,4 +415,6 @@ static inline bool before64(__u64 seq1, __u64 seq2)
#define after64(seq2, seq1) before64(seq1, seq2)
+void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
+
#endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 65122edf60aa..b5180c81588e 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
+#include <crypto/algapi.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -19,17 +20,42 @@
#endif
#include <net/mptcp.h>
#include "protocol.h"
+#include "mib.h"
+
+static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
+ enum linux_mptcp_mib_field field)
+{
+ MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
+}
static int subflow_rebuild_header(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- int err = 0;
+ int local_id, err = 0;
if (subflow->request_mptcp && !subflow->token) {
pr_debug("subflow=%p", sk);
err = mptcp_token_new_connect(sk);
+ } else if (subflow->request_join && !subflow->local_nonce) {
+ struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
+
+ pr_debug("subflow=%p", sk);
+
+ do {
+ get_random_bytes(&subflow->local_nonce, sizeof(u32));
+ } while (!subflow->local_nonce);
+
+ if (subflow->local_id)
+ goto out;
+
+ local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
+ if (local_id < 0)
+ return -EINVAL;
+
+ subflow->local_id = local_id;
}
+out:
if (err)
return err;
@@ -47,6 +73,51 @@ static void subflow_req_destructor(struct request_sock *req)
tcp_request_sock_ops.destructor(req);
}
+static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
+ void *hmac)
+{
+ u8 msg[8];
+
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
+}
+
+/* validate received token and create truncated hmac and nonce for SYN-ACK */
+static bool subflow_token_join_request(struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ struct mptcp_sock *msk;
+ int local_id;
+
+ msk = mptcp_token_get_sock(subflow_req->token);
+ if (!msk) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
+ return false;
+ }
+
+ local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
+ if (local_id < 0) {
+ sock_put((struct sock *)msk);
+ return false;
+ }
+ subflow_req->local_id = local_id;
+
+ get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
+
+ subflow_generate_hmac(msk->local_key, msk->remote_key,
+ subflow_req->local_nonce,
+ subflow_req->remote_nonce, hmac);
+
+ subflow_req->thmac = get_unaligned_be64(hmac);
+
+ sock_put((struct sock *)msk);
+ return true;
+}
+
static void subflow_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
@@ -61,6 +132,7 @@ static void subflow_init_req(struct request_sock *req,
mptcp_get_options(skb, &rx_opt);
subflow_req->mp_capable = 0;
+ subflow_req->mp_join = 0;
subflow_req->remote_key_valid = 0;
#ifdef CONFIG_TCP_MD5SIG
@@ -71,6 +143,15 @@ static void subflow_init_req(struct request_sock *req,
return;
#endif
+ if (rx_opt.mptcp.mp_capable) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
+
+ if (rx_opt.mptcp.mp_join)
+ return;
+ } else if (rx_opt.mptcp.mp_join) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
+ }
+
if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
int err;
@@ -79,6 +160,19 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->mp_capable = 1;
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+ } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+ subflow_req->mp_join = 1;
+ subflow_req->backup = rx_opt.mptcp.backup;
+ subflow_req->remote_id = rx_opt.mptcp.join_id;
+ subflow_req->token = rx_opt.mptcp.token;
+ subflow_req->remote_nonce = rx_opt.mptcp.nonce;
+ pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
+ subflow_req->remote_nonce);
+ if (!subflow_token_join_request(req, skb)) {
+ subflow_req->mp_join = 0;
+ // @@ need to trigger RST
+ }
}
}
@@ -106,13 +200,41 @@ static void subflow_v6_init_req(struct request_sock *req,
}
#endif
+/* validate received truncated hmac and create hmac for third ACK */
+static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
+{
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ u64 thmac;
+
+ subflow_generate_hmac(subflow->remote_key, subflow->local_key,
+ subflow->remote_nonce, subflow->local_nonce,
+ hmac);
+
+ thmac = get_unaligned_be64(hmac);
+ pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
+ subflow, subflow->token,
+ (unsigned long long)thmac,
+ (unsigned long long)subflow->thmac);
+
+ return thmac == subflow->thmac;
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct sock *parent = subflow->conn;
subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
- if (subflow->conn && !subflow->conn_finished) {
+ if (inet_sk_state_load(parent) != TCP_ESTABLISHED) {
+ inet_sk_state_store(parent, TCP_ESTABLISHED);
+ parent->sk_state_change(parent);
+ }
+
+ if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
+ return;
+
+ if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
subflow->remote_key);
mptcp_finish_connect(sk);
@@ -122,6 +244,33 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
}
+ } else if (subflow->mp_join) {
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
+ subflow, subflow->thmac,
+ subflow->remote_nonce);
+ if (!subflow_thmac_valid(subflow)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
+ subflow->mp_join = 0;
+ goto do_reset;
+ }
+
+ subflow_generate_hmac(subflow->local_key, subflow->remote_key,
+ subflow->local_nonce,
+ subflow->remote_nonce,
+ subflow->hmac);
+
+ if (skb)
+ subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+
+ if (!mptcp_finish_join(sk))
+ goto do_reset;
+
+ subflow->conn_finished = 1;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+ } else {
+do_reset:
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
}
}
@@ -172,6 +321,32 @@ drop:
}
#endif
+/* validate hmac received in third ACK */
+static bool subflow_hmac_valid(const struct request_sock *req,
+ const struct tcp_options_received *rx_opt)
+{
+ const struct mptcp_subflow_request_sock *subflow_req;
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ struct mptcp_sock *msk;
+ bool ret;
+
+ subflow_req = mptcp_subflow_rsk(req);
+ msk = mptcp_token_get_sock(subflow_req->token);
+ if (!msk)
+ return false;
+
+ subflow_generate_hmac(msk->remote_key, msk->local_key,
+ subflow_req->remote_nonce,
+ subflow_req->local_nonce, hmac);
+
+ ret = true;
+ if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac)))
+ ret = false;
+
+ sock_put((struct sock *)msk);
+ return ret;
+}
+
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb,
struct request_sock *req,
@@ -182,6 +357,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
struct mptcp_subflow_request_sock *subflow_req;
struct tcp_options_received opt_rx;
+ bool fallback_is_fatal = false;
+ struct sock *new_msk = NULL;
struct sock *child;
pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
@@ -197,7 +374,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
* out-of-order pkt, which will not carry the MP_CAPABLE
* opt even on mptcp enabled paths
*/
- goto create_child;
+ goto create_msk;
}
opt_rx.mptcp.mp_capable = 0;
@@ -207,6 +384,21 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
subflow_req->remote_key_valid = 1;
} else {
subflow_req->mp_capable = 0;
+ goto create_child;
+ }
+
+create_msk:
+ new_msk = mptcp_sk_clone(listener->conn, req);
+ if (!new_msk)
+ subflow_req->mp_capable = 0;
+ } else if (subflow_req->mp_join) {
+ fallback_is_fatal = true;
+ opt_rx.mptcp.mp_join = 0;
+ mptcp_get_options(skb, &opt_rx);
+ if (!opt_rx.mptcp.mp_join ||
+ !subflow_hmac_valid(req, &opt_rx)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+ return NULL;
}
}
@@ -217,22 +409,45 @@ create_child:
if (child && *own_req) {
struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
- /* we have null ctx on TCP fallback, not fatal on MPC
- * handshake
+ /* we have null ctx on TCP fallback, which is fatal on
+ * MPJ handshake
*/
- if (!ctx)
- return child;
+ if (!ctx) {
+ if (fallback_is_fatal)
+ goto close_child;
+ goto out;
+ }
if (ctx->mp_capable) {
- if (mptcp_token_new_accept(ctx->token))
+ /* new mpc subflow takes ownership of the newly
+ * created mptcp socket
+ */
+ inet_sk_state_store(new_msk, TCP_ESTABLISHED);
+ mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
+ ctx->conn = new_msk;
+ new_msk = NULL;
+ } else if (ctx->mp_join) {
+ struct mptcp_sock *owner;
+
+ owner = mptcp_token_get_sock(ctx->token);
+ if (!owner)
goto close_child;
+
+ ctx->conn = (struct sock *)owner;
+ if (!mptcp_finish_join(child))
+ goto close_child;
+
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
}
}
+out:
+ /* dispose of the left over mptcp master, if any */
+ if (unlikely(new_msk))
+ sock_put(new_msk);
return child;
close_child:
- pr_debug("closing child socket");
tcp_send_active_reset(child, GFP_ATOMIC);
inet_csk_prepare_forced_close(child);
tcp_done(child);
@@ -338,6 +553,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
data_len = mpext->data_len;
if (data_len == 0) {
pr_err("Infinite mapping not handled");
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
return MAPPING_INVALID;
}
@@ -381,8 +597,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
/* If this skb data are fully covered by the current mapping,
* the new map would need caching, which is not supported
*/
- if (skb_is_fully_mapped(ssk, skb))
+ if (skb_is_fully_mapped(ssk, skb)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
return MAPPING_INVALID;
+ }
/* will validate the next map after consuming the current one */
return MAPPING_OK;
@@ -408,6 +626,18 @@ validate_seq:
return MAPPING_OK;
}
+static int subflow_read_actor(read_descriptor_t *desc,
+ struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ size_t copy_len = min(desc->count, len);
+
+ desc->count -= copy_len;
+
+ pr_debug("flushed %zu bytes, %zu left", copy_len, desc->count);
+ return copy_len;
+}
+
static bool subflow_check_data_avail(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@@ -420,9 +650,6 @@ static bool subflow_check_data_avail(struct sock *ssk)
if (subflow->data_avail)
return true;
- if (!subflow->conn)
- return false;
-
msk = mptcp_sk(subflow->conn);
for (;;) {
u32 map_remaining;
@@ -482,16 +709,12 @@ static bool subflow_check_data_avail(struct sock *ssk)
pr_debug("discarding %zu bytes, current map len=%d", delta,
map_remaining);
if (delta) {
- struct mptcp_read_arg arg = {
- .msg = NULL,
- };
read_descriptor_t desc = {
.count = delta,
- .arg.data = &arg,
};
int ret;
- ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
+ ret = tcp_read_sock(ssk, &desc, subflow_read_actor);
if (ret < 0) {
ssk->sk_err = -ret;
goto fatal;
@@ -546,19 +769,15 @@ static void subflow_data_ready(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
- if (!parent || !subflow->mp_capable) {
+ if (!subflow->mp_capable && !subflow->mp_join) {
subflow->tcp_data_ready(sk);
- if (parent)
- parent->sk_data_ready(parent);
+ parent->sk_data_ready(parent);
return;
}
- if (mptcp_subflow_data_available(sk)) {
- set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
-
- parent->sk_data_ready(parent);
- }
+ if (mptcp_subflow_data_available(sk))
+ mptcp_data_ready(parent, sk);
}
static void subflow_write_space(struct sock *sk)
@@ -567,7 +786,7 @@ static void subflow_write_space(struct sock *sk)
struct sock *parent = subflow->conn;
sk_stream_write_space(sk);
- if (parent && sk_stream_is_writeable(sk)) {
+ if (sk_stream_is_writeable(sk)) {
set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
smp_mb__after_atomic();
/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
@@ -605,6 +824,85 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
}
#endif
+static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr)
+{
+ memset(addr, 0, sizeof(*addr));
+ addr->ss_family = info->family;
+ if (addr->ss_family == AF_INET) {
+ struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
+
+ in_addr->sin_addr = info->addr;
+ in_addr->sin_port = info->port;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
+
+ in6_addr->sin6_addr = info->addr6;
+ in6_addr->sin6_port = info->port;
+ }
+#endif
+}
+
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *remote)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+ struct sockaddr_storage addr;
+ struct socket *sf;
+ u32 remote_token;
+ int addrlen;
+ int err;
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ err = mptcp_subflow_create_socket(sk, &sf);
+ if (err)
+ return err;
+
+ subflow = mptcp_subflow_ctx(sf->sk);
+ subflow->remote_key = msk->remote_key;
+ subflow->local_key = msk->local_key;
+ subflow->token = msk->token;
+ mptcp_info2sockaddr(loc, &addr);
+
+ addrlen = sizeof(struct sockaddr_in);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (loc->family == AF_INET6)
+ addrlen = sizeof(struct sockaddr_in6);
+#endif
+ sf->sk->sk_bound_dev_if = ifindex;
+ err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
+ if (err)
+ goto failed;
+
+ mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
+ pr_debug("msk=%p remote_token=%u", msk, remote_token);
+ subflow->remote_token = remote_token;
+ subflow->local_id = loc->id;
+ subflow->request_join = 1;
+ subflow->request_bkup = 1;
+ mptcp_info2sockaddr(remote, &addr);
+
+ err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
+ if (err && err != -EINPROGRESS)
+ goto failed;
+
+ spin_lock_bh(&msk->join_list_lock);
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+
+ return err;
+
+failed:
+ sock_release(sf);
+ return err;
+}
+
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
struct mptcp_subflow_context *subflow;
@@ -682,7 +980,7 @@ static bool subflow_is_done(const struct sock *sk)
static void subflow_state_change(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- struct sock *parent = READ_ONCE(subflow->conn);
+ struct sock *parent = subflow->conn;
__subflow_state_change(sk);
@@ -690,13 +988,10 @@ static void subflow_state_change(struct sock *sk)
* a fin packet carrying a DSS can be unnoticed if we don't trigger
* the data available machinery here.
*/
- if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) {
- set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
-
- parent->sk_data_ready(parent);
- }
+ if (subflow->mp_capable && mptcp_subflow_data_available(sk))
+ mptcp_data_ready(parent, sk);
- if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
+ if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
parent->sk_shutdown |= RCV_SHUTDOWN;
@@ -772,7 +1067,8 @@ static void subflow_ulp_clone(const struct request_sock *req,
struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
struct mptcp_subflow_context *new_ctx;
- if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) {
+ if (!tcp_rsk(req)->is_mptcp ||
+ (!subflow_req->mp_capable && !subflow_req->mp_join)) {
subflow_ulp_fallback(newsk, old_ctx);
return;
}
@@ -783,22 +1079,35 @@ static void subflow_ulp_clone(const struct request_sock *req,
return;
}
- /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully
- * established only after we receive the remote key
- */
new_ctx->conn_finished = 1;
new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
new_ctx->tcp_state_change = old_ctx->tcp_state_change;
new_ctx->tcp_write_space = old_ctx->tcp_write_space;
- new_ctx->mp_capable = 1;
- new_ctx->fourth_ack = subflow_req->remote_key_valid;
- new_ctx->can_ack = subflow_req->remote_key_valid;
- new_ctx->remote_key = subflow_req->remote_key;
- new_ctx->local_key = subflow_req->local_key;
- new_ctx->token = subflow_req->token;
- new_ctx->ssn_offset = subflow_req->ssn_offset;
- new_ctx->idsn = subflow_req->idsn;
+ new_ctx->rel_write_seq = 1;
+ new_ctx->tcp_sock = newsk;
+
+ if (subflow_req->mp_capable) {
+ /* see comments in subflow_syn_recv_sock(), MPTCP connection
+ * is fully established only after we receive the remote key
+ */
+ new_ctx->mp_capable = 1;
+ new_ctx->fully_established = subflow_req->remote_key_valid;
+ new_ctx->can_ack = subflow_req->remote_key_valid;
+ new_ctx->remote_key = subflow_req->remote_key;
+ new_ctx->local_key = subflow_req->local_key;
+ new_ctx->token = subflow_req->token;
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->idsn = subflow_req->idsn;
+ } else if (subflow_req->mp_join) {
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->mp_join = 1;
+ new_ctx->fully_established = 1;
+ new_ctx->backup = subflow_req->backup;
+ new_ctx->local_id = subflow_req->local_id;
+ new_ctx->token = subflow_req->token;
+ new_ctx->thmac = subflow_req->thmac;
+ }
}
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
@@ -860,6 +1169,8 @@ void mptcp_subflow_init(void)
subflow_v6m_specific.net_frag_header_len = 0;
#endif
+ mptcp_diag_subflow_init(&subflow_ulp_ops);
+
if (tcp_register_ulp(&subflow_ulp_ops) != 0)
panic("MPTCP: failed to register subflows to ULP\n");
}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 84d887806090..129a5ad1bc35 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -128,43 +128,43 @@ int mptcp_token_new_connect(struct sock *sk)
*
* Called when a SYN packet creates a new logical connection, i.e.
* is not a join request.
- *
- * We don't have an mptcp socket yet at that point.
- * This is paired with mptcp_token_update_accept, called on accept().
*/
-int mptcp_token_new_accept(u32 token)
+int mptcp_token_new_accept(u32 token, struct sock *conn)
{
int err;
spin_lock_bh(&token_tree_lock);
- err = radix_tree_insert(&token_tree, token, &token_used);
+ err = radix_tree_insert(&token_tree, token, conn);
spin_unlock_bh(&token_tree_lock);
return err;
}
/**
- * mptcp_token_update_accept - update token to map to mptcp socket
- * @conn: the new struct mptcp_sock
- * @sk: the initial subflow for this mptcp socket
+ * mptcp_token_get_sock - retrieve mptcp connection sock using its token
+ * @token: token of the mptcp connection to retrieve
+ *
+ * This function returns the mptcp connection structure with the given token.
+ * A reference count on the mptcp socket returned is taken.
*
- * Called when the first mptcp socket is created on accept to
- * refresh the dummy mapping (done to reserve the token) with
- * the mptcp_socket structure that wasn't allocated before.
+ * returns NULL if no connection with the given token value exists.
*/
-void mptcp_token_update_accept(struct sock *sk, struct sock *conn)
+struct mptcp_sock *mptcp_token_get_sock(u32 token)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- void __rcu **slot;
+ struct sock *conn;
spin_lock_bh(&token_tree_lock);
- slot = radix_tree_lookup_slot(&token_tree, subflow->token);
- WARN_ON_ONCE(!slot);
- if (slot) {
- WARN_ON_ONCE(rcu_access_pointer(*slot) != &token_used);
- radix_tree_replace_slot(&token_tree, slot, conn);
+ conn = radix_tree_lookup(&token_tree, token);
+ if (conn) {
+ /* token still reserved? */
+ if (conn == (struct sock *)&token_used)
+ conn = NULL;
+ else
+ sock_hold(conn);
}
spin_unlock_bh(&token_tree_lock);
+
+ return mptcp_sk(conn);
}
/**
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 91efae88e8c2..468fea1aebba 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -455,14 +455,6 @@ config NF_TABLES
To compile it as a module, choose M here.
if NF_TABLES
-
-config NF_TABLES_SET
- tristate "Netfilter nf_tables set infrastructure"
- help
- This option enables the nf_tables set infrastructure that allows to
- look up for elements in a set and to build one-way mappings between
- matchings and actions.
-
config NF_TABLES_INET
depends on IPV6
select NF_TABLES_IPV4
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 3f572e5a975e..292e71dc7ba4 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -78,14 +78,17 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \
- nft_chain_route.o nf_tables_offload.o
+ nft_chain_route.o nf_tables_offload.o \
+ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
+ nft_set_pipapo.o
-nf_tables_set-objs := nf_tables_set_core.o \
- nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
- nft_set_pipapo.o
+ifdef CONFIG_X86_64
+ifneq (,$(findstring -DCONFIG_AS_AVX2=1,$(KBUILD_CFLAGS)))
+nf_tables-objs += nft_set_pipapo_avx2.o
+endif
+endif
obj-$(CONFIG_NF_TABLES) += nf_tables.o
-obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 0a2196f59106..486959f70cf3 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -46,7 +46,7 @@ struct bitmap_ip {
u8 netmask; /* subnet netmask */
struct timer_list gc; /* garbage collection */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* data extensions */
+ unsigned char extensions[] /* data extensions */
__aligned(__alignof__(u64));
};
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 739e343efaf6..2310a316e0af 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -49,7 +49,7 @@ struct bitmap_ipmac {
size_t memsize; /* members size */
struct timer_list gc; /* garbage collector */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* MAC + data extensions */
+ unsigned char extensions[] /* MAC + data extensions */
__aligned(__alignof__(u64));
};
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index b49978dd810d..e56ced66f202 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -37,7 +37,7 @@ struct bitmap_port {
size_t memsize; /* members size */
struct timer_list gc; /* garbage collection */
struct ip_set *set; /* attached to this ip_set */
- unsigned char extensions[0] /* data extensions */
+ unsigned char extensions[] /* data extensions */
__aligned(__alignof__(u64));
};
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 69c107f9ba8d..8dd17589217d 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -723,6 +723,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index)
return set;
}
+static inline void
+ip_set_lock(struct ip_set *set)
+{
+ if (!set->variant->region_lock)
+ spin_lock_bh(&set->lock);
+}
+
+static inline void
+ip_set_unlock(struct ip_set *set)
+{
+ if (!set->variant->region_lock)
+ spin_unlock_bh(&set->lock);
+}
+
int
ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par, struct ip_set_adt_opt *opt)
@@ -744,9 +758,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
if (ret == -EAGAIN) {
/* Type requests element to be completed */
pr_debug("element must be completed, ADD is triggered\n");
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
ret = 1;
} else {
/* --return-nomatch: invert matched element */
@@ -775,9 +789,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
return ret;
}
@@ -797,9 +811,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
!(opt->family == set->family || set->family == NFPROTO_UNSPEC))
return -IPSET_ERR_TYPE_MISMATCH;
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
return ret;
}
@@ -1264,9 +1278,9 @@ ip_set_flush_set(struct ip_set *set)
{
pr_debug("set: %s\n", set->name);
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
set->variant->flush(set);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
}
static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
@@ -1713,9 +1727,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
do {
- spin_lock_bh(&set->lock);
+ ip_set_lock(set);
ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
- spin_unlock_bh(&set->lock);
+ ip_set_unlock(set);
retried = true;
} while (ret == -EAGAIN &&
set->variant->resize &&
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7480ce55b5c8..1ee43752d6d3 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -7,13 +7,21 @@
#include <linux/rcupdate.h>
#include <linux/jhash.h>
#include <linux/types.h>
+#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/ipset/ip_set.h>
-#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c)
-#define ipset_dereference_protected(p, set) \
- __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
-
-#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
+#define __ipset_dereference(p) \
+ rcu_dereference_protected(p, 1)
+#define ipset_dereference_nfnl(p) \
+ rcu_dereference_protected(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
+#define ipset_dereference_set(p, set) \
+ rcu_dereference_protected(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
+ lockdep_is_held(&(set)->lock))
+#define ipset_dereference_bh_nfnl(p) \
+ rcu_dereference_bh_check(p, \
+ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
/* Hashing which uses arrays to resolve clashing. The hash table is resized
* (doubled) when searching becomes too long.
@@ -68,16 +76,40 @@ struct hbucket {
DECLARE_BITMAP(used, AHASH_MAX_TUNED);
u8 size; /* size of the array */
u8 pos; /* position of the first free entry */
- unsigned char value[0] /* the array of the values */
+ unsigned char value[] /* the array of the values */
__aligned(__alignof__(u64));
};
+/* Region size for locking == 2^HTABLE_REGION_BITS */
+#define HTABLE_REGION_BITS 10
+#define ahash_numof_locks(htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? 1 \
+ : jhash_size((htable_bits) - HTABLE_REGION_BITS))
+#define ahash_sizeof_regions(htable_bits) \
+ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
+#define ahash_region(n, htable_bits) \
+ ((n) % ahash_numof_locks(htable_bits))
+#define ahash_bucket_start(h, htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? 0 \
+ : (h) * jhash_size(HTABLE_REGION_BITS))
+#define ahash_bucket_end(h, htable_bits) \
+ ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits) \
+ : ((h) + 1) * jhash_size(HTABLE_REGION_BITS))
+
+struct htable_gc {
+ struct delayed_work dwork;
+ struct ip_set *set; /* Set the gc belongs to */
+ u32 region; /* Last gc run position */
+};
+
/* The hash table: the table size stored here in order to make resizing easy */
struct htable {
atomic_t ref; /* References for resizing */
- atomic_t uref; /* References for dumping */
+ atomic_t uref; /* References for dumping and gc */
u8 htable_bits; /* size of hash table == 2^htable_bits */
- struct hbucket __rcu *bucket[0]; /* hashtable buckets */
+ u32 maxelem; /* Maxelem per region */
+ struct ip_set_region *hregion; /* Region locks and ext sizes */
+ struct hbucket __rcu *bucket[]; /* hashtable buckets */
};
#define hbucket(h, i) ((h)->bucket[i])
@@ -162,6 +194,10 @@ htable_bits(u32 hashsize)
#define NLEN 0
#endif /* IP_SET_HASH_WITH_NETS */
+#define SET_ELEM_EXPIRED(set, d) \
+ (SET_WITH_TIMEOUT(set) && \
+ ip_set_timeout_expired(ext_timeout(d, set)))
+
#endif /* _IP_SET_HASH_GEN_H */
#ifndef MTYPE
@@ -205,10 +241,12 @@ htable_bits(u32 hashsize)
#undef mtype_test_cidrs
#undef mtype_test
#undef mtype_uref
-#undef mtype_expire
#undef mtype_resize
+#undef mtype_ext_size
+#undef mtype_resize_ad
#undef mtype_head
#undef mtype_list
+#undef mtype_gc_do
#undef mtype_gc
#undef mtype_gc_init
#undef mtype_variant
@@ -247,10 +285,12 @@ htable_bits(u32 hashsize)
#define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs)
#define mtype_test IPSET_TOKEN(MTYPE, _test)
#define mtype_uref IPSET_TOKEN(MTYPE, _uref)
-#define mtype_expire IPSET_TOKEN(MTYPE, _expire)
#define mtype_resize IPSET_TOKEN(MTYPE, _resize)
+#define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size)
+#define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad)
#define mtype_head IPSET_TOKEN(MTYPE, _head)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
@@ -275,8 +315,7 @@ htable_bits(u32 hashsize)
/* The generic hash structure */
struct htype {
struct htable __rcu *table; /* the hash table */
- struct timer_list gc; /* garbage collection when timeout enabled */
- struct ip_set *set; /* attached to this ip_set */
+ struct htable_gc gc; /* gc workqueue */
u32 maxelem; /* max elements in the hash */
u32 initval; /* random jhash init value */
#ifdef IP_SET_HASH_WITH_MARKMASK
@@ -288,21 +327,33 @@ struct htype {
#ifdef IP_SET_HASH_WITH_NETMASK
u8 netmask; /* netmask value for subnets to store */
#endif
+ struct list_head ad; /* Resize add|del backlist */
struct mtype_elem next; /* temporary storage for uadd */
#ifdef IP_SET_HASH_WITH_NETS
struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
#endif
};
+/* ADD|DEL entries saved during resize */
+struct mtype_resize_ad {
+ struct list_head list;
+ enum ipset_adt ad; /* ADD|DEL element */
+ struct mtype_elem d; /* Element value */
+ struct ip_set_ext ext; /* Extensions for ADD */
+ struct ip_set_ext mext; /* Target extensions for ADD */
+ u32 flags; /* Flags for ADD */
+};
+
#ifdef IP_SET_HASH_WITH_NETS
/* Network cidr size book keeping when the hash stores different
* sized networks. cidr == real cidr + 1 to support /0.
*/
static void
-mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
{
int i, j;
+ spin_lock_bh(&set->lock);
/* Add in increasing prefix order, so larger cidr first */
for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
if (j != -1) {
@@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
j = i;
} else if (h->nets[i].cidr[n] == cidr) {
h->nets[CIDR_POS(cidr)].nets[n]++;
- return;
+ goto unlock;
}
}
if (j != -1) {
@@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
}
h->nets[i].cidr[n] = cidr;
h->nets[CIDR_POS(cidr)].nets[n] = 1;
+unlock:
+ spin_unlock_bh(&set->lock);
}
static void
-mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
{
u8 i, j, net_end = NLEN - 1;
+ spin_lock_bh(&set->lock);
for (i = 0; i < NLEN; i++) {
if (h->nets[i].cidr[n] != cidr)
continue;
h->nets[CIDR_POS(cidr)].nets[n]--;
if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
- return;
+ goto unlock;
for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
h->nets[j].cidr[n] = 0;
- return;
+ goto unlock;
}
+unlock:
+ spin_unlock_bh(&set->lock);
}
#endif
@@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
static size_t
mtype_ahash_memsize(const struct htype *h, const struct htable *t)
{
- return sizeof(*h) + sizeof(*t);
+ return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits);
}
/* Get the ith element from the array block n */
@@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set)
struct htype *h = set->data;
struct htable *t;
struct hbucket *n;
- u32 i;
-
- t = ipset_dereference_protected(h->table, set);
- for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
- if (!n)
- continue;
- if (set->extensions & IPSET_EXT_DESTROY)
- mtype_ext_cleanup(set, n);
- /* FIXME: use slab cache */
- rcu_assign_pointer(hbucket(t, i), NULL);
- kfree_rcu(n, rcu);
+ u32 r, i;
+
+ t = ipset_dereference_nfnl(h->table);
+ for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+ spin_lock_bh(&t->hregion[r].lock);
+ for (i = ahash_bucket_start(r, t->htable_bits);
+ i < ahash_bucket_end(r, t->htable_bits); i++) {
+ n = __ipset_dereference(hbucket(t, i));
+ if (!n)
+ continue;
+ if (set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set, n);
+ /* FIXME: use slab cache */
+ rcu_assign_pointer(hbucket(t, i), NULL);
+ kfree_rcu(n, rcu);
+ }
+ t->hregion[r].ext_size = 0;
+ t->hregion[r].elements = 0;
+ spin_unlock_bh(&t->hregion[r].lock);
}
#ifdef IP_SET_HASH_WITH_NETS
memset(h->nets, 0, sizeof(h->nets));
#endif
- set->elements = 0;
- set->ext_size = 0;
}
/* Destroy the hashtable part of the set */
@@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
u32 i;
for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
+ n = __ipset_dereference(hbucket(t, i));
if (!n)
continue;
if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
@@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
kfree(n);
}
+ ip_set_free(t->hregion);
ip_set_free(t);
}
@@ -414,28 +476,21 @@ static void
mtype_destroy(struct ip_set *set)
{
struct htype *h = set->data;
+ struct list_head *l, *lt;
if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&h->gc);
+ cancel_delayed_work_sync(&h->gc.dwork);
- mtype_ahash_destroy(set,
- __ipset_dereference_protected(h->table, 1), true);
+ mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
+ list_for_each_safe(l, lt, &h->ad) {
+ list_del(l);
+ kfree(l);
+ }
kfree(h);
set->data = NULL;
}
-static void
-mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t))
-{
- struct htype *h = set->data;
-
- timer_setup(&h->gc, gc, 0);
- mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
- pr_debug("gc initialized, run in every %u\n",
- IPSET_GC_PERIOD(set->timeout));
-}
-
static bool
mtype_same_set(const struct ip_set *a, const struct ip_set *b)
{
@@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions;
}
-/* Delete expired elements from the hashtable */
static void
-mtype_expire(struct ip_set *set, struct htype *h)
+mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
{
- struct htable *t;
struct hbucket *n, *tmp;
struct mtype_elem *data;
u32 i, j, d;
@@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct htype *h)
#ifdef IP_SET_HASH_WITH_NETS
u8 k;
#endif
+ u8 htable_bits = t->htable_bits;
- t = ipset_dereference_protected(h->table, set);
- for (i = 0; i < jhash_size(t->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(t, i), 1);
+ spin_lock_bh(&t->hregion[r].lock);
+ for (i = ahash_bucket_start(r, htable_bits);
+ i < ahash_bucket_end(r, htable_bits); i++) {
+ n = __ipset_dereference(hbucket(t, i));
if (!n)
continue;
for (j = 0, d = 0; j < n->pos; j++) {
@@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct htype *h)
smp_mb__after_atomic();
#ifdef IP_SET_HASH_WITH_NETS
for (k = 0; k < IPSET_NET_COUNT; k++)
- mtype_del_cidr(h,
+ mtype_del_cidr(set, h,
NCIDR_PUT(DCIDR_GET(data->cidr, k)),
k);
#endif
+ t->hregion[r].elements--;
ip_set_ext_destroy(set, data);
- set->elements--;
d++;
}
if (d >= AHASH_INIT_SIZE) {
if (d >= n->size) {
+ t->hregion[r].ext_size -=
+ ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, i), NULL);
kfree_rcu(n, rcu);
continue;
}
tmp = kzalloc(sizeof(*tmp) +
- (n->size - AHASH_INIT_SIZE) * dsize,
- GFP_ATOMIC);
+ (n->size - AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
if (!tmp)
- /* Still try to delete expired elements */
+ /* Still try to delete expired elements. */
continue;
tmp->size = n->size - AHASH_INIT_SIZE;
for (j = 0, d = 0; j < n->pos; j++) {
if (!test_bit(j, n->used))
continue;
data = ahash_data(n, j, dsize);
- memcpy(tmp->value + d * dsize, data, dsize);
+ memcpy(tmp->value + d * dsize,
+ data, dsize);
set_bit(d, tmp->used);
d++;
}
tmp->pos = d;
- set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+ t->hregion[r].ext_size -=
+ ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, i), tmp);
kfree_rcu(n, rcu);
}
}
+ spin_unlock_bh(&t->hregion[r].lock);
}
/* Garbage collector entry point. After this change it is a delayed-work
 * handler: every run expires the entries of a single region of the current
 * table via mtype_gc_do() and then queues itself again.
 */
static void
-mtype_gc(struct timer_list *t)
+mtype_gc(struct work_struct *work)
{
- struct htype *h = from_timer(h, t, gc);
- struct ip_set *set = h->set;
+ struct htable_gc *gc;
+ struct ip_set *set;
+ struct htype *h;
+ struct htable *t;
+ u32 r, numof_locks;
+ unsigned int next_run;
+
+ gc = container_of(work, struct htable_gc, dwork.work);
+ set = gc->set;
+ h = set->data;
- pr_debug("called\n");
spin_lock_bh(&set->lock);
- mtype_expire(set, h);
+ t = ipset_dereference_set(h->table, set);
+ atomic_inc(&t->uref); /* Take a user reference on t, dropped after mtype_gc_do() */
+ numof_locks = ahash_numof_locks(t->htable_bits);
+ r = gc->region++; /* Region to clean now; the cursor moves on for the next run */
+ if (r >= numof_locks) {
+ r = gc->region = 0; /* Cursor ran past the last region: start again at region 0 */
+ }
+ next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks; /* Split the gc period between the regions */
+ if (next_run < HZ/10)
+ next_run = HZ/10; /* Never requeue sooner than HZ/10 jiffies */
spin_unlock_bh(&set->lock);
- h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
- add_timer(&h->gc);
+ mtype_gc_do(set, h, t, r);
+
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { /* Last user of a table that mtype_resize() marked with t->ref */
+ pr_debug("Table destroy after resize by expire: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
+
+ queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run);
+
+}
+
+static void
+mtype_gc_init(struct htable_gc *gc)
+{
+ INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc); /* mtype_gc() becomes the handler of the deferrable gc work */
+ queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); /* First gc run after HZ jiffies; mtype_gc() requeues itself afterwards */
+}
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags); /* Declared early: mtype_resize() below calls it for saved ADD entries */
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags); /* Declared early: mtype_resize() below calls it for saved DEL entries */
+
/* Resize a hash: create a new hash table with doubling the hashsize
* and inserting the elements to it. Repeat until we succeed or
* fail due to memory pressures.
@@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool retried)
struct htype *h = set->data;
struct htable *t, *orig;
u8 htable_bits;
- size_t extsize, dsize = set->dsize;
+ size_t dsize = set->dsize;
#ifdef IP_SET_HASH_WITH_NETS
u8 flags;
struct mtype_elem *tmp;
@@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool retried)
struct mtype_elem *data;
struct mtype_elem *d;
struct hbucket *n, *m;
- u32 i, j, key;
+ struct list_head *l, *lt;
+ struct mtype_resize_ad *x;
+ u32 i, j, r, nr, key;
int ret;
#ifdef IP_SET_HASH_WITH_NETS
@@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool retried)
if (!tmp)
return -ENOMEM;
#endif
- rcu_read_lock_bh();
- orig = rcu_dereference_bh_nfnl(h->table);
+ orig = ipset_dereference_bh_nfnl(h->table);
htable_bits = orig->htable_bits;
- rcu_read_unlock_bh();
retry:
ret = 0;
@@ -583,88 +680,124 @@ retry:
ret = -ENOMEM;
goto out;
}
+ t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
+ if (!t->hregion) {
+ kfree(t);
+ ret = -ENOMEM;
+ goto out;
+ }
t->htable_bits = htable_bits;
+ t->maxelem = h->maxelem / ahash_numof_locks(htable_bits);
+ for (i = 0; i < ahash_numof_locks(htable_bits); i++)
+ spin_lock_init(&t->hregion[i].lock);
- spin_lock_bh(&set->lock);
- orig = __ipset_dereference_protected(h->table, 1);
- /* There can't be another parallel resizing, but dumping is possible */
+ /* There can't be another parallel resizing,
+ * but dumping, gc, kernel side add/del are possible
+ */
+ orig = ipset_dereference_bh_nfnl(h->table);
atomic_set(&orig->ref, 1);
atomic_inc(&orig->uref);
- extsize = 0;
pr_debug("attempt to resize set %s from %u to %u, t %p\n",
set->name, orig->htable_bits, htable_bits, orig);
- for (i = 0; i < jhash_size(orig->htable_bits); i++) {
- n = __ipset_dereference_protected(hbucket(orig, i), 1);
- if (!n)
- continue;
- for (j = 0; j < n->pos; j++) {
- if (!test_bit(j, n->used))
+ for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) {
+ /* Expire may replace a hbucket with another one */
+ rcu_read_lock_bh();
+ for (i = ahash_bucket_start(r, orig->htable_bits);
+ i < ahash_bucket_end(r, orig->htable_bits); i++) {
+ n = __ipset_dereference(hbucket(orig, i));
+ if (!n)
continue;
- data = ahash_data(n, j, dsize);
+ for (j = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue;
+ data = ahash_data(n, j, dsize);
+ if (SET_ELEM_EXPIRED(set, data))
+ continue;
#ifdef IP_SET_HASH_WITH_NETS
- /* We have readers running parallel with us,
- * so the live data cannot be modified.
- */
- flags = 0;
- memcpy(tmp, data, dsize);
- data = tmp;
- mtype_data_reset_flags(data, &flags);
+ /* We have readers running parallel with us,
+ * so the live data cannot be modified.
+ */
+ flags = 0;
+ memcpy(tmp, data, dsize);
+ data = tmp;
+ mtype_data_reset_flags(data, &flags);
#endif
- key = HKEY(data, h->initval, htable_bits);
- m = __ipset_dereference_protected(hbucket(t, key), 1);
- if (!m) {
- m = kzalloc(sizeof(*m) +
+ key = HKEY(data, h->initval, htable_bits);
+ m = __ipset_dereference(hbucket(t, key));
+ nr = ahash_region(key, htable_bits);
+ if (!m) {
+ m = kzalloc(sizeof(*m) +
AHASH_INIT_SIZE * dsize,
GFP_ATOMIC);
- if (!m) {
- ret = -ENOMEM;
- goto cleanup;
- }
- m->size = AHASH_INIT_SIZE;
- extsize += ext_size(AHASH_INIT_SIZE, dsize);
- RCU_INIT_POINTER(hbucket(t, key), m);
- } else if (m->pos >= m->size) {
- struct hbucket *ht;
-
- if (m->size >= AHASH_MAX(h)) {
- ret = -EAGAIN;
- } else {
- ht = kzalloc(sizeof(*ht) +
+ if (!m) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+ m->size = AHASH_INIT_SIZE;
+ t->hregion[nr].ext_size +=
+ ext_size(AHASH_INIT_SIZE,
+ dsize);
+ RCU_INIT_POINTER(hbucket(t, key), m);
+ } else if (m->pos >= m->size) {
+ struct hbucket *ht;
+
+ if (m->size >= AHASH_MAX(h)) {
+ ret = -EAGAIN;
+ } else {
+ ht = kzalloc(sizeof(*ht) +
(m->size + AHASH_INIT_SIZE)
* dsize,
GFP_ATOMIC);
- if (!ht)
- ret = -ENOMEM;
+ if (!ht)
+ ret = -ENOMEM;
+ }
+ if (ret < 0)
+ goto cleanup;
+ memcpy(ht, m, sizeof(struct hbucket) +
+ m->size * dsize);
+ ht->size = m->size + AHASH_INIT_SIZE;
+ t->hregion[nr].ext_size +=
+ ext_size(AHASH_INIT_SIZE,
+ dsize);
+ kfree(m);
+ m = ht;
+ RCU_INIT_POINTER(hbucket(t, key), ht);
}
- if (ret < 0)
- goto cleanup;
- memcpy(ht, m, sizeof(struct hbucket) +
- m->size * dsize);
- ht->size = m->size + AHASH_INIT_SIZE;
- extsize += ext_size(AHASH_INIT_SIZE, dsize);
- kfree(m);
- m = ht;
- RCU_INIT_POINTER(hbucket(t, key), ht);
- }
- d = ahash_data(m, m->pos, dsize);
- memcpy(d, data, dsize);
- set_bit(m->pos++, m->used);
+ d = ahash_data(m, m->pos, dsize);
+ memcpy(d, data, dsize);
+ set_bit(m->pos++, m->used);
+ t->hregion[nr].elements++;
#ifdef IP_SET_HASH_WITH_NETS
- mtype_data_reset_flags(d, &flags);
+ mtype_data_reset_flags(d, &flags);
#endif
+ }
}
+ rcu_read_unlock_bh();
}
- rcu_assign_pointer(h->table, t);
- set->ext_size = extsize;
- spin_unlock_bh(&set->lock);
+ /* There can't be any other writer. */
+ rcu_assign_pointer(h->table, t);
/* Give time to other readers of the set */
synchronize_rcu();
pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
orig->htable_bits, orig, t->htable_bits, t);
- /* If there's nobody else dumping the table, destroy it */
+ /* Add/delete elements processed by the SET target during resize.
+ * Kernel-side add cannot trigger a resize and userspace actions
+ * are serialized by the mutex.
+ */
+ list_for_each_safe(l, lt, &h->ad) {
+ x = list_entry(l, struct mtype_resize_ad, list);
+ if (x->ad == IPSET_ADD) {
+ mtype_add(set, &x->d, &x->ext, &x->mext, x->flags);
+ } else {
+ mtype_del(set, &x->d, NULL, NULL, 0);
+ }
+ list_del(l);
+ kfree(l);
+ }
+ /* If there's nobody else using the table, destroy it */
if (atomic_dec_and_test(&orig->uref)) {
pr_debug("Table destroy by resize %p\n", orig);
mtype_ahash_destroy(set, orig, false);
@@ -677,15 +810,44 @@ out:
return ret;
cleanup:
+ rcu_read_unlock_bh();
atomic_set(&orig->ref, 0);
atomic_dec(&orig->uref);
- spin_unlock_bh(&set->lock);
mtype_ahash_destroy(set, t, false);
if (ret == -EAGAIN)
goto retry;
goto out;
}
+/* Get the current number of elements and ext_size in the set.
+ * The results are accumulated into *elements and *ext_size, so the
+ * caller has to initialize both (mtype_head() passes zeroed variables).
+ * Elements are counted by walking every bucket and skipping expired
+ * entries; ext_size is the sum of the per-region ext_size counters.
+ * h->table is read with rcu_dereference_bh(); mtype_head() calls this
+ * function between rcu_read_lock_bh() and rcu_read_unlock_bh().
+ */
+static void
+mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
+{
+ struct htype *h = set->data;
+ const struct htable *t;
+ u32 i, j, r;
+ struct hbucket *n;
+ struct mtype_elem *data;
+
+ t = rcu_dereference_bh(h->table);
+ for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+ for (i = ahash_bucket_start(r, t->htable_bits);
+ i < ahash_bucket_end(r, t->htable_bits); i++) {
+ n = rcu_dereference_bh(hbucket(t, i));
+ if (!n)
+ continue; /* Empty bucket */
+ for (j = 0; j < n->pos; j++) {
+ if (!test_bit(j, n->used))
+ continue; /* Slot is not in use */
+ data = ahash_data(n, j, set->dsize);
+ if (!SET_ELEM_EXPIRED(set, data))
+ (*elements)++; /* Only entries that are not expired are counted */
+ }
+ }
+ *ext_size += t->hregion[r].ext_size; /* Taken from the region counter, region locks are not held here */
+ }
+}
+
/* Add an element to a hash and update the internal counters when succeeded,
* otherwise report the proper error code.
*/
@@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
const struct mtype_elem *d = value;
struct mtype_elem *data;
struct hbucket *n, *old = ERR_PTR(-ENOENT);
- int i, j = -1;
+ int i, j = -1, ret;
bool flag_exist = flags & IPSET_FLAG_EXIST;
bool deleted = false, forceadd = false, reuse = false;
- u32 key, multi = 0;
+ u32 r, key, multi = 0, elements, maxelem;
- if (set->elements >= h->maxelem) {
- if (SET_WITH_TIMEOUT(set))
- /* FIXME: when set is full, we slow down here */
- mtype_expire(set, h);
- if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ r = ahash_region(key, t->htable_bits);
+ atomic_inc(&t->uref);
+ elements = t->hregion[r].elements;
+ maxelem = t->maxelem;
+ if (elements >= maxelem) {
+ u32 e;
+ if (SET_WITH_TIMEOUT(set)) {
+ rcu_read_unlock_bh();
+ mtype_gc_do(set, h, t, r);
+ rcu_read_lock_bh();
+ }
+ maxelem = h->maxelem;
+ elements = 0;
+ for (e = 0; e < ahash_numof_locks(t->htable_bits); e++)
+ elements += t->hregion[e].elements;
+ if (elements >= maxelem && SET_WITH_FORCEADD(set))
forceadd = true;
}
+ rcu_read_unlock_bh();
- t = ipset_dereference_protected(h->table, set);
- key = HKEY(value, h->initval, t->htable_bits);
- n = __ipset_dereference_protected(hbucket(t, key), 1);
+ spin_lock_bh(&t->hregion[r].lock);
+ n = rcu_dereference_bh(hbucket(t, key));
if (!n) {
- if (forceadd || set->elements >= h->maxelem)
+ if (forceadd || elements >= maxelem)
goto set_full;
old = NULL;
n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
GFP_ATOMIC);
- if (!n)
- return -ENOMEM;
+ if (!n) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
n->size = AHASH_INIT_SIZE;
- set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+ t->hregion[r].ext_size +=
+ ext_size(AHASH_INIT_SIZE, set->dsize);
goto copy_elem;
}
for (i = 0; i < n->pos; i++) {
@@ -737,38 +916,37 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
data = ahash_data(n, i, set->dsize);
if (mtype_data_equal(data, d, &multi)) {
- if (flag_exist ||
- (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)))) {
+ if (flag_exist || SET_ELEM_EXPIRED(set, data)) {
/* Just the extensions could be overwritten */
j = i;
goto overwrite_extensions;
}
- return -IPSET_ERR_EXIST;
+ ret = -IPSET_ERR_EXIST;
+ goto unlock;
}
/* Reuse first timed out entry */
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)) &&
- j == -1) {
+ if (SET_ELEM_EXPIRED(set, data) && j == -1) {
j = i;
reuse = true;
}
}
if (reuse || forceadd) {
+ if (j == -1)
+ j = 0;
data = ahash_data(n, j, set->dsize);
if (!deleted) {
#ifdef IP_SET_HASH_WITH_NETS
for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_del_cidr(h,
+ mtype_del_cidr(set, h,
NCIDR_PUT(DCIDR_GET(data->cidr, i)),
i);
#endif
ip_set_ext_destroy(set, data);
- set->elements--;
+ t->hregion[r].elements--;
}
goto copy_data;
}
- if (set->elements >= h->maxelem)
+ if (elements >= maxelem)
goto set_full;
/* Create a new slot */
if (n->pos >= n->size) {
@@ -776,28 +954,32 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (n->size >= AHASH_MAX(h)) {
/* Trigger rehashing */
mtype_data_next(&h->next, d);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto resize;
}
old = n;
n = kzalloc(sizeof(*n) +
(old->size + AHASH_INIT_SIZE) * set->dsize,
GFP_ATOMIC);
- if (!n)
- return -ENOMEM;
+ if (!n) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
memcpy(n, old, sizeof(struct hbucket) +
old->size * set->dsize);
n->size = old->size + AHASH_INIT_SIZE;
- set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+ t->hregion[r].ext_size +=
+ ext_size(AHASH_INIT_SIZE, set->dsize);
}
copy_elem:
j = n->pos++;
data = ahash_data(n, j, set->dsize);
copy_data:
- set->elements++;
+ t->hregion[r].elements++;
#ifdef IP_SET_HASH_WITH_NETS
for (i = 0; i < IPSET_NET_COUNT; i++)
- mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
+ mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
#endif
memcpy(data, d, sizeof(struct mtype_elem));
overwrite_extensions:
@@ -820,13 +1002,41 @@ overwrite_extensions:
if (old)
kfree_rcu(old, rcu);
}
+ ret = 0;
+resize:
+ spin_unlock_bh(&t->hregion[r].lock);
+ if (atomic_read(&t->ref) && ext->target) {
+ /* Resize is in process and kernel side add, save values */
+ struct mtype_resize_ad *x;
+
+ x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC);
+ if (!x)
+ /* Don't bother */
+ goto out;
+ x->ad = IPSET_ADD;
+ memcpy(&x->d, value, sizeof(struct mtype_elem));
+ memcpy(&x->ext, ext, sizeof(struct ip_set_ext));
+ memcpy(&x->mext, mext, sizeof(struct ip_set_ext));
+ x->flags = flags;
+ spin_lock_bh(&set->lock);
+ list_add_tail(&x->list, &h->ad);
+ spin_unlock_bh(&set->lock);
+ }
+ goto out;
- return 0;
set_full:
if (net_ratelimit())
pr_warn("Set %s is full, maxelem %u reached\n",
- set->name, h->maxelem);
- return -IPSET_ERR_HASH_FULL;
+ set->name, maxelem);
+ ret = -IPSET_ERR_HASH_FULL;
+unlock:
+ spin_unlock_bh(&t->hregion[r].lock);
+out:
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ pr_debug("Table destroy after resize by add: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
+ return ret;
}
/* Delete an element from the hash and free up space if possible.
@@ -840,13 +1050,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
const struct mtype_elem *d = value;
struct mtype_elem *data;
struct hbucket *n;
- int i, j, k, ret = -IPSET_ERR_EXIST;
+ struct mtype_resize_ad *x = NULL;
+ int i, j, k, r, ret = -IPSET_ERR_EXIST;
u32 key, multi = 0;
size_t dsize = set->dsize;
- t = ipset_dereference_protected(h->table, set);
+ /* Userspace add and resize is excluded by the mutex.
+ * Kernelspace add does not trigger resize.
+ */
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- n = __ipset_dereference_protected(hbucket(t, key), 1);
+ r = ahash_region(key, t->htable_bits);
+ atomic_inc(&t->uref);
+ rcu_read_unlock_bh();
+
+ spin_lock_bh(&t->hregion[r].lock);
+ n = rcu_dereference_bh(hbucket(t, key));
if (!n)
goto out;
for (i = 0, k = 0; i < n->pos; i++) {
@@ -857,8 +1077,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
data = ahash_data(n, i, dsize);
if (!mtype_data_equal(data, d, &multi))
continue;
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(data, set)))
+ if (SET_ELEM_EXPIRED(set, data))
goto out;
ret = 0;
@@ -866,20 +1085,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
smp_mb__after_atomic();
if (i + 1 == n->pos)
n->pos--;
- set->elements--;
+ t->hregion[r].elements--;
#ifdef IP_SET_HASH_WITH_NETS
for (j = 0; j < IPSET_NET_COUNT; j++)
- mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
- j);
+ mtype_del_cidr(set, h,
+ NCIDR_PUT(DCIDR_GET(d->cidr, j)), j);
#endif
ip_set_ext_destroy(set, data);
+ if (atomic_read(&t->ref) && ext->target) {
+ /* Resize is in process and kernel side del,
+ * save values
+ */
+ x = kzalloc(sizeof(struct mtype_resize_ad),
+ GFP_ATOMIC);
+ if (x) {
+ x->ad = IPSET_DEL;
+ memcpy(&x->d, value,
+ sizeof(struct mtype_elem));
+ x->flags = flags;
+ }
+ }
for (; i < n->pos; i++) {
if (!test_bit(i, n->used))
k++;
}
if (n->pos == 0 && k == 0) {
- set->ext_size -= ext_size(n->size, dsize);
+ t->hregion[r].ext_size -= ext_size(n->size, dsize);
rcu_assign_pointer(hbucket(t, key), NULL);
kfree_rcu(n, rcu);
} else if (k >= AHASH_INIT_SIZE) {
@@ -898,7 +1130,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
k++;
}
tmp->pos = k;
- set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+ t->hregion[r].ext_size -=
+ ext_size(AHASH_INIT_SIZE, dsize);
rcu_assign_pointer(hbucket(t, key), tmp);
kfree_rcu(n, rcu);
}
@@ -906,6 +1139,16 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
}
out:
+ spin_unlock_bh(&t->hregion[r].lock);
+ if (x) {
+ spin_lock_bh(&set->lock);
+ list_add(&x->list, &h->ad);
+ spin_unlock_bh(&set->lock);
+ }
+ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+ pr_debug("Table destroy after resize by del: %p\n", t);
+ mtype_ahash_destroy(set, t, false);
+ }
return ret;
}
@@ -991,6 +1234,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
int i, ret = 0;
u32 key, multi = 0;
+ rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
#ifdef IP_SET_HASH_WITH_NETS
/* If we test an IP address and not a network address,
@@ -1022,6 +1266,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
goto out;
}
out:
+ rcu_read_unlock_bh();
return ret;
}
@@ -1033,23 +1278,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
const struct htable *t;
struct nlattr *nested;
size_t memsize;
+ u32 elements = 0;
+ size_t ext_size = 0;
u8 htable_bits;
- /* If any members have expired, set->elements will be wrong
- * mytype_expire function will update it with the right count.
- * we do not hold set->lock here, so grab it first.
- * set->elements can still be incorrect in the case of a huge set,
- * because elements might time out during the listing.
- */
- if (SET_WITH_TIMEOUT(set)) {
- spin_lock_bh(&set->lock);
- mtype_expire(set, h);
- spin_unlock_bh(&set->lock);
- }
-
rcu_read_lock_bh();
- t = rcu_dereference_bh_nfnl(h->table);
- memsize = mtype_ahash_memsize(h, t) + set->ext_size;
+ t = rcu_dereference_bh(h->table);
+ mtype_ext_size(set, &elements, &ext_size);
+ memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size;
htable_bits = t->htable_bits;
rcu_read_unlock_bh();
@@ -1071,7 +1307,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
#endif
if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
- nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
+ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
@@ -1091,15 +1327,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
if (start) {
rcu_read_lock_bh();
- t = rcu_dereference_bh_nfnl(h->table);
+ t = ipset_dereference_bh_nfnl(h->table);
atomic_inc(&t->uref);
cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
rcu_read_unlock_bh();
} else if (cb->args[IPSET_CB_PRIVATE]) {
t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
- /* Resizing didn't destroy the hash table */
- pr_debug("Table destroy by dump: %p\n", t);
+ pr_debug("Table destroy after resize "
+ " by dump: %p\n", t);
mtype_ahash_destroy(set, t, false);
}
cb->args[IPSET_CB_PRIVATE] = 0;
@@ -1141,8 +1377,7 @@ mtype_list(const struct ip_set *set,
if (!test_bit(i, n->used))
continue;
e = ahash_data(n, i, set->dsize);
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set)))
+ if (SET_ELEM_EXPIRED(set, e))
continue;
pr_debug("list hash %lu hbucket %p i %u, data %p\n",
cb->args[IPSET_CB_ARG0], n, i, e);
@@ -1208,6 +1443,7 @@ static const struct ip_set_type_variant mtype_variant = {
.uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
+ .region_lock = true,
};
#ifdef IP_SET_EMIT_CREATE
@@ -1226,6 +1462,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
size_t hsize;
struct htype *h;
struct htable *t;
+ u32 i;
pr_debug("Create set %s with family %s\n",
set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
@@ -1294,6 +1531,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
kfree(h);
return -ENOMEM;
}
+ t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
+ if (!t->hregion) {
+ kfree(t);
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->gc.set = set;
+ for (i = 0; i < ahash_numof_locks(hbits); i++)
+ spin_lock_init(&t->hregion[i].lock);
h->maxelem = maxelem;
#ifdef IP_SET_HASH_WITH_NETMASK
h->netmask = netmask;
@@ -1304,9 +1550,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
get_random_bytes(&h->initval, sizeof(h->initval));
t->htable_bits = hbits;
+ t->maxelem = h->maxelem / ahash_numof_locks(hbits);
RCU_INIT_POINTER(h->table, t);
- h->set = set;
+ INIT_LIST_HEAD(&h->ad);
set->data = h;
#ifndef IP_SET_PROTO_UNDEF
if (set->family == NFPROTO_IPV4) {
@@ -1329,12 +1576,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#ifndef IP_SET_PROTO_UNDEF
if (set->family == NFPROTO_IPV4)
#endif
- IPSET_TOKEN(HTYPE, 4_gc_init)(set,
- IPSET_TOKEN(HTYPE, 4_gc));
+ IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc);
#ifndef IP_SET_PROTO_UNDEF
else
- IPSET_TOKEN(HTYPE, 6_gc_init)(set,
- IPSET_TOKEN(HTYPE, 6_gc));
+ IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc);
#endif
}
pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 512259f579d7..aa6a603a2425 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1661,8 +1661,9 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
unsigned int offset, offset2, ihl, verdict;
- bool ipip, new_cp = false;
+ bool tunnel, new_cp = false;
union nf_inet_addr *raddr;
+ char *outer_proto = "IPIP";
*related = 1;
@@ -1703,8 +1704,8 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
return NF_ACCEPT; /* The packet looks wrong, ignore */
raddr = (union nf_inet_addr *)&cih->daddr;
- /* Special case for errors for IPIP packets */
- ipip = false;
+ /* Special case for errors for IPIP/UDP/GRE tunnel packets */
+ tunnel = false;
if (cih->protocol == IPPROTO_IPIP) {
struct ip_vs_dest *dest;
@@ -1721,7 +1722,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- ipip = true;
+ tunnel = true;
} else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */
cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */
/* Error for our tunnel must arrive at LOCAL_IN */
@@ -1729,16 +1730,19 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
__u8 iproto;
int ulen;
- /* Non-first fragment has no UDP header */
+ /* Non-first fragment has no UDP/GRE header */
if (unlikely(cih->frag_off & htons(IP_OFFSET)))
return NF_ACCEPT;
offset2 = offset + cih->ihl * 4;
- if (cih->protocol == IPPROTO_UDP)
+ if (cih->protocol == IPPROTO_UDP) {
ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
raddr, &iproto);
- else
+ outer_proto = "UDP";
+ } else {
ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
raddr, &iproto);
+ outer_proto = "GRE";
+ }
if (ulen > 0) {
/* Skip IP and UDP/GRE tunnel headers */
offset = offset2 + ulen;
@@ -1747,7 +1751,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
&_ciph);
if (cih && cih->version == 4 && cih->ihl >= 5 &&
iproto == IPPROTO_IPIP)
- ipip = true;
+ tunnel = true;
else
return NF_ACCEPT;
}
@@ -1767,11 +1771,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
"Checking incoming ICMP for");
offset2 = offset;
- ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph);
+ ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph);
offset = ciph.len;
/* The embedded headers contain source and dest in reverse order.
- * For IPIP this is error for request, not for reply.
+ * For IPIP/UDP/GRE tunnel this is error for request, not for reply.
*/
cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
ipvs, AF_INET, skb, &ciph);
@@ -1779,7 +1783,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
if (!cp) {
int v;
- if (ipip || !sysctl_schedule_icmp(ipvs))
+ if (tunnel || !sysctl_schedule_icmp(ipvs))
return NF_ACCEPT;
if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
@@ -1797,7 +1801,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
goto out;
}
- if (ipip) {
+ if (tunnel) {
__be32 info = ic->un.gateway;
__u8 type = ic->type;
__u8 code = ic->code;
@@ -1809,17 +1813,18 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
u32 mtu = ntohs(ic->un.frag.mtu);
__be16 frag_off = cih->frag_off;
- /* Strip outer IP and ICMP, go to IPIP header */
+ /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */
if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
- goto ignore_ipip;
+ goto ignore_tunnel;
offset2 -= ihl + sizeof(_icmph);
skb_reset_network_header(skb);
- IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
- &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n",
+ outer_proto, &ip_hdr(skb)->saddr,
+ &ip_hdr(skb)->daddr, mtu);
ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
/* Client uses PMTUD? */
if (!(frag_off & htons(IP_DF)))
- goto ignore_ipip;
+ goto ignore_tunnel;
/* Prefer the resulting PMTU */
if (dest) {
struct ip_vs_dest_dst *dest_dst;
@@ -1832,11 +1837,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
mtu -= sizeof(struct iphdr);
info = htonl(mtu);
}
- /* Strip outer IP, ICMP and IPIP, go to IP header of
+ /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of
* original request.
*/
if (pskb_pull(skb, offset2) == NULL)
- goto ignore_ipip;
+ goto ignore_tunnel;
skb_reset_network_header(skb);
IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1845,7 +1850,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
/* ICMP can be shorter but anyways, account it */
ip_vs_out_stats(cp, skb);
-ignore_ipip:
+ignore_tunnel:
consume_skb(skb);
verdict = NF_STOLEN;
goto out;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index d1305423640f..c4582eb71766 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -143,6 +143,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
}
static void nf_conntrack_all_lock(void)
+ __acquires(&nf_conntrack_locks_all_lock)
{
int i;
@@ -162,6 +163,7 @@ static void nf_conntrack_all_lock(void)
}
static void nf_conntrack_all_unlock(void)
+ __releases(&nf_conntrack_locks_all_lock)
{
/* All prior stores must be complete before we clear
* 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
@@ -863,9 +865,8 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
-static inline void nf_ct_acct_update(struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int len)
+void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
+ unsigned int bytes)
{
struct nf_conn_acct *acct;
@@ -873,10 +874,11 @@ static inline void nf_ct_acct_update(struct nf_conn *ct,
if (acct) {
struct nf_conn_counter *counter = acct->counter;
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ atomic64_add(packets, &counter[dir].packets);
+ atomic64_add(bytes, &counter[dir].bytes);
}
}
+EXPORT_SYMBOL_GPL(nf_ct_acct_add);
static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
const struct nf_conn *loser_ct)
@@ -890,36 +892,179 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
/* u32 should be fine since we must have seen one packet. */
bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
- nf_ct_acct_update(ct, ctinfo, bytes);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
}
-/* Resolve race on insertion if this protocol allows this. */
+/* Steps shared by the paths that put @ct into the hash table: bump the
+ * ct_general.use counter, flag the entry IPS_CONFIRMED and, when a
+ * timestamp extension is found, record the current real time as its start.
+ */
+static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
+{
+ struct nf_conn_tstamp *tstamp;
+
+ atomic_inc(&ct->ct_general.use);
+ ct->status |= IPS_CONFIRMED;
+
+ /* set conntrack timestamp, if enabled. */
+ tstamp = nf_conn_tstamp_find(ct);
+ if (tstamp)
+ tstamp->start = ktime_get_real_ns();
+}
+
+/* Try to resolve a clash by re-attaching @skb to the entry that is already
+ * in the table (@h) and discarding the conntrack currently attached to @skb.
+ *
+ * Returns NF_ACCEPT when @skb now points at the existing entry. Returns
+ * NF_DROP when the existing entry is dying, when its use counter cannot be
+ * raised, or when it has IPS_NAT_DONE_MASK bits set and nf_ct_match()
+ * reports that the two entries differ.
+ */
+static int __nf_ct_resolve_clash(struct sk_buff *skb,
+ struct nf_conntrack_tuple_hash *h)
+{
+ /* This is the conntrack entry already in hashes that won race. */
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *loser_ct;
+
+ loser_ct = nf_ct_get(skb, &ctinfo);
+
+ if (nf_ct_is_dying(ct))
+ return NF_DROP;
+
+ /* Hold the winning entry while it is inspected and possibly reused. */
+ if (!atomic_inc_not_zero(&ct->ct_general.use))
+ return NF_DROP;
+
+ if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
+ nf_ct_match(ct, loser_ct)) {
+ struct net *net = nf_ct_net(ct);
+
+ /* Fold the loser's counters into the winner, retire the loser
+ * and make @skb reference the winner instead.
+ */
+ nf_ct_acct_merge(ct, ctinfo, loser_ct);
+ nf_ct_add_to_dying_list(loser_ct);
+ nf_conntrack_put(&loser_ct->ct_general);
+ nf_ct_set(skb, ct, ctinfo);
+
+ NF_CT_STAT_INC(net, insert_failed);
+ return NF_ACCEPT;
+ }
+
+ /* Cannot reuse the winner: release the use count taken above. */
+ nf_ct_put(ct);
+ return NF_DROP;
+}
+
+/**
+ * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
+ *
+ * @skb: skb that causes the collision
+ * @repl_idx: hash slot for reply direction
+ *
+ * Called when origin or reply direction had a clash.
+ * The skb can be handled without packet drop provided the reply direction
+ * is unique or the existing entry has the identical tuple in both
+ * directions.
+ *
+ * Caller must hold conntrack table locks to prevent concurrent updates.
+ *
+ * Returns NF_DROP if the clash could not be handled.
+ */
+static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
+{
+ struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
+ const struct nf_conntrack_zone *zone;
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct net *net;
+
+ zone = nf_ct_zone(loser_ct);
+ net = nf_ct_net(loser_ct);
+
+ /* Reply direction must never result in a clash, unless both origin
+ * and reply tuples are identical.
+ */
+ hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
+ if (nf_ct_key_equal(h,
+ &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+ zone, net))
+ return __nf_ct_resolve_clash(skb, h);
+ }
+
+ /* We want the clashing entry to go away real soon: 1 second timeout. */
+ loser_ct->timeout = nfct_time_stamp + HZ;
+
+ /* IPS_NAT_CLASH removes the entry automatically on the first
+ * reply. Also prevents UDP tracker from moving the entry to
+ * ASSURED state, i.e. the entry can always be evicted under
+ * pressure.
+ */
+ loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
+
+ __nf_conntrack_insert_prepare(loser_ct);
+
+ /* fake add for ORIGINAL dir: we want lookups to only find the entry
+ * already in the table. This also hides the clashing entry from
+ * ctnetlink iteration, i.e. conntrack -L won't show them.
+ */
+ hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+
+ /* Only the REPLY tuple is really linked into the hash table. */
+ hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
+ &nf_conntrack_hash[repl_idx]);
+ return NF_ACCEPT;
+}
+
+/**
+ * nf_ct_resolve_clash - attempt to handle clash without packet drop
+ *
+ * @skb: skb that causes the clash
+ * @h: tuplehash of the clashing entry already in table
+ * @hash_reply: hash slot for reply direction
+ *
+ * A conntrack entry can be inserted to the connection tracking table
+ * if there is no existing entry with an identical tuple.
+ *
+ * If there is one, @skb (and the associated, unconfirmed conntrack) has
+ * to be dropped. In case @skb is retransmitted, next conntrack lookup
+ * will find the already-existing entry.
+ *
+ * The major problem with such packet drop is the extra delay added by
+ * the packet loss -- it will take some time for a retransmit to occur
+ * (or the sender to time out when waiting for a reply).
+ *
+ * This function attempts to handle the situation without packet drop.
+ *
+ * If @skb has no NAT transformation or if the colliding entries are
+ * exactly the same, only the to-be-confirmed conntrack entry is discarded
+ * and @skb is associated with the conntrack entry already in the table.
+ *
+ * Failing that, the new, unconfirmed conntrack is still added to the table
+ * provided that the collision only occurs in the ORIGINAL direction.
+ * The new entry will be added after the existing one in the hash list,
+ * so packets in the ORIGINAL direction will continue to match the existing
+ * entry. The new entry will also have a fixed timeout so it expires --
+ * due to the collision, it will not see bidirectional traffic.
+ *
+ * Returns NF_DROP if the clash could not be resolved.
+ */
static __cold noinline int
-nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
- enum ip_conntrack_info ctinfo,
- struct nf_conntrack_tuple_hash *h)
+nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
+ u32 reply_hash)
{
/* This is the conntrack entry already in hashes that won race. */
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
const struct nf_conntrack_l4proto *l4proto;
- enum ip_conntrack_info oldinfo;
- struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *loser_ct;
+ struct net *net;
+ int ret;
+
+ loser_ct = nf_ct_get(skb, &ctinfo);
+ net = nf_ct_net(loser_ct);
l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
- if (l4proto->allow_clash &&
- !nf_ct_is_dying(ct) &&
- atomic_inc_not_zero(&ct->ct_general.use)) {
- if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
- nf_ct_match(ct, loser_ct)) {
- nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_conntrack_put(&loser_ct->ct_general);
- nf_ct_set(skb, ct, oldinfo);
- return NF_ACCEPT;
- }
- nf_ct_put(ct);
- }
+ if (!l4proto->allow_clash)
+ goto drop;
+
+ ret = __nf_ct_resolve_clash(skb, h);
+ if (ret == NF_ACCEPT)
+ return ret;
+
+ ret = nf_ct_resolve_clash_harder(skb, reply_hash);
+ if (ret == NF_ACCEPT)
+ return ret;
+
+drop:
+ nf_ct_add_to_dying_list(loser_ct);
NF_CT_STAT_INC(net, drop);
+ NF_CT_STAT_INC(net, insert_failed);
return NF_DROP;
}
@@ -932,7 +1077,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
- struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
@@ -989,6 +1133,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
if (unlikely(nf_ct_is_dying(ct))) {
nf_ct_add_to_dying_list(ct);
+ NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
@@ -1009,13 +1154,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
setting time, otherwise we'd get timer wrap in
weird delay cases. */
ct->timeout += nfct_time_stamp;
- atomic_inc(&ct->ct_general.use);
- ct->status |= IPS_CONFIRMED;
- /* set conntrack timestamp, if enabled. */
- tstamp = nf_conn_tstamp_find(ct);
- if (tstamp)
- tstamp->start = ktime_get_real_ns();
+ __nf_conntrack_insert_prepare(ct);
/* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers
@@ -1035,11 +1175,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_ACCEPT;
out:
- nf_ct_add_to_dying_list(ct);
- ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
+ ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
nf_conntrack_double_unlock(hash, reply_hash);
- NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
return ret;
}
@@ -1795,7 +1933,7 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
if (do_acct)
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -1803,7 +1941,7 @@ bool nf_ct_kill_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb)
{
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
return nf_ct_delete(ct, 0, 0);
}
@@ -2497,7 +2635,6 @@ void nf_conntrack_init_end(void)
*/
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
#define DYING_NULLS_VAL ((1<<30)+1)
-#define TEMPLATE_NULLS_VAL ((1<<30)+2)
int nf_conntrack_init_net(struct net *net)
{
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6a1c8f1f6171..9ddfcd002d3b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -860,7 +860,7 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
struct ctnetlink_filter *filter;
#ifndef CONFIG_NF_CONNTRACK_MARK
- if (cda[CTA_MARK] && cda[CTA_MARK_MASK])
+ if (cda[CTA_MARK] || cda[CTA_MARK_MASK])
return ERR_PTR(-EOPNOTSUPP);
#endif
@@ -1533,6 +1533,7 @@ static int
ctnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
+ __must_hold(RCU)
{
struct nf_nat_hook *nat_hook;
int err;
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 7365b43f8f98..760ca2422816 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -81,6 +81,18 @@ static bool udp_error(struct sk_buff *skb,
return false;
}
+/* Refresh timeout and accounting of a UDP conntrack via
+ * nf_ct_refresh_acct(), with one exception: an entry flagged IPS_NAT_CLASH
+ * is killed instead as soon as a packet with ctinfo
+ * IP_CT_ESTABLISHED_REPLY is seen for it.
+ */
+static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct,
+ struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo,
+ u32 extra_jiffies)
+{
+ if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY &&
+ ct->status & IPS_NAT_CLASH))
+ nf_ct_kill(ct);
+ else
+ nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies);
+}
+
/* Returns verdict for packet, and may modify conntracktype */
int nf_conntrack_udp_packet(struct nf_conn *ct,
struct sk_buff *skb,
@@ -116,8 +128,8 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
+ timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
@@ -198,8 +210,8 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct,
if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_ASSURED, ct);
} else {
- nf_ct_refresh_acct(ct, ctinfo, skb,
- timeouts[UDP_CT_UNREPLIED]);
+ nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo,
+ timeouts[UDP_CT_UNREPLIED]);
}
return NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 410809c669e1..9b57330c81f8 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -411,7 +411,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu + 1;
return per_cpu_ptr(net->ct.stat, cpu);
}
-
+ (*pos)++;
return NULL;
}
@@ -1054,21 +1054,18 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
nf_conntrack_standalone_init_dccp_sysctl(net, table);
nf_conntrack_standalone_init_gre_sysctl(net, table);
- /* Don't export sysctls to unprivileged users */
+ /* Don't allow unprivileged users to alter certain sysctls */
if (net->user_ns != &init_user_ns) {
- table[NF_SYSCTL_CT_MAX].procname = NULL;
- table[NF_SYSCTL_CT_ACCT].procname = NULL;
- table[NF_SYSCTL_CT_HELPER].procname = NULL;
-#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
- table[NF_SYSCTL_CT_TIMESTAMP].procname = NULL;
-#endif
+ table[NF_SYSCTL_CT_MAX].mode = 0444;
+ table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
+ table[NF_SYSCTL_CT_HELPER].mode = 0444;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- table[NF_SYSCTL_CT_EVENTS].procname = NULL;
+ table[NF_SYSCTL_CT_EVENTS].mode = 0444;
#endif
- }
-
- if (!net_eq(&init_net, net))
table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+ } else if (!net_eq(&init_net, net)) {
+ table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+ }
net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
if (!net->ct.sysctl_header)
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 8af28e10b4e6..c0cb79495c35 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -252,6 +252,19 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
}
EXPORT_SYMBOL_GPL(flow_offload_add);
+/* Push @flow's timeout out to NF_FLOW_TIMEOUT from now and, if @flow_table
+ * uses hardware offload and NF_FLOW_HW_REFRESH was set on the flow (the bit
+ * is cleared here), hand the flow to nf_flow_offload_add() again.
+ */
+void flow_offload_refresh(struct nf_flowtable *flow_table,
+ struct flow_offload *flow)
+{
+ flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
+
+ /* Common case: no hardware offload or no refresh requested. */
+ if (likely(!nf_flowtable_hw_offload(flow_table) ||
+ !test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags)))
+ return;
+
+ nf_flow_offload_add(flow_table, flow);
+}
+EXPORT_SYMBOL_GPL(flow_offload_refresh);
+
static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
return nf_flow_timeout_delta(flow->timeout) <= 0;
@@ -372,6 +385,50 @@ static void nf_flow_offload_work_gc(struct work_struct *work)
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
+/* Register the (@cb, @cb_priv) callback on @flow_table's flow block.
+ *
+ * The lookup and the list insertion are done with flow_block_lock held for
+ * writing. Returns 0 on success, -EEXIST if the same pair is already
+ * registered, or the error encoded by flow_block_cb_alloc() on failure.
+ */
+int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
+ flow_setup_cb_t *cb, void *cb_priv)
+{
+ struct flow_block *block = &flow_table->flow_block;
+ struct flow_block_cb *block_cb;
+ int err = 0;
+
+ down_write(&flow_table->flow_block_lock);
+ block_cb = flow_block_cb_lookup(block, cb, cb_priv);
+ if (block_cb) {
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ block_cb = flow_block_cb_alloc(cb, cb_priv, cb_priv, NULL);
+ if (IS_ERR(block_cb)) {
+ err = PTR_ERR(block_cb);
+ goto unlock;
+ }
+
+ list_add_tail(&block_cb->list, &block->cb_list);
+
+unlock:
+ up_write(&flow_table->flow_block_lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb);
+
+/* Unregister the (@cb, @cb_priv) callback from @flow_table's flow block.
+ *
+ * Runs with flow_block_lock held for writing. The matching entry is
+ * unlinked from the callback list and released; a WARN is raised if no
+ * such callback is registered.
+ */
+void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table,
+ flow_setup_cb_t *cb, void *cb_priv)
+{
+ struct flow_block *block = &flow_table->flow_block;
+ struct flow_block_cb *block_cb;
+
+ down_write(&flow_table->flow_block_lock);
+ block_cb = flow_block_cb_lookup(block, cb, cb_priv);
+ if (block_cb) {
+ list_del(&block_cb->list);
+ /* Entry came from flow_block_cb_alloc(); free it so it is
+ * not leaked once it is off the list.
+ */
+ flow_block_cb_free(block_cb);
+ } else {
+ WARN_ON(true);
+ }
+ up_write(&flow_table->flow_block_lock);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb);
+
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
__be16 port, __be16 new_port)
{
@@ -494,6 +551,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
flow_block_init(&flowtable->flow_block);
+ init_rwsem(&flowtable->flow_block_lock);
err = rhashtable_init(&flowtable->rhashtable,
&nf_flow_offload_rhash_params);
@@ -550,10 +608,14 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
mutex_lock(&flowtable_lock);
list_del(&flow_table->list);
mutex_unlock(&flowtable_lock);
+
cancel_delayed_work_sync(&flow_table->gc_work);
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
nf_flow_table_offload_flush(flow_table);
+ if (nf_flowtable_hw_offload(flow_table))
+ nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step,
+ flow_table);
rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 9e563fd3da0f..a3bca758b849 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -12,6 +12,7 @@
#include <net/ip6_route.h>
#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack_acct.h>
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -146,11 +147,13 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
(nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
+ nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
return -1;
+
+ iph = ip_hdr(skb);
if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
(nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
+ nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
return -1;
return 0;
@@ -189,6 +192,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
return -1;
+ iph = ip_hdr(skb);
ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v4.s_addr = iph->saddr;
@@ -232,13 +236,6 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
return NF_STOLEN;
}
-static bool nf_flow_offload_refresh(struct nf_flowtable *flow_table,
- struct flow_offload *flow)
-{
- return nf_flowtable_hw_offload(flow_table) &&
- test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags);
-}
-
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -279,8 +276,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
return NF_ACCEPT;
- if (unlikely(nf_flow_offload_refresh(flow_table, flow)))
- nf_flow_offload_add(flow_table, flow);
+ flow_offload_refresh(flow_table, flow);
if (nf_flow_offload_dst_check(&rt->dst)) {
flow_offload_teardown(flow);
@@ -290,11 +286,13 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
return NF_DROP;
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
iph = ip_hdr(skb);
ip_decrease_ttl(iph);
skb->tstamp = 0;
+ if (flow_table->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+
if (unlikely(dst_xfrm(&rt->dst))) {
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
@@ -426,11 +424,13 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow,
if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
(nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+ nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
return -1;
+
+ ip6h = ipv6_hdr(skb);
if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
(nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+ nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
return -1;
return 0;
@@ -459,6 +459,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
return -1;
+ ip6h = ipv6_hdr(skb);
ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v6 = ip6h->saddr;
@@ -508,8 +509,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
sizeof(*ip6h)))
return NF_ACCEPT;
- if (unlikely(nf_flow_offload_refresh(flow_table, flow)))
- nf_flow_offload_add(flow_table, flow);
+ flow_offload_refresh(flow_table, flow);
if (nf_flow_offload_dst_check(&rt->dst)) {
flow_offload_teardown(flow);
@@ -522,11 +522,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
return NF_DROP;
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
ip6h = ipv6_hdr(skb);
ip6h->hop_limit--;
skb->tstamp = 0;
+ if (flow_table->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+
if (unlikely(dst_xfrm(&rt->dst))) {
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index 83e1db37c3b0..e3b099c14eff 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -7,13 +7,13 @@
#include <linux/tc_act/tc_csum.h>
#include <net/flow_offload.h>
#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
-static struct work_struct nf_flow_offload_work;
-static DEFINE_SPINLOCK(flow_offload_pending_list_lock);
-static LIST_HEAD(flow_offload_pending_list);
+static struct workqueue_struct *nf_flow_offload_wq;
struct flow_offload_work {
struct list_head list;
@@ -21,40 +21,68 @@ struct flow_offload_work {
int priority;
struct nf_flowtable *flowtable;
struct flow_offload *flow;
-};
-
-struct nf_flow_key {
- struct flow_dissector_key_meta meta;
- struct flow_dissector_key_control control;
- struct flow_dissector_key_basic basic;
- union {
- struct flow_dissector_key_ipv4_addrs ipv4;
- struct flow_dissector_key_ipv6_addrs ipv6;
- };
- struct flow_dissector_key_tcp tcp;
- struct flow_dissector_key_ports tp;
-} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
-
-struct nf_flow_match {
- struct flow_dissector dissector;
- struct nf_flow_key key;
- struct nf_flow_key mask;
-};
-
-struct nf_flow_rule {
- struct nf_flow_match match;
- struct flow_rule *rule;
+ struct work_struct work;
};
#define NF_FLOW_DISSECTOR(__match, __type, __field) \
(__match)->dissector.offset[__type] = \
offsetof(struct nf_flow_key, __field)
+/* Add tunnel (encapsulation) match keys to @match from @tun_info.
+ *
+ * Nothing is added unless @tun_info is non-NULL and has IP_TUNNEL_INFO_TX
+ * set. The tunnel id is always matched exactly. The outer addresses are
+ * taken from @tun_info with source and destination swapped, and each one is
+ * matched exactly (all-ones mask) only when it is non-zero.
+ */
+static void nf_flow_rule_lwt_match(struct nf_flow_match *match,
+ struct ip_tunnel_info *tun_info)
+{
+ struct nf_flow_key *mask = &match->mask;
+ struct nf_flow_key *key = &match->key;
+ unsigned int enc_keys;
+
+ if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))
+ return;
+
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control);
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
+ key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id);
+ mask->enc_key_id.keyid = 0xffffffff;
+ enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) |
+ BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL);
+
+ if (ip_tunnel_info_af(tun_info) == AF_INET) {
+ NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
+ enc_ipv4);
+ key->enc_ipv4.src = tun_info->key.u.ipv4.dst;
+ key->enc_ipv4.dst = tun_info->key.u.ipv4.src;
+ if (key->enc_ipv4.src)
+ mask->enc_ipv4.src = 0xffffffff;
+ if (key->enc_ipv4.dst)
+ mask->enc_ipv4.dst = 0xffffffff;
+ enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS);
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ } else {
+ memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst,
+ sizeof(struct in6_addr));
+ memcpy(&key->enc_ipv6.dst, &tun_info->key.u.ipv6.src,
+ sizeof(struct in6_addr));
+ /* As in the IPv4 branch, the all-ones pattern belongs in the
+ * mask; writing it into the key would destroy the address
+ * that was just copied and leave the mask empty.
+ */
+ if (memcmp(&key->enc_ipv6.src, &in6addr_any,
+ sizeof(struct in6_addr)))
+ memset(&mask->enc_ipv6.src, 0xff,
+ sizeof(struct in6_addr));
+ if (memcmp(&key->enc_ipv6.dst, &in6addr_any,
+ sizeof(struct in6_addr)))
+ memset(&mask->enc_ipv6.dst, 0xff,
+ sizeof(struct in6_addr));
+ enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS);
+ key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ match->dissector.used_keys |= enc_keys;
+}
+
static int nf_flow_rule_match(struct nf_flow_match *match,
- const struct flow_offload_tuple *tuple)
+ const struct flow_offload_tuple *tuple,
+ struct dst_entry *other_dst)
{
struct nf_flow_key *mask = &match->mask;
struct nf_flow_key *key = &match->key;
+ struct ip_tunnel_info *tun_info;
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta);
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control);
@@ -64,6 +92,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp);
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp);
+ if (other_dst && other_dst->lwtstate) {
+ tun_info = lwt_tun_info(other_dst->lwtstate);
+ nf_flow_rule_lwt_match(match, tun_info);
+ }
+
key->meta.ingress_ifindex = tuple->iifidx;
mask->meta.ingress_ifindex = 0xffffffff;
@@ -87,6 +120,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
default:
return -EOPNOTSUPP;
}
+ mask->control.addr_type = 0xffff;
match->dissector.used_keys |= BIT(key->control.addr_type);
mask->basic.n_proto = 0xffff;
@@ -442,10 +476,52 @@ static void flow_offload_redirect(const struct flow_offload *flow,
dev_hold(rt->dst.dev);
}
+/* Append a FLOW_ACTION_TUNNEL_ENCAP action to @flow_rule when the cached
+ * dst of @flow in direction @dir carries lwtunnel state whose tunnel info
+ * has IP_TUNNEL_INFO_TX set. The action references that tunnel info.
+ */
+static void flow_offload_encap_tunnel(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry;
+ struct dst_entry *dst;
+
+ dst = flow->tuplehash[dir].tuple.dst_cache;
+ if (dst && dst->lwtstate) {
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = lwt_tun_info(dst->lwtstate);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_TUNNEL_ENCAP;
+ entry->tunnel = tun_info;
+ }
+ }
+}
+
+/* Append a FLOW_ACTION_TUNNEL_DECAP action to @flow_rule when the cached
+ * dst of the direction opposite to @dir carries lwtunnel state whose
+ * tunnel info has IP_TUNNEL_INFO_TX set.
+ */
+static void flow_offload_decap_tunnel(const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action_entry *entry;
+ struct dst_entry *dst;
+
+ /* Note the !dir: the other direction's route is inspected. */
+ dst = flow->tuplehash[!dir].tuple.dst_cache;
+ if (dst && dst->lwtstate) {
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = lwt_tun_info(dst->lwtstate);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
+ entry = flow_action_entry_next(flow_rule);
+ entry->id = FLOW_ACTION_TUNNEL_DECAP;
+ }
+ }
+}
+
int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ flow_offload_decap_tunnel(flow, dir, flow_rule);
+ flow_offload_encap_tunnel(flow, dir, flow_rule);
+
if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
return -1;
@@ -472,6 +548,9 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ flow_offload_decap_tunnel(flow, dir, flow_rule);
+ flow_offload_encap_tunnel(flow, dir, flow_rule);
+
if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
return -1;
@@ -502,6 +581,7 @@ nf_flow_offload_rule_alloc(struct net *net,
const struct flow_offload *flow = offload->flow;
const struct flow_offload_tuple *tuple;
struct nf_flow_rule *flow_rule;
+ struct dst_entry *other_dst;
int err = -ENOMEM;
flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
@@ -517,7 +597,8 @@ nf_flow_offload_rule_alloc(struct net *net,
flow_rule->rule->match.key = &flow_rule->match.key;
tuple = &flow->tuplehash[dir].tuple;
- err = nf_flow_rule_match(&flow_rule->match, tuple);
+ other_dst = flow->tuplehash[!dir].tuple.dst_cache;
+ err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst);
if (err < 0)
goto err_flow_match;
@@ -597,6 +678,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
struct nf_flow_rule *flow_rule,
enum flow_offload_tuple_dir dir,
int priority, int cmd,
+ struct flow_stats *stats,
struct list_head *block_cb_list)
{
struct flow_cls_offload cls_flow = {};
@@ -610,6 +692,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
if (cmd == FLOW_CLS_REPLACE)
cls_flow.rule = flow_rule->rule;
+ down_read(&flowtable->flow_block_lock);
list_for_each_entry(block_cb, block_cb_list, list) {
err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow,
block_cb->cb_priv);
@@ -618,6 +701,10 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
i++;
}
+ up_read(&flowtable->flow_block_lock);
+
+ if (cmd == FLOW_CLS_STATS)
+ memcpy(stats, &cls_flow.stats, sizeof(*stats));
return i;
}
@@ -628,7 +715,7 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload,
{
return nf_flow_offload_tuple(offload->flowtable, offload->flow,
flow_rule, dir, offload->priority,
- FLOW_CLS_REPLACE,
+ FLOW_CLS_REPLACE, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -636,7 +723,7 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir)
{
nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
- offload->priority, FLOW_CLS_DESTROY,
+ offload->priority, FLOW_CLS_DESTROY, NULL,
&offload->flowtable->flow_block.cb_list);
}
@@ -682,19 +769,9 @@ static void flow_offload_tuple_stats(struct flow_offload_work *offload,
enum flow_offload_tuple_dir dir,
struct flow_stats *stats)
{
- struct nf_flowtable *flowtable = offload->flowtable;
- struct flow_cls_offload cls_flow = {};
- struct flow_block_cb *block_cb;
- struct netlink_ext_ack extack;
- __be16 proto = ETH_P_ALL;
-
- nf_flow_offload_init(&cls_flow, proto, offload->priority,
- FLOW_CLS_STATS,
- &offload->flow->tuplehash[dir].tuple, &extack);
-
- list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list)
- block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv);
- memcpy(stats, &cls_flow.stats, sizeof(*stats));
+ nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
+ offload->priority, FLOW_CLS_STATS, stats,
+ &offload->flowtable->flow_block.cb_list);
}
static void flow_offload_work_stats(struct flow_offload_work *offload)
@@ -708,19 +785,25 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
offload->flow->timeout = max_t(u64, offload->flow->timeout,
lastused + NF_FLOW_TIMEOUT);
+
+ if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) {
+ if (stats[0].pkts)
+ nf_ct_acct_add(offload->flow->ct,
+ FLOW_OFFLOAD_DIR_ORIGINAL,
+ stats[0].pkts, stats[0].bytes);
+ if (stats[1].pkts)
+ nf_ct_acct_add(offload->flow->ct,
+ FLOW_OFFLOAD_DIR_REPLY,
+ stats[1].pkts, stats[1].bytes);
+ }
}
static void flow_offload_work_handler(struct work_struct *work)
{
- struct flow_offload_work *offload, *next;
- LIST_HEAD(offload_pending_list);
-
- spin_lock_bh(&flow_offload_pending_list_lock);
- list_replace_init(&flow_offload_pending_list, &offload_pending_list);
- spin_unlock_bh(&flow_offload_pending_list_lock);
+ struct flow_offload_work *offload;
- list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
- switch (offload->cmd) {
+ offload = container_of(work, struct flow_offload_work, work);
+ switch (offload->cmd) {
case FLOW_CLS_REPLACE:
flow_offload_work_add(offload);
break;
@@ -732,19 +815,14 @@ static void flow_offload_work_handler(struct work_struct *work)
break;
default:
WARN_ON_ONCE(1);
- }
- list_del(&offload->list);
- kfree(offload);
}
+
+ kfree(offload);
}
static void flow_offload_queue_work(struct flow_offload_work *offload)
{
- spin_lock_bh(&flow_offload_pending_list_lock);
- list_add_tail(&offload->list, &flow_offload_pending_list);
- spin_unlock_bh(&flow_offload_pending_list_lock);
-
- schedule_work(&nf_flow_offload_work);
+ queue_work(nf_flow_offload_wq, &offload->work);
}
static struct flow_offload_work *
@@ -761,6 +839,7 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
offload->flow = flow;
offload->priority = flowtable->priority;
offload->flowtable = flowtable;
+ INIT_WORK(&offload->work, flow_offload_work_handler);
return offload;
}
@@ -811,7 +890,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
{
if (nf_flowtable_hw_offload(flowtable))
- flush_work(&nf_flow_offload_work);
+ flush_workqueue(nf_flow_offload_wq);
}
static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
@@ -839,28 +918,47 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
return err;
}
-static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
- struct nf_flowtable *flowtable,
- struct net_device *dev,
- enum flow_block_command cmd,
- struct netlink_ext_ack *extack)
+static void nf_flow_table_block_offload_init(struct flow_block_offload *bo,
+ struct net *net,
+ enum flow_block_command cmd,
+ struct nf_flowtable *flowtable,
+ struct netlink_ext_ack *extack)
{
- int err;
-
- if (!nf_flowtable_hw_offload(flowtable))
- return 0;
-
- if (!dev->netdev_ops->ndo_setup_tc)
- return -EOPNOTSUPP;
-
memset(bo, 0, sizeof(*bo));
- bo->net = dev_net(dev);
+ bo->net = net;
bo->block = &flowtable->flow_block;
bo->command = cmd;
bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
bo->extack = extack;
INIT_LIST_HEAD(&bo->cb_list);
+}
+
+static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo,
+ struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
+ extack);
+ flow_indr_block_call(dev, bo, cmd, TC_SETUP_FT);
+ if (list_empty(&bo->cb_list))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
+ struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
+ extack);
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo);
if (err < 0)
return err;
@@ -876,7 +974,15 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
struct flow_block_offload bo;
int err;
- err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack);
+ if (!nf_flowtable_hw_offload(flowtable))
+ return 0;
+
+ if (dev->netdev_ops->ndo_setup_tc)
+ err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd,
+ &extack);
+ else
+ err = nf_flow_table_indr_offload_cmd(&bo, flowtable, dev, cmd,
+ &extack);
if (err < 0)
return err;
@@ -884,22 +990,83 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
}
EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup);
-int nf_flow_table_offload_init(void)
+static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev,
+ struct nf_flowtable *flowtable,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command cmd)
{
- INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler);
+ struct netlink_ext_ack extack = {};
+ struct flow_block_offload bo;
- return 0;
+ if (!flowtable)
+ return;
+
+ nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable,
+ &extack);
+
+ cb(dev, cb_priv, TC_SETUP_FT, &bo);
+
+ nf_flow_table_block_setup(flowtable, &bo, cmd);
}
-void nf_flow_table_offload_exit(void)
+static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command cmd)
{
- struct flow_offload_work *offload, *next;
- LIST_HEAD(offload_pending_list);
+ if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD))
+ return;
- cancel_work_sync(&nf_flow_offload_work);
+ nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd);
+}
- list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
- list_del(&offload->list);
- kfree(offload);
+static void nf_flow_table_indr_block_cb(struct net_device *dev,
+ flow_indr_block_bind_cb_t *cb,
+ void *cb_priv,
+ enum flow_block_command cmd)
+{
+ struct net *net = dev_net(dev);
+ struct nft_flowtable *nft_ft;
+ struct nft_table *table;
+ struct nft_hook *hook;
+
+ mutex_lock(&net->nft.commit_mutex);
+ list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(nft_ft, &table->flowtables, list) {
+ list_for_each_entry(hook, &nft_ft->hook_list, list) {
+ if (hook->ops.dev != dev)
+ continue;
+
+ nf_flow_table_indr_block_cb_cmd(&nft_ft->data,
+ dev, cb,
+ cb_priv, cmd);
+ }
+ }
}
+ mutex_unlock(&net->nft.commit_mutex);
+}
+
+static struct flow_indr_block_entry block_ing_entry = {
+ .cb = nf_flow_table_indr_block_cb,
+ .list = LIST_HEAD_INIT(block_ing_entry.list),
+};
+
+int nf_flow_table_offload_init(void)
+{
+ nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ if (!nf_flow_offload_wq)
+ return -ENOMEM;
+
+ flow_indr_add_block_cb(&block_ing_entry);
+
+ return 0;
+}
+
+void nf_flow_table_offload_exit(void)
+{
+ flow_indr_del_block_cb(&block_ing_entry);
+ destroy_workqueue(nf_flow_offload_wq);
}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index f8f52ff99cfb..bbd1209694b8 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -46,25 +46,7 @@ void nf_unregister_queue_handler(struct net *net)
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
-static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb)
-{
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
-
- if (nf_bridge) {
- struct net_device *physdev;
-
- physdev = nf_bridge_get_physindev(skb);
- if (physdev)
- dev_put(physdev);
- physdev = nf_bridge_get_physoutdev(skb);
- if (physdev)
- dev_put(physdev);
- }
-#endif
-}
-
-void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
+static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
@@ -76,24 +58,34 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
if (state->sk)
sock_put(state->sk);
- nf_queue_entry_release_br_nf_refs(entry->skb);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (entry->physin)
+ dev_put(entry->physin);
+ if (entry->physout)
+ dev_put(entry->physout);
+#endif
+}
+
+void nf_queue_entry_free(struct nf_queue_entry *entry)
+{
+ nf_queue_entry_release_refs(entry);
+ kfree(entry);
}
-EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
+EXPORT_SYMBOL_GPL(nf_queue_entry_free);
-static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb)
+static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ const struct sk_buff *skb = entry->skb;
+ struct nf_bridge_info *nf_bridge;
+ nf_bridge = nf_bridge_info_get(skb);
if (nf_bridge) {
- struct net_device *physdev;
-
- physdev = nf_bridge_get_physindev(skb);
- if (physdev)
- dev_hold(physdev);
- physdev = nf_bridge_get_physoutdev(skb);
- if (physdev)
- dev_hold(physdev);
+ entry->physin = nf_bridge_get_physindev(skb);
+ entry->physout = nf_bridge_get_physoutdev(skb);
+ } else {
+ entry->physin = NULL;
+ entry->physout = NULL;
}
#endif
}
@@ -110,7 +102,12 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
if (state->sk)
sock_hold(state->sk);
- nf_queue_entry_get_br_nf_refs(entry->skb);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (entry->physin)
+ dev_hold(entry->physin);
+ if (entry->physout)
+ dev_hold(entry->physout);
+#endif
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
@@ -158,18 +155,16 @@ static void nf_ip6_saveroute(const struct sk_buff *skb,
static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
unsigned int index, unsigned int queuenum)
{
- int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
const struct nf_queue_handler *qh;
struct net *net = state->net;
unsigned int route_key_size;
+ int status;
/* QUEUE == DROP if no one is waiting, to be safe. */
qh = rcu_dereference(net->nf.queue_handler);
- if (!qh) {
- status = -ESRCH;
- goto err;
- }
+ if (!qh)
+ return -ESRCH;
switch (state->pf) {
case AF_INET:
@@ -184,14 +179,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
}
entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
- if (!entry) {
- status = -ENOMEM;
- goto err;
- }
+ if (!entry)
+ return -ENOMEM;
if (skb_dst(skb) && !skb_dst_force(skb)) {
- status = -ENETDOWN;
- goto err;
+ kfree(entry);
+ return -ENETDOWN;
}
*entry = (struct nf_queue_entry) {
@@ -201,6 +194,8 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
.size = sizeof(*entry) + route_key_size,
};
+ __nf_queue_entry_init_physdevs(entry);
+
nf_queue_entry_get_refs(entry);
switch (entry->state.pf) {
@@ -213,17 +208,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
}
status = qh->outfn(entry, queuenum);
-
if (status < 0) {
- nf_queue_entry_release_refs(entry);
- goto err;
+ nf_queue_entry_free(entry);
+ return status;
}
return 0;
-
-err:
- kfree(entry);
- return status;
}
/* Packets leaving via this function must come back through nf_reinject(). */
@@ -304,12 +294,10 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
hooks = nf_hook_entries_head(net, pf, entry->state.hook);
- nf_queue_entry_release_refs(entry);
-
i = entry->hook_index;
if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
kfree_skb(skb);
- kfree(entry);
+ nf_queue_entry_free(entry);
return;
}
@@ -348,6 +336,6 @@ next_hook:
kfree_skb(skb);
}
- kfree(entry);
+ nf_queue_entry_free(entry);
}
EXPORT_SYMBOL(nf_reinject);
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index b0930d4aba22..b9cbe1e2453e 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -267,7 +267,7 @@ static void *synproxy_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu + 1;
return per_cpu_ptr(snet->stats, cpu);
}
-
+ (*pos)++;
return NULL;
}
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d1318bdf49ca..d0ab5ffa1e2c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -520,7 +520,8 @@ static struct nft_table *nft_table_lookup(const struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &net->nft.tables, list,
+ lockdep_is_held(&net->nft.commit_mutex)) {
if (!nla_strcmp(nla, table->name) &&
table->family == family &&
nft_active_genmask(table, genmask))
@@ -1405,6 +1406,11 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
lockdep_commit_lock_is_held(net));
if (nft_dump_stats(skb, stats))
goto nla_put_failure;
+
+ if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) &&
+ nla_put_be32(skb, NFTA_CHAIN_FLAGS,
+ htonl(NFT_CHAIN_HW_OFFLOAD)))
+ goto nla_put_failure;
}
if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
@@ -2518,8 +2524,8 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
module_put(type->owner);
}
-struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
- const struct nlattr *nla)
+static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
+ const struct nlattr *nla)
{
struct nft_expr_info info;
struct nft_expr *expr;
@@ -2552,6 +2558,24 @@ err1:
return ERR_PTR(err);
}
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
+{
+ int err;
+
+ if (src->ops->clone) {
+ dst->ops = src->ops;
+ err = src->ops->clone(dst, src);
+ if (err < 0)
+ return err;
+ } else {
+ memcpy(dst, src, src->ops->size);
+ }
+
+ __module_get(src->ops->type->owner);
+
+ return 0;
+}
+
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
nf_tables_expr_destroy(ctx, expr);
@@ -3261,25 +3285,17 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
/*
* Sets
*/
-
-static LIST_HEAD(nf_tables_set_types);
-
-int nft_register_set(struct nft_set_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_add_tail_rcu(&type->list, &nf_tables_set_types);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- return 0;
-}
-EXPORT_SYMBOL_GPL(nft_register_set);
-
-void nft_unregister_set(struct nft_set_type *type)
-{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_del_rcu(&type->list);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_unregister_set);
+static const struct nft_set_type *nft_set_types[] = {
+ &nft_set_hash_fast_type,
+ &nft_set_hash_type,
+ &nft_set_rhash_type,
+ &nft_set_bitmap_type,
+ &nft_set_rbtree_type,
+#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2)
+ &nft_set_pipapo_avx2_type,
+#endif
+ &nft_set_pipapo_type,
+};
#define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \
NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
@@ -3305,15 +3321,11 @@ nft_select_set_ops(const struct nft_ctx *ctx,
struct nft_set_estimate est, best;
const struct nft_set_type *type;
u32 flags = 0;
+ int i;
lockdep_assert_held(&ctx->net->nft.commit_mutex);
lockdep_nfnl_nft_mutex_not_held();
-#ifdef CONFIG_MODULES
- if (list_empty(&nf_tables_set_types)) {
- if (nft_request_module(ctx->net, "nft-set") == -EAGAIN)
- return ERR_PTR(-EAGAIN);
- }
-#endif
+
if (nla[NFTA_SET_FLAGS] != NULL)
flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
@@ -3322,7 +3334,8 @@ nft_select_set_ops(const struct nft_ctx *ctx,
best.lookup = ~0;
best.space = ~0;
- list_for_each_entry(type, &nf_tables_set_types, list) {
+ for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) {
+ type = nft_set_types[i];
ops = &type->ops;
if (!nft_set_ops_candidate(type, flags))
@@ -3353,11 +3366,6 @@ nft_select_set_ops(const struct nft_ctx *ctx,
break;
}
- if (!try_module_get(type->owner))
- continue;
- if (bops != NULL)
- module_put(to_set_type(bops)->owner);
-
bops = ops;
best = est;
}
@@ -3387,6 +3395,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
+ [NFTA_SET_EXPR] = { .type = NLA_NESTED },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -3590,8 +3599,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
{
struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
- struct nlattr *desc;
u32 portid = ctx->portid;
+ struct nlattr *nest;
u32 seq = ctx->seq;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
@@ -3647,9 +3656,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
goto nla_put_failure;
- desc = nla_nest_start_noflag(skb, NFTA_SET_DESC);
-
- if (desc == NULL)
+ nest = nla_nest_start_noflag(skb, NFTA_SET_DESC);
+ if (!nest)
goto nla_put_failure;
if (set->size &&
nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
@@ -3659,7 +3667,15 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nf_tables_fill_set_concat(skb, set))
goto nla_put_failure;
- nla_nest_end(skb, desc);
+ nla_nest_end(skb, nest);
+
+ if (set->expr) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
+ if (nf_tables_fill_expr_info(skb, set->expr) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ }
nlmsg_end(skb, nlh);
return 0;
@@ -3906,6 +3922,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
const struct nft_set_ops *ops;
+ struct nft_expr *expr = NULL;
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
@@ -4015,6 +4032,9 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
return err;
}
+ if (nla[NFTA_SET_EXPR])
+ desc.expr = true;
+
table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
@@ -4056,21 +4076,27 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
size = ops->privsize(nla, &desc);
set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL);
- if (!set) {
- err = -ENOMEM;
- goto err1;
- }
+ if (!set)
+ return -ENOMEM;
name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
if (!name) {
err = -ENOMEM;
- goto err2;
+ goto err_set_name;
}
err = nf_tables_set_alloc_name(&ctx, set, name);
kfree(name);
if (err < 0)
- goto err2;
+ goto err_set_alloc_name;
+
+ if (nla[NFTA_SET_EXPR]) {
+ expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_alloc_name;
+ }
+ }
udata = NULL;
if (udlen) {
@@ -4087,6 +4113,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
set->dtype = dtype;
set->objtype = objtype;
set->dlen = desc.dlen;
+ set->expr = expr;
set->flags = flags;
set->size = desc.size;
set->policy = policy;
@@ -4102,34 +4129,37 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = ops->init(set, &desc, nla);
if (err < 0)
- goto err3;
+ goto err_set_init;
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
- goto err4;
+ goto err_set_trans;
list_add_tail_rcu(&set->list, &table->sets);
table->use++;
return 0;
-err4:
+err_set_trans:
ops->destroy(set);
-err3:
+err_set_init:
+ if (expr)
+ nft_expr_destroy(&ctx, expr);
+err_set_alloc_name:
kfree(set->name);
-err2:
+err_set_name:
kvfree(set);
-err1:
- module_put(to_set_type(ops)->owner);
return err;
}
-static void nft_set_destroy(struct nft_set *set)
+static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
if (WARN_ON(set->use > 0))
return;
+ if (set->expr)
+ nft_expr_destroy(ctx, set->expr);
+
set->ops->destroy(set);
- module_put(to_set_type(set->ops)->owner);
kfree(set->name);
kvfree(set);
}
@@ -4269,7 +4299,7 @@ EXPORT_SYMBOL_GPL(nf_tables_deactivate_set);
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
{
if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
- nft_set_destroy(set);
+ nft_set_destroy(ctx, set);
}
EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
@@ -4307,7 +4337,6 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
.align = __alignof__(u32),
},
};
-EXPORT_SYMBOL_GPL(nft_set_ext_types);
/*
* Set elements
@@ -4796,6 +4825,36 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
return trans;
}
+struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
+ const struct nft_set *set,
+ const struct nlattr *attr)
+{
+ struct nft_expr *expr;
+ int err;
+
+ expr = nft_expr_init(ctx, attr);
+ if (IS_ERR(expr))
+ return expr;
+
+ err = -EOPNOTSUPP;
+ if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL))
+ goto err_set_elem_expr;
+
+ if (expr->ops->type->flags & NFT_EXPR_GC) {
+ if (set->flags & NFT_SET_TIMEOUT)
+ goto err_set_elem_expr;
+ if (!set->ops->gc_init)
+ goto err_set_elem_expr;
+ set->ops->gc_init(set);
+ }
+
+ return expr;
+
+err_set_elem_expr:
+ nft_expr_destroy(ctx, expr);
+ return ERR_PTR(err);
+}
+
void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *key_end,
@@ -4827,6 +4886,17 @@ void *nft_set_elem_init(const struct nft_set *set,
return elem;
}
+static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_expr *expr)
+{
+ if (expr->ops->destroy_clone) {
+ expr->ops->destroy_clone(ctx, expr);
+ module_put(expr->ops->type->owner);
+ } else {
+ nf_tables_expr_destroy(ctx, expr);
+ }
+}
+
void nft_set_elem_destroy(const struct nft_set *set, void *elem,
bool destroy_expr)
{
@@ -4839,16 +4909,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
- if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
- struct nft_expr *expr = nft_set_ext_expr(ext);
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
- if (expr->ops->destroy_clone) {
- expr->ops->destroy_clone(&ctx, expr);
- module_put(expr->ops->type->owner);
- } else {
- nf_tables_expr_destroy(&ctx, expr);
- }
- }
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
(*nft_set_ext_obj(ext))->use--;
kfree(elem);
@@ -4864,7 +4927,8 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
+ nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
+
kfree(elem);
}
@@ -4878,6 +4942,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_elem elem;
struct nft_set_binding *binding;
struct nft_object *obj = NULL;
+ struct nft_expr *expr = NULL;
struct nft_userdata *udata;
struct nft_data_desc desc;
struct nft_data data;
@@ -4945,10 +5010,29 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return err;
}
+ if (nla[NFTA_SET_ELEM_EXPR] != NULL) {
+ expr = nft_set_elem_expr_alloc(ctx, set,
+ nla[NFTA_SET_ELEM_EXPR]);
+ if (IS_ERR(expr))
+ return PTR_ERR(expr);
+
+ err = -EOPNOTSUPP;
+ if (set->expr && set->expr->ops != expr->ops)
+ goto err_set_elem_expr;
+ } else if (set->expr) {
+ expr = kzalloc(set->expr->ops->size, GFP_KERNEL);
+ if (!expr)
+ return -ENOMEM;
+
+ err = nft_expr_clone(expr, set->expr);
+ if (err < 0)
+ goto err_set_elem_expr;
+ }
+
err = nft_setelem_parse_key(ctx, set, &elem.key.val,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
- return err;
+ goto err_set_elem_expr;
nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
@@ -4967,6 +5051,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
}
+ if (expr)
+ nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR,
+ expr->ops->size);
+
if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
if (!(set->flags & NFT_SET_OBJECT)) {
err = -EINVAL;
@@ -5051,6 +5139,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
*nft_set_ext_obj(ext) = obj;
obj->use++;
}
+ if (expr) {
+ memcpy(nft_set_ext_expr(ext), expr, expr->ops->size);
+ kfree(expr);
+ expr = NULL;
+ }
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
if (trans == NULL)
@@ -5077,6 +5170,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = -EBUSY;
else if (!(nlmsg_flags & NLM_F_EXCL))
err = 0;
+ } else if (err == -ENOTEMPTY) {
+ /* ENOTEMPTY reports overlapping between this element
+ * and an existing one.
+ */
+ err = -EEXIST;
}
goto err_element_clash;
}
@@ -5098,7 +5196,8 @@ err_element_clash:
err_trans:
if (obj)
obj->use--;
- kfree(elem.priv);
+
+ nf_tables_set_elem_destroy(ctx, set, elem.priv);
err_parse_data:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
nft_data_release(&data, desc.type);
@@ -5106,6 +5205,9 @@ err_parse_key_end:
nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
err_parse_key:
nft_data_release(&elem.key.val, NFT_DATA_VALUE);
+err_set_elem_expr:
+ if (expr != NULL)
+ nft_expr_destroy(ctx, expr);
return err;
}
@@ -5360,7 +5462,6 @@ void nft_set_gc_batch_release(struct rcu_head *rcu)
nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
kfree(gcb);
}
-EXPORT_SYMBOL_GPL(nft_set_gc_batch_release);
struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gfp_t gfp)
@@ -5373,7 +5474,6 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gcb->head.set = set;
return gcb;
}
-EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
/*
* Stateful objects
@@ -6284,7 +6384,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (nla[NFTA_FLOWTABLE_FLAGS]) {
flowtable->data.flags =
ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
- if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD)
+ if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK)
goto err3;
}
@@ -6300,8 +6400,13 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
goto err4;
err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable);
- if (err < 0)
+ if (err < 0) {
+ list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) {
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
+ }
goto err4;
+ }
err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
if (err < 0)
@@ -6967,7 +7072,7 @@ static void nft_commit_release(struct nft_trans *trans)
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_DELSET:
- nft_set_destroy(nft_trans_set(trans));
+ nft_set_destroy(&trans->ctx, nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
nf_tables_set_elem_destroy(&trans->ctx,
@@ -7378,13 +7483,8 @@ static void nf_tables_module_autoload(struct net *net)
list_splice_init(&net->nft.module_list, &module_list);
mutex_unlock(&net->nft.commit_mutex);
list_for_each_entry_safe(req, next, &module_list, list) {
- if (req->done) {
- list_del(&req->list);
- kfree(req);
- } else {
- request_module("%s", req->module);
- req->done = true;
- }
+ request_module("%s", req->module);
+ req->done = true;
}
mutex_lock(&net->nft.commit_mutex);
list_splice(&module_list, &net->nft.module_list);
@@ -7403,7 +7503,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_set_destroy(nft_trans_set(trans));
+ nft_set_destroy(&trans->ctx, nft_trans_set(trans));
break;
case NFT_MSG_NEWSETELEM:
nft_set_elem_destroy(nft_trans_elem_set(trans),
@@ -8129,7 +8229,7 @@ static void __nft_release_tables(struct net *net)
list_for_each_entry_safe(set, ns, &table->sets, list) {
list_del(&set->list);
table->use--;
- nft_set_destroy(set);
+ nft_set_destroy(&ctx, set);
}
list_for_each_entry_safe(obj, ne, &table->objects, list) {
nft_obj_del(obj);
@@ -8167,6 +8267,7 @@ static void __net_exit nf_tables_exit_net(struct net *net)
__nft_release_tables(net);
mutex_unlock(&net->nft.commit_mutex);
WARN_ON_ONCE(!list_empty(&net->nft.tables));
+ WARN_ON_ONCE(!list_empty(&net->nft.module_list));
}
static struct pernet_operations nf_tables_net_ops = {
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 2bb28483af22..954bccb7f32a 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -313,7 +313,7 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain,
nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);
- flow_indr_block_call(dev, &bo, cmd);
+ flow_indr_block_call(dev, &bo, cmd, TC_SETUP_BLOCK);
if (list_empty(&bo.cb_list))
return -EOPNOTSUPP;
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
deleted file mode 100644
index 586b621007eb..000000000000
--- a/net/netfilter/nf_tables_set_core.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/module.h>
-#include <net/netfilter/nf_tables_core.h>
-
-static int __init nf_tables_set_module_init(void)
-{
- nft_register_set(&nft_set_hash_fast_type);
- nft_register_set(&nft_set_hash_type);
- nft_register_set(&nft_set_rhash_type);
- nft_register_set(&nft_set_bitmap_type);
- nft_register_set(&nft_set_rbtree_type);
- nft_register_set(&nft_set_pipapo_type);
-
- return 0;
-}
-
-static void __exit nf_tables_set_module_exit(void)
-{
- nft_unregister_set(&nft_set_pipapo_type);
- nft_unregister_set(&nft_set_rbtree_type);
- nft_unregister_set(&nft_set_bitmap_type);
- nft_unregister_set(&nft_set_rhash_type);
- nft_unregister_set(&nft_set_hash_type);
- nft_unregister_set(&nft_set_hash_fast_type);
-}
-
-module_init(nf_tables_set_module_init);
-module_exit(nf_tables_set_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 2481470dec36..5827117f2635 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -33,7 +33,7 @@ struct nf_acct {
refcount_t refcnt;
char name[NFACCT_NAME_MAX];
struct rcu_head rcu_head;
- char data[0];
+ char data[];
};
struct nfacct_filter {
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index de3a9596b7f1..a5f294aa8e4c 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -742,6 +742,8 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
[NFCTH_NAME] = { .type = NLA_NUL_STRING,
.len = NF_CT_HELPER_NAME_LEN-1 },
[NFCTH_QUEUE_NUM] = { .type = NLA_U32, },
+ [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, },
+ [NFCTH_STATUS] = { .type = NLA_U32, },
};
static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = {
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 76535fd9278c..3243a31f6e82 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -737,12 +737,6 @@ static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
#define nf_bridge_adjust_segmented_data(s) do {} while (0)
#endif
-static void free_entry(struct nf_queue_entry *entry)
-{
- nf_queue_entry_release_refs(entry);
- kfree(entry);
-}
-
static int
__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
struct sk_buff *skb, struct nf_queue_entry *entry)
@@ -768,7 +762,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
entry_seg->skb = skb;
ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
if (ret)
- free_entry(entry_seg);
+ nf_queue_entry_free(entry_seg);
}
return ret;
}
@@ -827,7 +821,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
if (queued) {
if (err) /* some segments are already queued */
- free_entry(entry);
+ nf_queue_entry_free(entry);
kfree_skb(skb);
return 0;
}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 0ed2281f03be..bc37d6c59db4 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -93,7 +93,7 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
static int nft_bitwise_init_bool(struct nft_bitwise *priv,
const struct nlattr *const tb[])
{
- struct nft_data_desc d1, d2;
+ struct nft_data_desc mask, xor;
int err;
if (tb[NFTA_BITWISE_DATA])
@@ -103,29 +103,29 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv,
!tb[NFTA_BITWISE_XOR])
return -EINVAL;
- err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1,
+ err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask,
tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
- if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) {
+ if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) {
err = -EINVAL;
goto err1;
}
- err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
+ err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor,
tb[NFTA_BITWISE_XOR]);
if (err < 0)
goto err1;
- if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) {
+ if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) {
err = -EINVAL;
goto err2;
}
return 0;
err2:
- nft_data_release(&priv->xor, d2.type);
+ nft_data_release(&priv->xor, xor.type);
err1:
- nft_data_release(&priv->mask, d1.type);
+ nft_data_release(&priv->mask, mask.type);
return err;
}
diff --git a/net/netfilter/nft_chain_nat.c b/net/netfilter/nft_chain_nat.c
index ff9ac8ae0031..eac4a901233f 100644
--- a/net/netfilter/nft_chain_nat.c
+++ b/net/netfilter/nft_chain_nat.c
@@ -89,6 +89,7 @@ static const struct nft_chain_type nft_chain_nat_inet = {
.name = "nat",
.type = NFT_CHAIN_T_NAT,
.family = NFPROTO_INET,
+ .owner = THIS_MODULE,
.hook_mask = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_LOCAL_IN) |
(1 << NF_INET_LOCAL_OUT) |
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 683785225a3e..64ca13a1885b 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -24,23 +24,6 @@ struct nft_dynset {
struct nft_set_binding binding;
};
-static int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
-{
- int err;
-
- if (src->ops->clone) {
- dst->ops = src->ops;
- err = src->ops->clone(dst, src);
- if (err < 0)
- return err;
- } else {
- memcpy(dst, src, src->ops->size);
- }
-
- __module_get(src->ops->type->owner);
- return 0;
-}
-
static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
struct nft_regs *regs)
{
@@ -81,7 +64,6 @@ void nft_dynset_eval(const struct nft_expr *expr,
const struct nft_dynset *priv = nft_expr_priv(expr);
struct nft_set *set = priv->set;
const struct nft_set_ext *ext;
- const struct nft_expr *sexpr;
u64 timeout;
if (priv->op == NFT_DYNSET_OP_DELETE) {
@@ -91,18 +73,13 @@ void nft_dynset_eval(const struct nft_expr *expr,
if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
expr, regs, &ext)) {
- sexpr = NULL;
- if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- sexpr = nft_set_ext_expr(ext);
-
if (priv->op == NFT_DYNSET_OP_UPDATE &&
nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
timeout = priv->timeout ? : set->timeout;
*nft_set_ext_expiration(ext) = get_jiffies_64() + timeout;
}
- if (sexpr != NULL)
- sexpr->ops->eval(sexpr, regs, pkt);
+ nft_set_elem_update_expr(ext, regs, pkt);
if (priv->invert)
regs->verdict.code = NFT_BREAK;
@@ -206,20 +183,14 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
if (!(set->flags & NFT_SET_EVAL))
return -EINVAL;
- priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
+ priv->expr = nft_set_elem_expr_alloc(ctx, set,
+ tb[NFTA_DYNSET_EXPR]);
if (IS_ERR(priv->expr))
return PTR_ERR(priv->expr);
- err = -EOPNOTSUPP;
- if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL))
- goto err1;
-
- if (priv->expr->ops->type->flags & NFT_EXPR_GC) {
- if (set->flags & NFT_SET_TIMEOUT)
- goto err1;
- if (!set->ops->gc_init)
- goto err1;
- set->ops->gc_init(set);
+ if (set->expr && set->expr->ops != priv->expr->ops) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
}
}
@@ -239,7 +210,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
err = nf_tables_bind_set(ctx, set, &priv->binding);
if (err < 0)
- goto err1;
+ goto err_expr_free;
if (set->size == 0)
set->size = 0xffff;
@@ -247,7 +218,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
priv->set = set;
return 0;
-err1:
+err_expr_free:
if (priv->expr != NULL)
nft_expr_destroy(ctx, priv->expr);
return err;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a5e8469859e3..07782836fad6 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -228,7 +228,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
unsigned int i, optl, tcphdr_len, offset;
struct tcphdr *tcph;
u8 *opt;
- u32 src;
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
if (!tcph)
@@ -237,7 +236,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
opt = (u8 *)tcph;
for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
union {
- u8 octet;
__be16 v16;
__be32 v32;
} old, new;
@@ -259,13 +257,13 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (!tcph)
return;
- src = regs->data[priv->sreg];
offset = i + priv->offset;
switch (priv->len) {
case 2:
old.v16 = get_unaligned((u16 *)(opt + offset));
- new.v16 = src;
+ new.v16 = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg]);
switch (priv->type) {
case TCPOPT_MSS:
@@ -283,7 +281,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
old.v16, new.v16, false);
break;
case 4:
- new.v32 = src;
+ new.v32 = regs->data[priv->sreg];
old.v32 = get_unaligned((u32 *)(opt + offset));
if (old.v32 == new.v32)
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index aba11c2333f3..3087e23297db 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -28,6 +28,9 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
+ /* This is used by ifb only. */
+ skb_set_redirected(pkt->skb, true);
+
nf_fwd_netdev_egress(pkt, oif);
regs->verdict.code = NF_STOLEN;
}
@@ -190,6 +193,13 @@ nla_put_failure:
return -1;
}
+static int nft_fwd_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
+}
+
static struct nft_expr_type nft_fwd_netdev_type;
static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.type = &nft_fwd_netdev_type,
@@ -197,6 +207,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.eval = nft_fwd_neigh_eval,
.init = nft_fwd_neigh_init,
.dump = nft_fwd_neigh_dump,
+ .validate = nft_fwd_validate,
};
static const struct nft_expr_ops nft_fwd_netdev_ops = {
@@ -205,6 +216,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.eval = nft_fwd_netdev_eval,
.init = nft_fwd_netdev_init,
.dump = nft_fwd_netdev_dump,
+ .validate = nft_fwd_validate,
.offload = nft_fwd_netdev_offload,
};
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 660bad688e2b..1e70359d633c 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -43,6 +43,7 @@ void nft_lookup_eval(const struct nft_expr *expr,
nft_data_copy(&regs->data[priv->dreg],
nft_set_ext_data(ext), set->dlen);
+ nft_set_elem_update_expr(ext, regs, pkt);
}
static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = {
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 1993af3a2979..a7de3a58f553 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -129,6 +129,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
[NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 },
};
static int nft_payload_init(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 87e8d9ba0c9b..32f0fc8be3a4 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -81,6 +81,7 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
u32 idx, off;
nft_bitmap_location(set, key, &idx, &off);
+ *ext = NULL;
return nft_bitmap_active(priv->bitmap, idx, off, genmask);
}
@@ -285,6 +286,8 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
/* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. */
if (desc->klen > 2)
return false;
+ else if (desc->expr)
+ return false;
est->size = nft_bitmap_total_size(desc->klen);
est->lookup = NFT_SET_CLASS_O_1;
@@ -293,8 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-struct nft_set_type nft_set_bitmap_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_bitmap_type = {
.ops = {
.privsize = nft_bitmap_privsize,
.elemsize = offsetof(struct nft_bitmap_elem, ext),
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index d350a7cd3af0..4d3f147e8d8d 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -662,8 +662,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
return true;
}
-struct nft_set_type nft_set_rhash_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_rhash_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT | NFT_SET_EVAL,
.ops = {
@@ -686,8 +685,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = {
},
};
-struct nft_set_type nft_set_hash_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_hash_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
.privsize = nft_hash_privsize,
@@ -706,8 +704,7 @@ struct nft_set_type nft_set_hash_type __read_mostly = {
},
};
-struct nft_set_type nft_set_hash_fast_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_hash_fast_type = {
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
.privsize = nft_hash_privsize,
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index f0cb1e13af50..87aabf651cfe 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -203,7 +203,7 @@
* ::
*
* rule indices in last field: 0 1
- * map to elements: 0x42 0x66
+ * map to elements: 0x66 0x42
*
*
* Matching
@@ -298,7 +298,7 @@
* ::
*
* rule indices in last field: 0 1
- * map to elements: 0x42 0x66
+ * map to elements: 0x66 0x42
*
* the matching element is at 0x42.
*
@@ -330,144 +330,22 @@
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/log2.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <uapi/linux/netfilter/nf_tables.h>
-#include <net/ipv6.h> /* For the maximum length of a field */
#include <linux/bitmap.h>
#include <linux/bitops.h>
-/* Count of concatenated fields depends on count of 32-bit nftables registers */
-#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
-
-/* Largest supported field size */
-#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
-#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
-
-/* Number of bits to be grouped together in lookup table buckets, arbitrary */
-#define NFT_PIPAPO_GROUP_BITS 4
-#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS)
-
-/* Fields are padded to 32 bits in input registers */
-#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \
- (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32)))
-#define NFT_PIPAPO_GROUPS_PADDING(x) \
- (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE)
-
-/* Number of buckets, given by 2 ^ n, with n grouped bits */
-#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS)
-
-/* Each n-bit range maps to up to n * 2 rules */
-#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
-
-/* Use the rest of mapping table buckets for rule indices, but it makes no sense
- * to exceed 32 bits
- */
-#if BITS_PER_LONG == 64
-#define NFT_PIPAPO_MAP_TOBITS 32
-#else
-#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
-#endif
-
-/* ...which gives us the highest allowed index for a rule */
-#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \
- - (1UL << NFT_PIPAPO_MAP_NBITS))
-
-#define nft_pipapo_for_each_field(field, index, match) \
- for ((field) = (match)->f, (index) = 0; \
- (index) < (match)->field_count; \
- (index)++, (field)++)
-
-/**
- * union nft_pipapo_map_bucket - Bucket of mapping table
- * @to: First rule number (in next field) this rule maps to
- * @n: Number of rules (in next field) this rule maps to
- * @e: If there's no next field, pointer to element this rule maps to
- */
-union nft_pipapo_map_bucket {
- struct {
-#if BITS_PER_LONG == 64
- static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
- u32 to;
-
- static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
- u32 n;
-#else
- unsigned long to:NFT_PIPAPO_MAP_TOBITS;
- unsigned long n:NFT_PIPAPO_MAP_NBITS;
-#endif
- };
- struct nft_pipapo_elem *e;
-};
-
-/**
- * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
- * @groups: Amount of 4-bit groups
- * @rules: Number of inserted rules
- * @bsize: Size of each bucket in lookup table, in longs
- * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets
- * @mt: Mapping table: one bucket per rule
- */
-struct nft_pipapo_field {
- int groups;
- unsigned long rules;
- size_t bsize;
- unsigned long *lt;
- union nft_pipapo_map_bucket *mt;
-};
-
-/**
- * struct nft_pipapo_match - Data used for lookup and matching
- * @field_count Amount of fields in set
- * @scratch: Preallocated per-CPU maps for partial matching results
- * @bsize_max: Maximum lookup table bucket size of all fields, in longs
- * @rcu Matching data is swapped on commits
- * @f: Fields, with lookup and mapping tables
- */
-struct nft_pipapo_match {
- int field_count;
- unsigned long * __percpu *scratch;
- size_t bsize_max;
- struct rcu_head rcu;
- struct nft_pipapo_field f[0];
-};
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
/* Current working bitmap index, toggled between field matches */
static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index);
/**
- * struct nft_pipapo - Representation of a set
- * @match: Currently in-use matching data
- * @clone: Copy where pending insertions and deletions are kept
- * @groups: Total amount of 4-bit groups for fields in this set
- * @width: Total bytes to be matched for one packet, including padding
- * @dirty: Working copy has pending insertions or deletions
- * @last_gc: Timestamp of last garbage collection run, jiffies
- */
-struct nft_pipapo {
- struct nft_pipapo_match __rcu *match;
- struct nft_pipapo_match *clone;
- int groups;
- int width;
- bool dirty;
- unsigned long last_gc;
-};
-
-struct nft_pipapo_elem;
-
-/**
- * struct nft_pipapo_elem - API-facing representation of single set element
- * @ext: nftables API extensions
- */
-struct nft_pipapo_elem {
- struct nft_set_ext ext;
-};
-
-/**
* pipapo_refill() - For each set bit, set bits from selected mapping table item
* @map: Bitmap to be scanned for set bits
* @len: Length of bitmap in longs
@@ -484,9 +362,8 @@ struct nft_pipapo_elem {
*
* Return: -1 on no match, bit position on 'match_only', 0 otherwise.
*/
-static int pipapo_refill(unsigned long *map, int len, int rules,
- unsigned long *dst, union nft_pipapo_map_bucket *mt,
- bool match_only)
+int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool match_only)
{
unsigned long bitset;
int k, ret = -1;
@@ -503,7 +380,7 @@ static int pipapo_refill(unsigned long *map, int len, int rules,
return -1;
}
- if (unlikely(match_only)) {
+ if (match_only) {
bitmap_clear(map, i, 1);
return i;
}
@@ -559,26 +436,18 @@ static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1;
- unsigned long *lt = f->lt;
- int b, group;
+ int b;
- /* For each 4-bit group: select lookup table bucket depending on
+ /* For each bit group: select lookup table bucket depending on
* packet bytes value, then AND bucket value
*/
- for (group = 0; group < f->groups; group += 2) {
- u8 v;
-
- v = *rp >> 4;
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
-
- v = *rp & 0x0f;
- rp++;
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
- }
+ if (likely(f->bb == 8))
+ pipapo_and_field_buckets_8bit(f, res_map, rp);
+ else
+ pipapo_and_field_buckets_4bit(f, res_map, rp);
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+ rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -621,7 +490,7 @@ next_match:
map_index = !map_index;
swap(res_map, fill_map);
- rp += NFT_PIPAPO_GROUPS_PADDING(f->groups);
+ rp += NFT_PIPAPO_GROUPS_PADDING(f);
}
out:
@@ -669,26 +538,19 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1;
- unsigned long *lt = f->lt;
- int b, group;
+ int b;
- /* For each 4-bit group: select lookup table bucket depending on
+ /* For each bit group: select lookup table bucket depending on
* packet bytes value, then AND bucket value
*/
- for (group = 0; group < f->groups; group++) {
- u8 v;
-
- if (group % 2) {
- v = *data & 0x0f;
- data++;
- } else {
- v = *data >> 4;
- }
- __bitmap_and(res_map, res_map, lt + v * f->bsize,
- f->bsize * BITS_PER_LONG);
+ if (f->bb == 8)
+ pipapo_and_field_buckets_8bit(f, res_map, data);
+ else if (f->bb == 4)
+ pipapo_and_field_buckets_4bit(f, res_map, data);
+ else
+ BUG();
- lt += f->bsize * NFT_PIPAPO_BUCKETS;
- }
+ data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -713,7 +575,7 @@ next_match:
goto out;
}
- data += NFT_PIPAPO_GROUPS_PADDING(f->groups);
+ data += NFT_PIPAPO_GROUPS_PADDING(f);
/* Swap bitmap indices: fill_map will be the initial bitmap for
* the next field (i.e. the new res_map), and res_map is
@@ -736,8 +598,8 @@ out:
* @elem: nftables API element representation containing key data
* @flags: Unused
*/
-void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem, unsigned int flags)
+static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem, unsigned int flags)
{
return pipapo_get(net, set, (const u8 *)elem->key.val.data,
nft_genmask_cur(net));
@@ -763,6 +625,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
int group, bucket;
new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG);
+#ifdef NFT_PIPAPO_ALIGN
+ new_bucket_size = roundup(new_bucket_size,
+ NFT_PIPAPO_ALIGN / sizeof(*new_lt));
+#endif
if (new_bucket_size == f->bsize)
goto mt;
@@ -772,15 +638,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules)
else
copy = new_bucket_size;
- new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size *
- sizeof(*new_lt), GFP_KERNEL);
+ new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) *
+ new_bucket_size * sizeof(*new_lt) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL);
if (!new_lt)
return -ENOMEM;
- new_p = new_lt;
- old_p = old_lt;
+ new_p = NFT_PIPAPO_LT_ALIGN(new_lt);
+ old_p = NFT_PIPAPO_LT_ALIGN(old_lt);
+
for (group = 0; group < f->groups; group++) {
- for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) {
+ for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) {
memcpy(new_p, old_p, copy * sizeof(*new_p));
new_p += copy;
old_p += copy;
@@ -807,7 +676,7 @@ mt:
if (new_lt) {
f->bsize = new_bucket_size;
- f->lt = new_lt;
+ NFT_PIPAPO_LT_ASSIGN(f, new_lt);
kvfree(old_lt);
}
@@ -829,13 +698,196 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group,
{
unsigned long *pos;
- pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt);
+ pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group;
pos += f->bsize * v;
__set_bit(rule, pos);
}
/**
+ * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits
+ * @old_groups: Number of current groups
+ * @bsize: Size of one bucket, in longs
+ * @old_lt: Pointer to the current lookup table
+ * @new_lt: Pointer to the new, pre-allocated lookup table
+ *
+ * Each bucket with index b in the new lookup table, belonging to group g, is
+ * filled with the bit intersection between:
+ * - bucket with index given by the upper 4 bits of b, from old group 2g, and
+ * - bucket with index given by the lower 4 bits of b, from old group 2g + 1
+ *
+ * That is, given buckets from the new lookup table N(x, y) and the old lookup
+ * table O(x, y), with x bucket index, and y group index:
+ *
+ * N(b, g) := O(b / 16, 2g) & O(b % 16, 2g + 1)
+ *
+ * This ensures equivalence of the matching results on lookup. Two examples in
+ * pictures:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255
+ * 0 ^
+ * 1 | ^
+ * ... ( & ) |
+ * / \ |
+ * / \ .-( & )-.
+ * / bucket \ | |
+ * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 |
+ * 0 / \ | |
+ * 1 \ | |
+ * 2 | --'
+ * 3 '-
+ * ...
+ */
+static void pipapo_lt_4b_to_8b(int old_groups, int bsize,
+ unsigned long *old_lt, unsigned long *new_lt)
+{
+ int g, b, i;
+
+ for (g = 0; g < old_groups / 2; g++) {
+ int src_g0 = g * 2, src_g1 = g * 2 + 1;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) {
+ int src_b0 = b / NFT_PIPAPO_BUCKETS(4);
+ int src_b1 = b % NFT_PIPAPO_BUCKETS(4);
+ int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0;
+ int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1;
+
+ for (i = 0; i < bsize; i++) {
+ *new_lt = old_lt[src_i0 * bsize + i] &
+ old_lt[src_i1 * bsize + i];
+ new_lt++;
+ }
+ }
+ }
+}
+
+/**
+ * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits
+ * @old_groups: Number of current groups
+ * @bsize: Size of one bucket, in longs
+ * @old_lt: Pointer to the current lookup table
+ * @new_lt: Pointer to the new, pre-allocated lookup table
+ *
+ * Each bucket with index b in the new lookup table, belonging to group g, is
+ * filled with the bit union of:
+ * - all the buckets with index such that the upper four bits of the lower byte
+ * equal b, from old group g / 2, with g even
+ * - all the buckets with index such that the lower four bits equal b, from
+ * old group (g - 1) / 2, with g odd
+ *
+ * That is, given buckets from the new lookup table N(x, y) and the old lookup
+ * table O(x, y), with x bucket index, and y group index:
+ *
+ * - with g even: N(b, g) := U(O(x, g / 2) for each x : (x & 0xf0) >> 4 = b)
+ * - with g odd: N(b, g) := U(O(x, (g - 1) / 2) for each x : (x & 0x0f) = b)
+ *
+ * where U() denotes the arbitrary union operation (binary OR of n terms). This
+ * ensures equivalence of the matching results on lookup.
+ */
+static void pipapo_lt_8b_to_4b(int old_groups, int bsize,
+ unsigned long *old_lt, unsigned long *new_lt)
+{
+ int g, b, bsrc, i;
+
+ memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize *
+ sizeof(unsigned long));
+
+ for (g = 0; g < old_groups * 2; g += 2) {
+ int src_g = g / 2;
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) {
+ for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g;
+ bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1);
+ bsrc++) {
+ if (((bsrc & 0xf0) >> 4) != b)
+ continue;
+
+ for (i = 0; i < bsize; i++)
+ new_lt[i] |= old_lt[bsrc * bsize + i];
+ }
+
+ new_lt += bsize;
+ }
+
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) {
+ for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g;
+ bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1);
+ bsrc++) {
+ if ((bsrc & 0x0f) != b)
+ continue;
+
+ for (i = 0; i < bsize; i++)
+ new_lt[i] |= old_lt[bsrc * bsize + i];
+ }
+
+ new_lt += bsize;
+ }
+ }
+}
+
+/**
+ * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed
+ * @f: Field containing lookup table
+ */
+static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f)
+{
+ unsigned long *new_lt;
+ int groups, bb;
+ size_t lt_size;
+
+ lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize *
+ sizeof(*f->lt);
+
+ if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET &&
+ lt_size > NFT_PIPAPO_LT_SIZE_HIGH) {
+ groups = f->groups * 2;
+ bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET;
+
+ lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
+ sizeof(*f->lt);
+ } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET &&
+ lt_size < NFT_PIPAPO_LT_SIZE_LOW) {
+ groups = f->groups / 2;
+ bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET;
+
+ lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize *
+ sizeof(*f->lt);
+
+ /* Don't increase group width if the resulting lookup table size
+ * would exceed the upper size threshold for a "small" set.
+ */
+ if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH)
+ return;
+ } else {
+ return;
+ }
+
+ new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL);
+ if (!new_lt)
+ return;
+
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+ if (f->bb == 4 && bb == 8) {
+ pipapo_lt_4b_to_8b(f->groups, f->bsize,
+ NFT_PIPAPO_LT_ALIGN(f->lt),
+ NFT_PIPAPO_LT_ALIGN(new_lt));
+ } else if (f->bb == 8 && bb == 4) {
+ pipapo_lt_8b_to_4b(f->groups, f->bsize,
+ NFT_PIPAPO_LT_ALIGN(f->lt),
+ NFT_PIPAPO_LT_ALIGN(new_lt));
+ } else {
+ BUG();
+ }
+
+ f->groups = groups;
+ f->bb = bb;
+ kvfree(f->lt);
+ NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+}
+
+/**
* pipapo_insert() - Insert new rule in field given input key and mask length
* @f: Field containing lookup table
* @k: Input key for classification, without nftables padding
@@ -849,7 +901,7 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group,
static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int mask_bits)
{
- int rule = f->rules++, group, ret;
+ int rule = f->rules++, group, ret, bit_offset = 0;
ret = pipapo_resize(f, f->rules - 1, f->rules);
if (ret)
@@ -859,28 +911,33 @@ static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k,
int i, v;
u8 mask;
- if (group % 2)
- v = k[group / 2] & 0x0f;
- else
- v = k[group / 2] >> 4;
+ v = k[group / (BITS_PER_BYTE / f->bb)];
+ v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0);
+ v >>= (BITS_PER_BYTE - bit_offset) - f->bb;
+
+ bit_offset += f->bb;
+ bit_offset %= BITS_PER_BYTE;
- if (mask_bits >= (group + 1) * 4) {
+ if (mask_bits >= (group + 1) * f->bb) {
/* Not masked */
pipapo_bucket_set(f, rule, group, v);
- } else if (mask_bits <= group * 4) {
+ } else if (mask_bits <= group * f->bb) {
/* Completely masked */
- for (i = 0; i < NFT_PIPAPO_BUCKETS; i++)
+ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++)
pipapo_bucket_set(f, rule, group, i);
} else {
/* The mask limit falls on this group */
- mask = 0x0f >> (mask_bits - group * 4);
- for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) {
+ mask = GENMASK(f->bb - 1, 0);
+ mask >>= mask_bits - group * f->bb;
+ for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) {
if ((i & ~mask) == (v & ~mask))
pipapo_bucket_set(f, rule, group, i);
}
}
}
+ pipapo_lt_bits_adjust(f);
+
return 1;
}
@@ -1053,8 +1110,12 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
for_each_possible_cpu(i) {
unsigned long *scratch;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long *scratch_aligned;
+#endif
- scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2,
+ scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 +
+ NFT_PIPAPO_ALIGN_HEADROOM,
GFP_KERNEL, cpu_to_node(i));
if (!scratch) {
/* On failure, there's no need to undo previous
@@ -1070,6 +1131,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
kfree(*per_cpu_ptr(clone->scratch, i));
*per_cpu_ptr(clone->scratch, i) = scratch;
+
+#ifdef NFT_PIPAPO_ALIGN
+ scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch);
+ *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned;
+#endif
}
return 0;
@@ -1098,21 +1164,41 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
struct nft_pipapo_field *f;
int i, bsize_max, err = 0;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
+ end = (const u8 *)nft_set_ext_key_end(ext)->data;
+ else
+ end = start;
+
dup = pipapo_get(net, set, start, genmask);
- if (PTR_ERR(dup) == -ENOENT) {
- if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) {
- end = (const u8 *)nft_set_ext_key_end(ext)->data;
- dup = pipapo_get(net, set, end, nft_genmask_next(net));
- } else {
- end = start;
+ if (!IS_ERR(dup)) {
+ /* Check if we already have the same exact entry */
+ const struct nft_data *dup_key, *dup_end;
+
+ dup_key = nft_set_ext_key(&dup->ext);
+ if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END))
+ dup_end = nft_set_ext_key_end(&dup->ext);
+ else
+ dup_end = dup_key;
+
+ if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) &&
+ !memcmp(end, dup_end->data, sizeof(*dup_end->data))) {
+ *ext2 = &dup->ext;
+ return -EEXIST;
}
+
+ return -ENOTEMPTY;
+ }
+
+ if (PTR_ERR(dup) == -ENOENT) {
+ /* Look for partially overlapping entries */
+ dup = pipapo_get(net, set, end, nft_genmask_next(net));
}
if (PTR_ERR(dup) != -ENOENT) {
if (IS_ERR(dup))
return PTR_ERR(dup);
*ext2 = &dup->ext;
- return -EEXIST;
+ return -ENOTEMPTY;
}
/* Validate */
@@ -1123,11 +1209,11 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
return -ENOSPC;
if (memcmp(start_p, end_p,
- f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0)
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0)
return -EINVAL;
- start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
/* Insert */
@@ -1141,22 +1227,19 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
rulemap[i].to = f->rules;
ret = memcmp(start, end,
- f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
- if (!ret) {
- ret = pipapo_insert(f, start,
- f->groups * NFT_PIPAPO_GROUP_BITS);
- } else {
- ret = pipapo_expand(f, start, end,
- f->groups * NFT_PIPAPO_GROUP_BITS);
- }
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f));
+ if (!ret)
+ ret = pipapo_insert(f, start, f->groups * f->bb);
+ else
+ ret = pipapo_expand(f, start, end, f->groups * f->bb);
if (f->bsize > bsize_max)
bsize_max = f->bsize;
rulemap[i].n = ret;
- start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) {
@@ -1200,23 +1283,35 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
if (!new->scratch)
goto out_scratch;
+#ifdef NFT_PIPAPO_ALIGN
+ new->scratch_aligned = alloc_percpu(*new->scratch_aligned);
+ if (!new->scratch_aligned)
+ goto out_scratch;
+#endif
+
rcu_head_init(&new->rcu);
src = old->f;
dst = new->f;
for (i = 0; i < old->field_count; i++) {
+ unsigned long *new_lt;
+
memcpy(dst, src, offsetof(struct nft_pipapo_field, lt));
- dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS *
- src->bsize * sizeof(*dst->lt),
- GFP_KERNEL);
- if (!dst->lt)
+ new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) *
+ src->bsize * sizeof(*dst->lt) +
+ NFT_PIPAPO_ALIGN_HEADROOM,
+ GFP_KERNEL);
+ if (!new_lt)
goto out_lt;
- memcpy(dst->lt, src->lt,
+ NFT_PIPAPO_LT_ASSIGN(dst, new_lt);
+
+ memcpy(NFT_PIPAPO_LT_ALIGN(new_lt),
+ NFT_PIPAPO_LT_ALIGN(src->lt),
src->bsize * sizeof(*dst->lt) *
- src->groups * NFT_PIPAPO_BUCKETS);
+ src->groups * NFT_PIPAPO_BUCKETS(src->bb));
dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
if (!dst->mt)
@@ -1237,8 +1332,11 @@ out_lt:
kvfree(dst->lt);
dst--;
}
- free_percpu(new->scratch);
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(new->scratch_aligned);
+#endif
out_scratch:
+ free_percpu(new->scratch);
kfree(new);
return ERR_PTR(-ENOMEM);
@@ -1394,9 +1492,10 @@ static void pipapo_drop(struct nft_pipapo_match *m,
unsigned long *pos;
int b;
- pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g *
+ NFT_PIPAPO_BUCKETS(f->bb) * f->bsize;
- for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) {
bitmap_cut(pos, pos, rulemap[i].to,
rulemap[i].n,
f->bsize * BITS_PER_LONG);
@@ -1414,6 +1513,8 @@ static void pipapo_drop(struct nft_pipapo_match *m,
;
}
f->rules -= rulemap[i].n;
+
+ pipapo_lt_bits_adjust(f);
}
}
@@ -1498,6 +1599,9 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
for_each_possible_cpu(i)
kfree(*per_cpu_ptr(m->scratch, i));
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
free_percpu(m->scratch);
pipapo_free_fields(m);
@@ -1690,30 +1794,33 @@ static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set,
static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule,
int rule_count, u8 *left, u8 *right)
{
+ int g, mask_len = 0, bit_offset = 0;
u8 *l = left, *r = right;
- int g, mask_len = 0;
for (g = 0; g < f->groups; g++) {
int b, x0, x1;
x0 = -1;
x1 = -1;
- for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) {
+ for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) {
unsigned long *pos;
- pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize;
+ pos = NFT_PIPAPO_LT_ALIGN(f->lt) +
+ (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize;
if (test_bit(first_rule, pos) && x0 == -1)
x0 = b;
if (test_bit(first_rule + rule_count - 1, pos))
x1 = b;
}
- if (g % 2) {
- *(l++) |= x0 & 0x0f;
- *(r++) |= x1 & 0x0f;
- } else {
- *l |= x0 << 4;
- *r |= x1 << 4;
+ *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset);
+ *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset);
+
+ bit_offset += f->bb;
+ if (bit_offset >= BITS_PER_BYTE) {
+ bit_offset %= BITS_PER_BYTE;
+ l++;
+ r++;
}
if (x1 - x0 == 0)
@@ -1748,8 +1855,9 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
pipapo_get_boundaries(f, first_rule, rule_count, left, right);
- return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) &&
- !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE);
+ return !memcmp(start, left,
+ f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) &&
+ !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f));
}
/**
@@ -1766,11 +1874,13 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem)
{
- const u8 *data = (const u8 *)elem->key.val.data;
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m = priv->clone;
+ struct nft_pipapo_elem *e = elem->priv;
int rules_f0, first_rule = 0;
- struct nft_pipapo_elem *e;
+ const u8 *data;
+
+ data = (const u8 *)nft_set_ext_key(&e->ext);
e = pipapo_get(net, set, data, 0);
if (IS_ERR(e))
@@ -1799,8 +1909,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
rules_fx = f->mt[start].n;
start = f->mt[start].to;
- match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
- match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups);
+ match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
if (i == m->field_count) {
@@ -1883,56 +1993,24 @@ static u64 nft_pipapo_privsize(const struct nlattr * const nla[],
}
/**
- * nft_pipapo_estimate() - Estimate set size, space and lookup complexity
- * @desc: Set description, element count and field description used here
+ * nft_pipapo_estimate() - Set size, space and lookup complexity
+ * @desc: Set description, element count and field description used
* @features: Flags: NFT_SET_INTERVAL needs to be there
* @est: Storage for estimation data
*
- * The size for this set type can vary dramatically, as it depends on the number
- * of rules (composing netmasks) the entries expand to. We compute the worst
- * case here.
- *
- * In general, for a non-ranged entry or a single composing netmask, we need
- * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that
- * is, each input bit needs four bits of matching data), plus a bucket in the
- * mapping table for each field.
- *
- * Return: true only for compatible range concatenations
+ * Return: true if set description is compatible, false otherwise
*/
static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est)
{
- unsigned long entry_size;
- int i;
-
- if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1)
+ if (!(features & NFT_SET_INTERVAL) ||
+ desc->field_count < NFT_PIPAPO_MIN_FIELDS)
return false;
- for (i = 0, entry_size = 0; i < desc->field_count; i++) {
- unsigned long rules;
-
- if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
- return false;
-
- /* Worst-case ranges for each concatenated field: each n-bit
- * field can expand to up to n * 2 rules in each bucket, and
- * each rule also needs a mapping bucket.
- */
- rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
- entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE;
- entry_size += rules * sizeof(union nft_pipapo_map_bucket);
- }
-
- /* Rules in lookup and mapping tables are needed for each entry */
- est->size = desc->size * entry_size;
- if (est->size && div_u64(est->size, desc->size) != entry_size)
+ est->size = pipapo_estimate_size(desc);
+ if (!est->size)
return false;
- est->size += sizeof(struct nft_pipapo) +
- sizeof(struct nft_pipapo_match) * 2;
-
- est->size += sizeof(struct nft_pipapo_field) * desc->field_count;
-
est->lookup = NFT_SET_CLASS_O_LOG_N;
est->space = NFT_SET_CLASS_O_N;
@@ -1959,38 +2037,52 @@ static int nft_pipapo_init(const struct nft_set *set,
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
struct nft_pipapo_field *f;
- int err, i;
+ int err, i, field_count;
+
+ field_count = desc->field_count ? : 1;
- if (desc->field_count > NFT_PIPAPO_MAX_FIELDS)
+ if (field_count > NFT_PIPAPO_MAX_FIELDS)
return -EINVAL;
- m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count,
+ m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count,
GFP_KERNEL);
if (!m)
return -ENOMEM;
- m->field_count = desc->field_count;
+ m->field_count = field_count;
m->bsize_max = 0;
m->scratch = alloc_percpu(unsigned long *);
if (!m->scratch) {
err = -ENOMEM;
- goto out_free;
+ goto out_scratch;
}
for_each_possible_cpu(i)
*per_cpu_ptr(m->scratch, i) = NULL;
+#ifdef NFT_PIPAPO_ALIGN
+ m->scratch_aligned = alloc_percpu(unsigned long *);
+ if (!m->scratch_aligned) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(m->scratch_aligned, i) = NULL;
+#endif
+
rcu_head_init(&m->rcu);
nft_pipapo_for_each_field(f, i, m) {
- f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE;
- priv->groups += f->groups;
+ int len = desc->field_len[i] ? : set->klen;
+
+ f->bb = NFT_PIPAPO_GROUP_BITS_INIT;
+ f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f);
- priv->width += round_up(desc->field_len[i], sizeof(u32));
+ priv->width += round_up(len, sizeof(u32));
f->bsize = 0;
f->rules = 0;
- f->lt = NULL;
+ NFT_PIPAPO_LT_ASSIGN(f, NULL);
f->mt = NULL;
}
@@ -2008,7 +2100,11 @@ static int nft_pipapo_init(const struct nft_set *set,
return 0;
out_free:
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
free_percpu(m->scratch);
+out_scratch:
kfree(m);
return err;
@@ -2043,16 +2139,21 @@ static void nft_pipapo_destroy(const struct nft_set *set)
nft_set_elem_destroy(set, e, true);
}
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(m->scratch_aligned);
+#endif
for_each_possible_cpu(cpu)
kfree(*per_cpu_ptr(m->scratch, cpu));
free_percpu(m->scratch);
-
pipapo_free_fields(m);
kfree(m);
priv->match = NULL;
}
if (priv->clone) {
+#ifdef NFT_PIPAPO_ALIGN
+ free_percpu(priv->clone->scratch_aligned);
+#endif
for_each_possible_cpu(cpu)
kfree(*per_cpu_ptr(priv->clone->scratch, cpu));
free_percpu(priv->clone->scratch);
@@ -2079,8 +2180,7 @@ static void nft_pipapo_gc_init(const struct nft_set *set)
priv->last_gc = jiffies;
}
-struct nft_set_type nft_set_pipapo_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_pipapo_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT,
.ops = {
@@ -2100,3 +2200,26 @@ struct nft_set_type nft_set_pipapo_type __read_mostly = {
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2)
+const struct nft_set_type nft_set_pipapo_avx2_type = {
+ .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
+ NFT_SET_TIMEOUT,
+ .ops = {
+ .lookup = nft_pipapo_avx2_lookup,
+ .insert = nft_pipapo_insert,
+ .activate = nft_pipapo_activate,
+ .deactivate = nft_pipapo_deactivate,
+ .flush = nft_pipapo_flush,
+ .remove = nft_pipapo_remove,
+ .walk = nft_pipapo_walk,
+ .get = nft_pipapo_get,
+ .privsize = nft_pipapo_privsize,
+ .estimate = nft_pipapo_avx2_estimate,
+ .init = nft_pipapo_init,
+ .destroy = nft_pipapo_destroy,
+ .gc_init = nft_pipapo_gc_init,
+ .elemsize = offsetof(struct nft_pipapo_elem, ext),
+ },
+};
+#endif
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
new file mode 100644
index 000000000000..25a75591583e
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo.h
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#ifndef _NFT_SET_PIPAPO_H
+
+#include <linux/log2.h>
+#include <net/ipv6.h> /* For the maximum length of a field */
+
+/* Count of concatenated fields depends on count of 32-bit nftables registers */
+#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
+
+/* Restrict usage to multiple fields, make sure rbtree is used otherwise */
+#define NFT_PIPAPO_MIN_FIELDS 2
+
+/* Largest supported field size */
+#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
+#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
+
+/* Bits to be grouped together in table buckets depending on set size */
+#define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET
+#define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8
+#define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4
+#define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 \
+ BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) || \
+ (NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4))
+#define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb)
+
+/* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the
+ * small group width, and switch to the big group width if the table gets
+ * smaller than NFT_PIPAPO_LT_SIZE_LOW.
+ *
+ * Picking 2MiB as threshold (for a single table) avoids as much as possible
+ * crossing page boundaries on most architectures (x86-64 and MIPS huge pages,
+ * ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which
+ * keeps performance nice in case kvmalloc() gives us non-contiguous areas.
+ */
+#define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21)
+#define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16)
+#define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD
+#define NFT_PIPAPO_LT_SIZE_LOW NFT_PIPAPO_LT_SIZE_THRESHOLD - \
+ NFT_PIPAPO_LT_SIZE_HYSTERESIS
+
+/* Fields are padded to 32 bits in input registers */
+#define NFT_PIPAPO_GROUPS_PADDED_SIZE(f) \
+ (round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32)))
+#define NFT_PIPAPO_GROUPS_PADDING(f) \
+ (NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups / \
+ NFT_PIPAPO_GROUPS_PER_BYTE(f))
+
+/* Number of buckets given by 2 ^ n, with n bucket bits */
+#define NFT_PIPAPO_BUCKETS(bb) (1 << (bb))
+
+/* Each n-bit range maps to up to n * 2 rules */
+#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
+
+/* Use the rest of mapping table buckets for rule indices, but it makes no sense
+ * to exceed 32 bits
+ */
+#if BITS_PER_LONG == 64
+#define NFT_PIPAPO_MAP_TOBITS 32
+#else
+#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
+#endif
+
+/* ...which gives us the highest allowed index for a rule */
+#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \
+ - (1UL << NFT_PIPAPO_MAP_NBITS))
+
+/* Definitions for vectorised implementations */
+#ifdef NFT_PIPAPO_ALIGN
+#define NFT_PIPAPO_ALIGN_HEADROOM \
+ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
+#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
+#define NFT_PIPAPO_LT_ASSIGN(field, x) \
+ do { \
+ (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \
+ (field)->lt = (x); \
+ } while (0)
+#else
+#define NFT_PIPAPO_ALIGN_HEADROOM 0
+#define NFT_PIPAPO_LT_ALIGN(lt) (lt)
+#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x))
+#endif /* NFT_PIPAPO_ALIGN */
+
+#define nft_pipapo_for_each_field(field, index, match) \
+ for ((field) = (match)->f, (index) = 0; \
+ (index) < (match)->field_count; \
+ (index)++, (field)++)
+
+/**
+ * union nft_pipapo_map_bucket - Bucket of mapping table
+ * @to: First rule number (in next field) this rule maps to
+ * @n: Number of rules (in next field) this rule maps to
+ * @e: If there's no next field, pointer to element this rule maps to
+ */
+union nft_pipapo_map_bucket {
+ struct {
+#if BITS_PER_LONG == 64
+ static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
+ u32 to;
+
+ static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
+ u32 n;
+#else
+ unsigned long to:NFT_PIPAPO_MAP_TOBITS;
+ unsigned long n:NFT_PIPAPO_MAP_NBITS;
+#endif
+ };
+ struct nft_pipapo_elem *e;
+};
+
+/**
+ * struct nft_pipapo_field - Lookup, mapping tables and related data for a field
+ * @groups: Amount of bit groups
+ * @rules: Number of inserted rules
+ * @bsize: Size of each bucket in lookup table, in longs
+ * @bb: Number of bits grouped together in lookup table buckets
+ * @lt: Lookup table: 'groups' rows of buckets
+ * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes
+ * @mt: Mapping table: one bucket per rule
+ */
+struct nft_pipapo_field {
+ int groups;
+ unsigned long rules;
+ size_t bsize;
+ int bb;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long *lt_aligned;
+#endif
+ unsigned long *lt;
+ union nft_pipapo_map_bucket *mt;
+};
+
+/**
+ * struct nft_pipapo_match - Data used for lookup and matching
+ * @field_count: Amount of fields in set
+ * @scratch: Preallocated per-CPU maps for partial matching results
+ * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes
+ * @bsize_max: Maximum lookup table bucket size of all fields, in longs
+ * @rcu: Matching data is swapped on commits
+ * @f: Fields, with lookup and mapping tables
+ */
+struct nft_pipapo_match {
+ int field_count;
+#ifdef NFT_PIPAPO_ALIGN
+ unsigned long * __percpu *scratch_aligned;
+#endif
+ unsigned long * __percpu *scratch;
+ size_t bsize_max;
+ struct rcu_head rcu;
+ struct nft_pipapo_field f[];
+};
+
+/**
+ * struct nft_pipapo - Representation of a set
+ * @match: Currently in-use matching data
+ * @clone: Copy where pending insertions and deletions are kept
+ * @width: Total bytes to be matched for one packet, including padding
+ * @dirty: Working copy has pending insertions or deletions
+ * @last_gc: Timestamp of last garbage collection run, jiffies
+ */
+struct nft_pipapo {
+ struct nft_pipapo_match __rcu *match;
+ struct nft_pipapo_match *clone;
+ int width;
+ bool dirty;
+ unsigned long last_gc;
+};
+
+struct nft_pipapo_elem;
+
+/**
+ * struct nft_pipapo_elem - API-facing representation of single set element
+ * @ext: nftables API extensions
+ */
+struct nft_pipapo_elem {
+ struct nft_set_ext ext;
+};
+
+int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool match_only);
+
+/**
+ * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
+ * @f: Field including lookup table
+ * @dst: Area to store result
+ * @data: Input data selecting table buckets
+ */
+static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
+ unsigned long *dst,
+ const u8 *data)
+{
+ unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ int group;
+
+ for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) {
+ u8 v;
+
+ v = *data >> 4;
+ __bitmap_and(dst, dst, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
+
+ v = *data & 0x0f;
+ __bitmap_and(dst, dst, lt + v * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
+ }
+}
+
+/**
+ * pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets
+ * @f: Field including lookup table
+ * @dst: Area to store result
+ * @data: Input data selecting table buckets
+ */
+static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f,
+ unsigned long *dst,
+ const u8 *data)
+{
+ unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
+ int group;
+
+ for (group = 0; group < f->groups; group++, data++) {
+ __bitmap_and(dst, dst, lt + *data * f->bsize,
+ f->bsize * BITS_PER_LONG);
+ lt += f->bsize * NFT_PIPAPO_BUCKETS(8);
+ }
+}
+
+/**
+ * pipapo_estimate_size() - Estimate worst-case for set size
+ * @desc: Set description, element count and field description used here
+ *
+ * The size for this set type can vary dramatically, as it depends on the number
+ * of rules (composing netmasks) the entries expand to. We compute the worst
+ * case here.
+ *
+ * In general, for a non-ranged entry or a single composing netmask, we need
+ * one bit in each of the NFT_PIPAPO_BUCKETS() of the initial group width
+ * (256 buckets for each 8-bit group with NFT_PIPAPO_GROUP_BITS_INIT), plus a
+ * bucket in the mapping table for each field.
+ *
+ * Return: worst-case set size in bytes, 0 on any overflow
+ */
+static u64 pipapo_estimate_size(const struct nft_set_desc *desc)
+{
+ unsigned long entry_size;
+ u64 size;
+ int i;
+
+ for (i = 0, entry_size = 0; i < desc->field_count; i++) {
+ unsigned long rules;
+
+ if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
+ return 0;
+
+ /* Worst-case ranges for each concatenated field: each n-bit
+ * field can expand to up to n * 2 rules in each bucket, and
+ * each rule also needs a mapping bucket.
+ */
+ rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
+ entry_size += rules *
+ NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) /
+ BITS_PER_BYTE;
+ entry_size += rules * sizeof(union nft_pipapo_map_bucket);
+ }
+
+ /* Rules in lookup and mapping tables are needed for each entry */
+ size = desc->size * entry_size;
+ if (size && div_u64(size, desc->size) != entry_size)
+ return 0;
+
+ size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2;
+
+ size += sizeof(struct nft_pipapo_field) * desc->field_count;
+
+ return size;
+}
+
+#endif /* _NFT_SET_PIPAPO_H */
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
new file mode 100644
index 000000000000..d65ae0e23028
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -0,0 +1,1223 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
+ *
+ * Copyright (c) 2019-2020 Red Hat GmbH
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include <linux/compiler.h>
+#include <asm/fpu/api.h>
+
+#include "nft_set_pipapo_avx2.h"
+#include "nft_set_pipapo.h"
+
+#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG)
+
+/* Load from memory into YMM register with non-temporal hint ("stream load"),
+ * that is, don't fetch lines from memory into the cache. This avoids pushing
+ * precious packet data out of the cache hierarchy, and is appropriate when:
+ *
+ * - loading buckets from lookup tables, as they are not going to be used
+ * again before packets are entirely classified
+ *
+ * - loading the result bitmap from the previous field, as it's never used
+ * again
+ */
+#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \
+ asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
+
+/* Stream a single lookup table bucket into YMM register given lookup table,
+ * group index, value of packet bits, bucket size.
+ */
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS(4) + \
+ (v)) * (bsize)])
+#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \
+ NFT_PIPAPO_AVX2_LOAD(reg, \
+ lt[((group) * NFT_PIPAPO_BUCKETS(8) + \
+ (v)) * (bsize)])
+
+/* Bitwise AND: the staple operation of this algorithm */
+#define NFT_PIPAPO_AVX2_AND(dst, a, b) \
+ asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
+
+/* Jump to label if @reg is zero */
+#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
+ asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
+ "je %l[" #label "]" : : : : label)
+
+/* Store 256 bits from YMM register into memory. Contrary to bucket load
+ * operation, we don't bypass the cache here, as stored matching results
+ * are always used shortly after.
+ */
+#define NFT_PIPAPO_AVX2_STORE(loc, reg) \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
+
+/* Zero out a complete YMM register, @reg */
+#define NFT_PIPAPO_AVX2_ZERO(reg) \
+ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
+
+/* Current working bitmap index, toggled between field matches */
+static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
+
+/**
+ * nft_pipapo_avx2_prepare() - Prepare before main algorithm body
+ *
+ * This zeroes out ymm15, which is later used whenever we need to clear a
+ * memory location, by storing its content into memory.
+ */
+static void nft_pipapo_avx2_prepare(void)
+{
+ NFT_PIPAPO_AVX2_ZERO(15);
+}
+
+/**
+ * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
+ * @data: Base memory area
+ * @start: First bit to set
+ * @len: Count of bits to fill
+ *
+ * This is nothing else than a version of bitmap_set(), as used e.g. by
+ * pipapo_refill(), tailored for the microarchitectures using it and better
+ * suited for the specific usage: it's very likely that we'll set a small number
+ * of bits, not crossing a word boundary, and correct branch prediction is
+ * critical here.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ */
+static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
+{
+ int offset = start % BITS_PER_LONG;
+ unsigned long mask;
+
+ data += start / BITS_PER_LONG;
+
+ if (likely(len == 1)) {
+ *data |= BIT(offset);
+ return;
+ }
+
+ if (likely(len < BITS_PER_LONG || offset)) {
+ if (likely(len + offset <= BITS_PER_LONG)) {
+ *data |= GENMASK(len - 1 + offset, offset);
+ return;
+ }
+
+ *data |= ~0UL << offset;
+ len -= BITS_PER_LONG - offset;
+ data++;
+
+ if (len <= BITS_PER_LONG) {
+ mask = ~0UL >> (BITS_PER_LONG - len);
+ *data |= mask;
+ return;
+ }
+ }
+
+ memset(data, 0xff, len / BITS_PER_BYTE);
+ data += len / BITS_PER_LONG;
+
+ len %= BITS_PER_LONG;
+ if (len)
+ *data |= ~0UL >> (BITS_PER_LONG - len);
+}
+
+/**
+ * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
+ * @offset: Start from given bitmap (equivalent to bucket) offset, in longs
+ * @map: Bitmap to be scanned for set bits
+ * @dst: Destination bitmap
+ * @mt: Mapping table containing bit set specifiers
+ * @last: If this is the last field, return the index of the first set bit
+ * instead of filling @dst
+ *
+ * This is an alternative implementation of pipapo_refill() suitable for usage
+ * with AVX2 lookup routines: we know there are four words to be scanned, at
+ * a given offset inside the map, for each matching iteration.
+ *
+ * This function doesn't actually use any AVX2 instruction.
+ *
+ * Return: first set bit index if @last, first filled bit index otherwise, -1 if none.
+ */
+static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
+ unsigned long *dst,
+ union nft_pipapo_map_bucket *mt, bool last)
+{
+ int ret = -1;
+
+#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \
+ do { \
+ while (map[(x)]) { \
+ int r = __builtin_ctzl(map[(x)]); \
+ int i = (offset + (x)) * BITS_PER_LONG + r; \
+ \
+ if (last) \
+ return i; \
+ \
+ nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \
+ \
+ if (ret == -1) \
+ ret = mt[i].to; \
+ \
+ map[(x)] &= ~(1UL << r); \
+ } \
+ } while (0)
+
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
+ NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
+#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * Load buckets from lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap), use the provided starting
+ * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
+ * working bitmap, @fill.
+ *
+ * This is used for 8-bit fields (e.g. protocol numbers).
+ *
+ * Out-of-order (and superscalar) execution is vital here, so it's critical to
+ * avoid false data dependencies. CPU and compiler could (mostly) take care of
+ * this on their own, but the operation ordering is explicitly given here with
+ * a likely execution order in mind, to highlight possible stalls. That's why
+ * a number of logically distinct operations (i.e. loading buckets, intersecting
+ * buckets) are interleaved.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 16-bit fields (e.g. ports).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 6, 7);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 32-bit fields (e.g. IPv4 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_AND(9, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 6, 7);
+ NFT_PIPAPO_AVX2_AND(12, 8, 9);
+ NFT_PIPAPO_AVX2_AND(13, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 6, 7);
+ NFT_PIPAPO_AVX2_AND(13, 8, 9);
+ NFT_PIPAPO_AVX2_AND(14, 10, 11);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(1, 12, 13);
+ NFT_PIPAPO_AVX2_AND(1, 1, 14);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 48-bit fields (e.g. MAC addresses/EUI-48).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 9, 10);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(8, 6, 7);
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (i.e. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ /* Split the sixteen key bytes into 32 4-bit groups, high nibble first */
+ u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
+ pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
+ pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
+ pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf,
+ pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf,
+ pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
+ pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
+ pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
+ };
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Start from the lookup table words matching chunk @offset */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index in @map of the first long word of the chunk handled here */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ /* Previous field's result for this chunk goes to register 0 */
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
+ /* NOTE(review): NOMATCH_GOTO is defined outside this view; presumably
+ * it jumps to the label when the given register has no bits set, so
+ * an empty previous result skips the chunk without touching @map.
+ */
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+
+ /* Registers 0-14 are recycled below: each AND result is consumed
+ * before the register holding it is overwritten by a later load.
+ */
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
+ NFT_PIPAPO_AVX2_AND(8, 1, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
+ NFT_PIPAPO_AVX2_AND(10, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
+ NFT_PIPAPO_AVX2_AND(12, 7, 8);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize);
+ NFT_PIPAPO_AVX2_AND(14, 9, 10);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 8, 9);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 12, 13);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 14, 0);
+ NFT_PIPAPO_AVX2_AND(7, 1, 2);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 3, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 5, 6);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
+ NFT_PIPAPO_AVX2_AND(13, 7, 8);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize);
+ NFT_PIPAPO_AVX2_AND(1, 9, 10);
+ NFT_PIPAPO_AVX2_AND(2, 11, 12);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 13, 14);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize);
+ NFT_PIPAPO_AVX2_AND(9, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
+ NFT_PIPAPO_AVX2_AND(11, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
+
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ NFT_PIPAPO_AVX2_AND(1, 8, 9);
+ NFT_PIPAPO_AVX2_AND(2, 10, 11);
+ NFT_PIPAPO_AVX2_AND(3, 12, 0);
+
+ /* Stalls */
+ NFT_PIPAPO_AVX2_AND(4, 1, 2);
+ NFT_PIPAPO_AVX2_AND(5, 3, 4);
+
+ /* Register 5 holds the final reduction: store it back to @map only
+ * if something matched.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first refilled chunk determines the return value */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ /* NOTE(review): register 15 is never written in this function;
+ * presumably it is kept zeroed elsewhere so this clears the chunk.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 8-bit fields (i.e. protocol numbers).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Start from the lookup table words matching chunk @offset */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index in @map of the first long word of the chunk handled here */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ /* Either way the result ends up in register 2: the bucket alone for
+ * the first field, bucket AND previous result (register 1) otherwise.
+ */
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
+ } else {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_AND(2, 0, 1);
+ /* NOTE(review): NOMATCH_GOTO is defined outside this view;
+ * presumably it jumps when the register has no bits set.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first refilled chunk determines the return value */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ /* NOTE(review): register 15 is never written in this function;
+ * presumably it is kept zeroed elsewhere so this clears the chunk.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 16-bit fields (i.e. ports).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Start from the lookup table words matching chunk @offset */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index in @map of the first long word of the chunk handled here */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ /* Both branches leave the combined result in register 4 */
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ /* Register 0 holds the previous field's result for this chunk */
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ /* NOTE(review): NOMATCH_GOTO is defined outside this view;
+ * presumably it jumps when the register has no bits set.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(4, 3, 2);
+ }
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first refilled chunk determines the return value */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+nomatch:
+ /* NOTE(review): register 15 is never written in this function;
+ * presumably it is kept zeroed elsewhere so this clears the chunk.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 32-bit fields (i.e. IPv4 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Start from the lookup table words matching chunk @offset */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index in @map of the first long word of the chunk handled here */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ /* Both branches leave the combined result in register 0 */
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ } else {
+ /* Register 1 holds the previous field's result for this chunk */
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ /* NOTE(review): NOMATCH_GOTO is defined outside this view;
+ * presumably it jumps when the register has no bits set.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(7, 4, 5);
+ NFT_PIPAPO_AVX2_AND(0, 6, 7);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first refilled chunk determines the return value */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ /* NOTE(review): register 15 is never written in this function;
+ * presumably it is kept zeroed elsewhere so this clears the chunk.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Start from the lookup table words matching chunk @offset */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index in @map of the first long word of the chunk handled here */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ /* Both branches leave the combined result in register 4 */
+ if (first) {
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ /* The sixth byte belongs to the sixth group (index 5), even
+ * though its bucket is loaded into register 6: a six-group
+ * field has no group with index 6.
+ */
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(7, 2, 3);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_AND(1, 6, 7);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ } else {
+ /* Register 1 holds the previous field's result for this chunk */
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ NFT_PIPAPO_AVX2_AND(5, 0, 1);
+ /* NOTE(review): NOMATCH_GOTO is defined outside this view;
+ * presumably it jumps when the register has no bits set.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(3, 0, 1);
+ NFT_PIPAPO_AVX2_AND(4, 2, 3);
+ }
+
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first refilled chunk determines the return value */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ /* NOTE(review): register 15 is never written in this function;
+ * presumably it is kept zeroed elsewhere so this clears the chunk.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * See nft_pipapo_avx2_lookup_4b_2().
+ *
+ * This is used for 128-bit fields (i.e. IPv6 addresses).
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
+ unsigned long *lt = f->lt, bsize = f->bsize;
+
+ /* Start from the lookup table words matching chunk @offset */
+ lt += offset * NFT_PIPAPO_LONGS_PER_M256;
+ for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
+ /* Index in @map of the first long word of the chunk handled here */
+ int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
+
+ /* Previous field's result for this chunk goes to register 0 */
+ if (!first)
+ NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
+ /* NOTE(review): NOMATCH_GOTO is defined outside this view; presumably
+ * it jumps to the label when the given register has no bits set, so
+ * an empty previous result skips the chunk without touching @map.
+ */
+ if (!first) {
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
+ NFT_PIPAPO_AVX2_AND(1, 1, 0);
+ }
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
+
+ /* Registers 0-7 are recycled below: each AND result is consumed
+ * before the register holding it is overwritten by a later load.
+ */
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 1, 2);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 3, 4);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize);
+
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize);
+ NFT_PIPAPO_AVX2_AND(3, 5, 6);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
+
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
+ NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
+ NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
+ NFT_PIPAPO_AVX2_AND(2, 6, 7);
+ NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
+ NFT_PIPAPO_AVX2_AND(4, 0, 1);
+
+ /* Stall */
+ NFT_PIPAPO_AVX2_AND(5, 2, 3);
+ NFT_PIPAPO_AVX2_AND(6, 4, 5);
+
+ /* Register 6 holds the final reduction: store it back to @map only
+ * if something matched.
+ */
+ NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
+
+ b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
+ if (last)
+ return b;
+
+ /* Only the first refilled chunk determines the return value */
+ if (unlikely(ret == -1))
+ ret = b / XSAVE_YMM_SIZE;
+
+ continue;
+
+nomatch:
+ /* NOTE(review): register 15 is never written in this function;
+ * presumably it is kept zeroed elsewhere so this clears the chunk.
+ */
+ NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
+nothing:
+ ;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
+ * @map: Previous match result, used as initial bitmap
+ * @fill: Destination bitmap to be filled with current match result
+ * @f: Field, containing lookup and mapping tables
+ * @offset: Ignore buckets before the given index, no bits are filled there
+ * @pkt: Packet data, pointer to input nftables register
+ * @first: If this is the first field, don't source previous result
+ * @last: Last field: stop at the first match and return bit index
+ *
+ * This function should never be called, but is provided for the case the field
+ * size doesn't match any of the known data types. Matching rate is
+ * substantially lower than AVX2 routines.
+ *
+ * Return: -1 on no match, rule index of match if @last, otherwise first long
+ * word index to be checked next (i.e. first filled word).
+ */
+static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
+ struct nft_pipapo_field *f, int offset,
+ const u8 *pkt, bool first, bool last)
+{
+ /* The generic helpers below take the lookup table from @f themselves,
+ * so no local lookup table cursor is needed here.
+ */
+ unsigned long bsize = f->bsize;
+ int i, ret = -1, b;
+
+ /* No previous result to source: start from an all-ones bitmap */
+ if (first)
+ memset(map, 0xff, bsize * sizeof(*map));
+
+ for (i = offset; i < bsize; i++) {
+ if (f->bb == 8)
+ pipapo_and_field_buckets_8bit(f, map, pkt);
+ else
+ pipapo_and_field_buckets_4bit(f, map, pkt);
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+ b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
+
+ if (last)
+ return b;
+
+ /* Only the first refill result determines the return value */
+ if (ret == -1)
+ ret = b / XSAVE_YMM_SIZE;
+ }
+
+ return ret;
+}
+
+/**
+ * nft_pipapo_avx2_estimate() - Report size and complexity classes for a set
+ * @desc: Set description (element count, field description)
+ * @features: Set flags, NFT_SET_INTERVAL is mandatory
+ * @est: Filled with size, lookup and space estimation on success
+ *
+ * Return: true if the set can be handled and both AVX and AVX2 are available
+ * on the boot CPU, false otherwise.
+ */
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ /* Only interval sets are supported */
+ if (!(features & NFT_SET_INTERVAL))
+ return false;
+
+ if (desc->field_count < NFT_PIPAPO_MIN_FIELDS)
+ return false;
+
+ /* Both CPU features are required */
+ if (!(boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX)))
+ return false;
+
+ /* A zero size estimate means the set can't be handled */
+ est->size = pipapo_estimate_size(desc);
+ if (est->size == 0)
+ return false;
+
+ est->space = NFT_SET_CLASS_O_N;
+ est->lookup = NFT_SET_CLASS_O_LOG_N;
+
+ return true;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: Pointer to the key data to be matched, read field by field
+ * @ext: nftables API extension pointer, filled with matching reference
+ *
+ * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
+ *
+ * This implementation exploits the repetitive characteristic of the algorithm
+ * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
+ *
+ * Return: true on match, false otherwise.
+ */
+bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ unsigned long *res, *fill, *scratch;
+ u8 genmask = nft_genmask_cur(net);
+ const u8 *rp = (const u8 *)key;
+ struct nft_pipapo_match *m;
+ struct nft_pipapo_field *f;
+ bool map_index;
+ int i, ret = 0;
+
+ m = rcu_dereference(priv->match);
+
+ /* This also protects access to all data related to scratch maps */
+ kernel_fpu_begin();
+
+ scratch = *raw_cpu_ptr(m->scratch_aligned);
+ if (unlikely(!scratch)) {
+ kernel_fpu_end();
+ return false;
+ }
+ map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
+
+ /* The scratch area is used as two maps of m->bsize_max words each: the
+ * per-CPU index selects which one is the result map, the other one is
+ * the map to be filled.
+ */
+ res = scratch + (map_index ? m->bsize_max : 0);
+ fill = scratch + (map_index ? 0 : m->bsize_max);
+
+ /* Starting map doesn't need to be set for this implementation */
+
+ nft_pipapo_avx2_prepare();
+
+next_match:
+ nft_pipapo_for_each_field(f, i, m) {
+ bool last = i == m->field_count - 1, first = !i;
+
+ /* The previous return value is passed as @offset to the next lookup
+ * function, and overwritten with its result.
+ */
+#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
+ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
+ ret, rp, \
+ first, last))
+
+ /* Dispatch on group width and number of groups; combinations without
+ * a dedicated routine go through the slow fallback.
+ */
+ if (likely(f->bb == 8)) {
+ if (f->groups == 1) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
+ } else if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
+ } else if (f->groups == 6) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
+ } else if (f->groups == 16) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
+ } else {
+ ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
+ ret, rp,
+ first, last);
+ }
+ } else {
+ if (f->groups == 2) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
+ } else if (f->groups == 4) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
+ } else if (f->groups == 8) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
+ } else if (f->groups == 12) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
+ } else if (f->groups == 32) {
+ NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
+ } else {
+ ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
+ ret, rp,
+ first, last);
+ }
+ }
+ NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
+
+#undef NFT_SET_PIPAPO_AVX2_LOOKUP
+
+ /* Negative return value: no match for this field, give up */
+ if (ret < 0)
+ goto out;
+
+ if (last) {
+ *ext = &f->mt[ret].e->ext;
+ /* Expired or inactive element: look for another match.
+ *
+ * NOTE(review): on this restart path rp has already been advanced
+ * past the earlier fields and res/fill may have been swapped;
+ * neither is reset at next_match -- confirm this is intended.
+ */
+ if (unlikely(nft_set_elem_expired(*ext) ||
+ !nft_set_elem_active(*ext, genmask))) {
+ ret = 0;
+ goto next_match;
+ }
+
+ goto out;
+ }
+
+ /* The map just filled is the starting result for the next field */
+ swap(res, fill);
+ rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ }
+
+out:
+ /* Flip the per-CPU scratch index if the field index we stopped at is odd */
+ if (i % 2)
+ raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
+ kernel_fpu_end();
+
+ return ret >= 0;
+}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
new file mode 100644
index 000000000000..396caf7bfca8
--- /dev/null
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _NFT_SET_PIPAPO_AVX2_H
+#define _NFT_SET_PIPAPO_AVX2_H
+
+#ifdef CONFIG_AS_AVX2
+#include <asm/fpu/xstate.h>
+/* Alignment value computed from XSAVE_YMM_SIZE */
+#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
+
+/* AVX2 lookup and estimate entry points, declared only with CONFIG_AS_AVX2 */
+bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, const struct nft_set_ext **ext);
+bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est);
+#endif /* CONFIG_AS_AVX2 */
+
+#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 5000b938ab1e..3a5552e14f75 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -33,6 +33,11 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
(*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}
+/* An element not flagged as interval end is an interval start */
+static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
+{
+ bool is_end = nft_rbtree_interval_end(rbe);
+
+ return !is_end;
+}
+
static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
const struct nft_rbtree_elem *interval)
{
@@ -64,7 +69,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (interval &&
nft_rbtree_equal(set, this, interval) &&
nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(interval))
+ nft_rbtree_interval_start(interval))
continue;
interval = rbe;
} else if (d > 0)
@@ -89,7 +94,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
- !nft_rbtree_interval_end(interval)) {
+ nft_rbtree_interval_start(interval)) {
*ext = &interval->ext;
return true;
}
@@ -208,8 +213,43 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
u8 genmask = nft_genmask_next(net);
struct nft_rbtree_elem *rbe;
struct rb_node *parent, **p;
+ bool overlap = false;
int d;
+ /* Detect overlaps as we descend the tree. Set the flag in these cases:
+ *
+ * a1. |__ _ _? >|__ _ _ (insert start after existing start)
+ * a2. _ _ __>| ?_ _ __| (insert end before existing end)
+ * a3. _ _ ___| ?_ _ _>| (insert end after existing end)
+ * a4. >|__ _ _ _ _ __| (insert start before existing end)
+ *
+ * and clear it later on, as we eventually reach the points indicated by
+ * '?' above, in the cases described below. We'll always meet these
+ * later, locally, due to tree ordering, and overlaps for the intervals
+ * that are the closest together are always evaluated last.
+ *
+ * b1. |__ _ _! >|__ _ _ (insert start after existing end)
+ * b2. _ _ __>| !_ _ __| (insert end before existing start)
+ * b3. !_____>| (insert end after existing start)
+ *
+ * Case a4. resolves to b1.:
+ * - if the inserted start element is the leftmost, because the '0'
+ * element in the tree serves as end element
+ * - otherwise, if an existing end is found. Note that end elements are
+ * always inserted after corresponding start elements.
+ *
+ * For a new, rightmost pair of elements, we'll hit cases b1. and b3.,
+ * in that order.
+ *
+ * The flag is also cleared in two special cases:
+ *
+ * b4. |__ _ _!|<_ _ _ (insert start right before existing end)
+ * b5. |__ _ >|!__ _ _ (insert end right after existing start)
+ *
+ * which always happen as last step and imply that no further
+ * overlapping is possible.
+ */
+
parent = NULL;
p = &priv->root.rb_node;
while (*p != NULL) {
@@ -218,17 +258,42 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
d = memcmp(nft_set_ext_key(&rbe->ext),
nft_set_ext_key(&new->ext),
set->klen);
- if (d < 0)
+ if (d < 0) {
p = &parent->rb_left;
- else if (d > 0)
+
+ if (nft_rbtree_interval_start(new)) {
+ overlap = nft_rbtree_interval_start(rbe) &&
+ nft_set_elem_active(&rbe->ext,
+ genmask);
+ } else {
+ overlap = nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext,
+ genmask);
+ }
+ } else if (d > 0) {
p = &parent->rb_right;
- else {
+
+ if (nft_rbtree_interval_end(new)) {
+ overlap = nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext,
+ genmask);
+ } else if (nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext, genmask)) {
+ overlap = true;
+ }
+ } else {
if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(new)) {
+ nft_rbtree_interval_start(new)) {
p = &parent->rb_left;
- } else if (!nft_rbtree_interval_end(rbe) &&
+
+ if (nft_set_elem_active(&rbe->ext, genmask))
+ overlap = false;
+ } else if (nft_rbtree_interval_start(rbe) &&
nft_rbtree_interval_end(new)) {
p = &parent->rb_right;
+
+ if (nft_set_elem_active(&rbe->ext, genmask))
+ overlap = false;
} else if (nft_set_elem_active(&rbe->ext, genmask)) {
*ext = &rbe->ext;
return -EEXIST;
@@ -237,6 +302,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
}
}
}
+
+ if (overlap)
+ return -ENOTEMPTY;
+
rb_link_node_rcu(&new->node, parent, p);
rb_insert_color(&new->node, &priv->root);
return 0;
@@ -317,10 +386,10 @@ static void *nft_rbtree_deactivate(const struct net *net,
parent = parent->rb_right;
else {
if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(this)) {
+ nft_rbtree_interval_start(this)) {
parent = parent->rb_left;
continue;
- } else if (!nft_rbtree_interval_end(rbe) &&
+ } else if (nft_rbtree_interval_start(rbe) &&
nft_rbtree_interval_end(this)) {
parent = parent->rb_right;
continue;
@@ -481,8 +550,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-struct nft_set_type nft_set_rbtree_type __read_mostly = {
- .owner = THIS_MODULE,
+const struct nft_set_type nft_set_rbtree_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.ops = {
.privsize = nft_rbtree_privsize,
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 4c3f2e24c7cb..30be5787fbde 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -11,6 +11,7 @@
#include <net/ip_tunnels.h>
#include <net/vxlan.h>
#include <net/erspan.h>
+#include <net/geneve.h>
struct nft_tunnel {
enum nft_tunnel_keys key:8;
@@ -144,6 +145,7 @@ struct nft_tunnel_opts {
union {
struct vxlan_metadata vxlan;
struct erspan_metadata erspan;
+ u8 data[IP_TUNNEL_OPTS_MAX];
} u;
u32 len;
__be16 flags;
@@ -301,9 +303,53 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
return 0;
}
+/* Netlink policy for one nested Geneve option.
+ *
+ * The Geneve option header stores the data length in 4-byte units in a 5-bit
+ * field, so at most 31 * 4 = 124 bytes can be represented. Cap the binary
+ * attribute at 127 bytes: together with the multiple-of-4 check done at init
+ * time, this rejects 128-byte data whose length would wrap around to zero.
+ */
+static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 },
+ [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 },
+ [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 },
+};
+
+/* Parse one nested Geneve option attribute and append it to @opts.
+ *
+ * @attr: nested attribute carrying class, type and data of the option
+ * @opts: tunnel options being built; opts->len is the number of bytes of
+ * opts->u.data already used by previously parsed options
+ *
+ * Return: 0 on success, negative error code if parsing fails, if a mandatory
+ * attribute is missing, if the data length is not a multiple of 4, or if the
+ * option doesn't fit in IP_TUNNEL_OPTS_MAX bytes.
+ */
+static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
+ struct nft_tunnel_opts *opts)
+{
+ /* opts->len is a byte count: add it to the byte pointer before casting,
+ * otherwise the offset would be scaled by sizeof(struct geneve_opt) and
+ * any option after the first one would land at the wrong place.
+ */
+ struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len);
+ struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1];
+ int err, data_len;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_GENEVE_MAX, attr,
+ nft_tunnel_opts_geneve_policy, NULL);
+ if (err < 0)
+ return err;
+
+ /* Class, type and data are all mandatory */
+ if (!tb[NFTA_TUNNEL_KEY_GENEVE_CLASS] ||
+ !tb[NFTA_TUNNEL_KEY_GENEVE_TYPE] ||
+ !tb[NFTA_TUNNEL_KEY_GENEVE_DATA])
+ return -EINVAL;
+
+ /* The option length is stored in units of 4 bytes below */
+ attr = tb[NFTA_TUNNEL_KEY_GENEVE_DATA];
+ data_len = nla_len(attr);
+ if (data_len % 4)
+ return -EINVAL;
+
+ /* Account for header and data, and check bounds before writing */
+ opts->len += sizeof(*opt) + data_len;
+ if (opts->len > IP_TUNNEL_OPTS_MAX)
+ return -EINVAL;
+
+ memcpy(opt->opt_data, nla_data(attr), data_len);
+ opt->length = data_len / 4;
+ opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]);
+ opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]);
+ opts->flags = TUNNEL_GENEVE_OPT;
+
+ return 0;
+}
+
static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_OPTS_UNSPEC] = {
+ .strict_start_type = NFTA_TUNNEL_KEY_OPTS_GENEVE },
[NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, },
[NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_OPTS_GENEVE] = { .type = NLA_NESTED, },
};
static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
@@ -311,22 +357,43 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
struct ip_tunnel_info *info,
struct nft_tunnel_opts *opts)
{
- struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
- int err;
+ int err, rem, type = 0;
+ struct nlattr *nla;
- err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
- nft_tunnel_opts_policy, NULL);
+ err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
+ nft_tunnel_opts_policy, NULL);
if (err < 0)
return err;
- if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
- err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
- opts);
- } else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
- err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
- opts);
- } else {
- return -EOPNOTSUPP;
+ nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+ switch (nla_type(nla)) {
+ case NFTA_TUNNEL_KEY_OPTS_VXLAN:
+ if (type)
+ return -EINVAL;
+ err = nft_tunnel_obj_vxlan_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_VXLAN_OPT;
+ break;
+ case NFTA_TUNNEL_KEY_OPTS_ERSPAN:
+ if (type)
+ return -EINVAL;
+ err = nft_tunnel_obj_erspan_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_ERSPAN_OPT;
+ break;
+ case NFTA_TUNNEL_KEY_OPTS_GENEVE:
+ if (type && type != TUNNEL_GENEVE_OPT)
+ return -EINVAL;
+ err = nft_tunnel_obj_geneve_init(nla, opts);
+ if (err)
+ return err;
+ type = TUNNEL_GENEVE_OPT;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
}
return err;
@@ -339,6 +406,8 @@ static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] =
[NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, },
[NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, },
[NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, },
+ [NFTA_TUNNEL_KEY_SPORT] = { .type = NLA_U16, },
+ [NFTA_TUNNEL_KEY_DPORT] = { .type = NLA_U16, },
[NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, },
};
@@ -516,6 +585,25 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
break;
}
nla_nest_end(skb, inner);
+ } else if (opts->flags & TUNNEL_GENEVE_OPT) {
+ struct geneve_opt *opt;
+ int offset = 0;
+
+ inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE);
+ if (!inner)
+ goto failure;
+ while (opts->len > offset) {
+ opt = (struct geneve_opt *)opts->u.data + offset;
+ if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
+ opt->opt_class) ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE,
+ opt->type) ||
+ nla_put(skb, NFTA_TUNNEL_KEY_GENEVE_DATA,
+ opt->length * 4, opt->opt_data))
+ goto inner_failure;
+ offset += sizeof(*opt) + opt->length * 4;
+ }
+ nla_nest_end(skb, inner);
}
nla_nest_end(skb, nest);
return 0;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e27c6c5ba9df..cd2b034eef59 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1551,6 +1551,9 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file));
struct nf_mttg_trav *trav = seq->private;
+ if (ppos != NULL)
+ ++(*ppos);
+
switch (trav->class) {
case MTTG_TRAV_INIT:
trav->class = MTTG_TRAV_NFP_UNSPEC;
@@ -1576,9 +1579,6 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
default:
return NULL;
}
-
- if (ppos != NULL)
- ++*ppos;
return trav;
}
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index f56d3ed93e56..75bd0e5dd312 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/timer.h>
+#include <linux/alarmtimer.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/netfilter.h>
@@ -30,6 +31,7 @@
struct idletimer_tg {
struct list_head entry;
+ struct alarm alarm;
struct timer_list timer;
struct work_struct work;
@@ -37,6 +39,7 @@ struct idletimer_tg {
struct device_attribute attr;
unsigned int refcnt;
+ u8 timer_type;
};
static LIST_HEAD(idletimer_tg_list);
@@ -62,20 +65,29 @@ static ssize_t idletimer_tg_show(struct device *dev,
{
struct idletimer_tg *timer;
unsigned long expires = 0;
+ struct timespec64 ktimespec = {};
+ long time_diff = 0;
mutex_lock(&list_mutex);
timer = __idletimer_tg_find_by_label(attr->attr.name);
- if (timer)
- expires = timer->timer.expires;
+ if (timer) {
+ if (timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm);
+ ktimespec = ktime_to_timespec64(expires_alarm);
+ time_diff = ktimespec.tv_sec;
+ } else {
+ expires = timer->timer.expires;
+ time_diff = jiffies_to_msecs(expires - jiffies) / 1000;
+ }
+ }
mutex_unlock(&list_mutex);
- if (time_after(expires, jiffies))
- return sprintf(buf, "%u\n",
- jiffies_to_msecs(expires - jiffies) / 1000);
+ if (time_after(expires, jiffies) || ktimespec.tv_sec > 0)
+ return snprintf(buf, PAGE_SIZE, "%ld\n", time_diff);
- return sprintf(buf, "0\n");
+ return snprintf(buf, PAGE_SIZE, "0\n");
}
static void idletimer_tg_work(struct work_struct *work)
@@ -95,6 +107,16 @@ static void idletimer_tg_expired(struct timer_list *t)
schedule_work(&timer->work);
}
+/*
+ * Alarm expiry callback: queue the owning timer's work item and tell the
+ * alarm core not to restart the alarm.
+ */
+static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm,
+							  ktime_t now)
+{
+	struct idletimer_tg *idle = alarm->data;
+
+	pr_debug("alarm %s expired\n", idle->attr.attr.name);
+	schedule_work(&idle->work);
+
+	return ALARMTIMER_NORESTART;
+}
+
static int idletimer_check_sysfs_name(const char *name, unsigned int size)
{
int ret;
@@ -160,6 +182,68 @@ out:
return ret;
}
+/*
+ * Allocate and start a new idletimer for a revision 1 rule.
+ *
+ * Creates the sysfs attribute named after info->label, links the timer into
+ * idletimer_tg_list and arms either an alarm or a regular kernel timer,
+ * depending on info->timer_type. idletimer_tg_checkentry_v1() calls this
+ * with list_mutex held.
+ *
+ * Returns 0 on success or a negative errno; on failure everything allocated
+ * here has been freed again.
+ */
+static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info)
+{
+	int ret;
+
+	info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL);
+	if (!info->timer) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = idletimer_check_sysfs_name(info->label, sizeof(info->label));
+	if (ret < 0)
+		goto out_free_timer;
+
+	sysfs_attr_init(&info->timer->attr.attr);
+	info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL);
+	if (!info->timer->attr.attr.name) {
+		ret = -ENOMEM;
+		goto out_free_timer;
+	}
+	info->timer->attr.attr.mode = 0444;
+	info->timer->attr.show = idletimer_tg_show;
+
+	ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
+	if (ret < 0) {
+		pr_debug("couldn't add file to sysfs\n");
+		goto out_free_attr;
+	}
+
+	/* notify userspace */
+	kobject_uevent(idletimer_tg_kobj, KOBJ_ADD);
+
+	list_add(&info->timer->entry, &idletimer_tg_list);
+	pr_debug("timer type value is %u\n", info->timer_type);
+	info->timer->timer_type = info->timer_type;
+	info->timer->refcnt = 1;
+
+	INIT_WORK(&info->timer->work, idletimer_tg_work);
+
+	if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+		ktime_t tout;
+
+		alarm_init(&info->timer->alarm, ALARM_BOOTTIME,
+			   idletimer_tg_alarmproc);
+		/* The alarm callback recovers the timer through alarm.data. */
+		info->timer->alarm.data = info->timer;
+		tout = ktime_set(info->timeout, 0);
+		alarm_start_relative(&info->timer->alarm, tout);
+	} else {
+		timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
+		mod_timer(&info->timer->timer,
+			  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+	}
+
+	return 0;
+
+out_free_attr:
+	kfree(info->timer->attr.attr.name);
+out_free_timer:
+	kfree(info->timer);
+out:
+	return ret;
+}
+
/*
* The actual xt_tables plugin.
*/
@@ -177,13 +261,30 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
return XT_CONTINUE;
}
-static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+/*
+ * The actual xt_tables plugin.
+ */
+static unsigned int idletimer_tg_target_v1(struct sk_buff *skb,
+ const struct xt_action_param *par)
{
- struct idletimer_tg_info *info = par->targinfo;
- int ret;
+ const struct idletimer_tg_info_v1 *info = par->targinfo;
- pr_debug("checkentry targinfo%s\n", info->label);
+ pr_debug("resetting timer %s, timeout period %u\n",
+ info->label, info->timeout);
+
+ if (info->timer->timer_type & XT_IDLETIMER_ALARM) {
+ ktime_t tout = ktime_set(info->timeout, 0);
+ alarm_start_relative(&info->timer->alarm, tout);
+ } else {
+ mod_timer(&info->timer->timer,
+ msecs_to_jiffies(info->timeout * 1000) + jiffies);
+ }
+ return XT_CONTINUE;
+}
+
+static int idletimer_tg_helper(struct idletimer_tg_info *info)
+{
if (info->timeout == 0) {
pr_debug("timeout value is zero\n");
return -EINVAL;
@@ -198,7 +299,23 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
pr_debug("label is empty or not nul-terminated\n");
return -EINVAL;
}
+ return 0;
+}
+
+static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
+{
+ struct idletimer_tg_info *info = par->targinfo;
+ int ret;
+
+ pr_debug("checkentry targinfo%s\n", info->label);
+
+ ret = idletimer_tg_helper(info);
+ if(ret < 0)
+ {
+ pr_debug("checkentry helper return invalid\n");
+ return -EINVAL;
+ }
mutex_lock(&list_mutex);
info->timer = __idletimer_tg_find_by_label(info->label);
@@ -222,6 +339,65 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
return 0;
}
+/*
+ * checkentry hook for IDLETIMER revision 1.
+ *
+ * Validates the target info, then under list_mutex either takes another
+ * reference on the timer already registered under info->label (re-arming
+ * it) or creates a new one. A rule may not reuse a label with a different
+ * timer type.
+ *
+ * Returns 0 on success, -EINVAL on invalid parameters, or the error from
+ * idletimer_tg_create_v1().
+ */
+static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par)
+{
+	struct idletimer_tg_info_v1 *info = par->targinfo;
+	int ret = 0;
+
+	pr_debug("checkentry targinfo%s\n", info->label);
+
+	if (idletimer_tg_helper((struct idletimer_tg_info *)info) < 0) {
+		pr_debug("checkentry helper return invalid\n");
+		return -EINVAL;
+	}
+
+	if (info->timer_type > XT_IDLETIMER_ALARM) {
+		pr_debug("invalid value for timer type\n");
+		return -EINVAL;
+	}
+
+	mutex_lock(&list_mutex);
+
+	info->timer = __idletimer_tg_find_by_label(info->label);
+	if (!info->timer) {
+		/* First rule using this label: set up a fresh timer. */
+		ret = idletimer_tg_create_v1(info);
+		if (ret < 0)
+			pr_debug("failed to create timer\n");
+		goto unlock;
+	}
+
+	if (info->timer->timer_type != info->timer_type) {
+		pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n");
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	info->timer->refcnt++;
+	if (info->timer_type & XT_IDLETIMER_ALARM) {
+		/* calculate remaining expiry time */
+		ktime_t remaining = alarm_expires_remaining(&info->timer->alarm);
+		struct timespec64 ts = ktime_to_timespec64(remaining);
+
+		if (ts.tv_sec > 0) {
+			pr_debug("time_expiry_remaining %lld\n",
+				 ts.tv_sec);
+			alarm_start_relative(&info->timer->alarm, remaining);
+		}
+	} else {
+		mod_timer(&info->timer->timer,
+			  msecs_to_jiffies(info->timeout * 1000) + jiffies);
+	}
+	pr_debug("increased refcnt of timer %s to %u\n",
+		 info->label, info->timer->refcnt);
+
+unlock:
+	mutex_unlock(&list_mutex);
+	return ret;
+}
+
static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
{
const struct idletimer_tg_info *info = par->targinfo;
@@ -247,7 +423,38 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
mutex_unlock(&list_mutex);
}
-static struct xt_target idletimer_tg __read_mostly = {
+/*
+ * destroy hook for IDLETIMER revision 1.
+ *
+ * Drops one reference on the rule's timer. When the last reference goes
+ * away the timer is unlinked from idletimer_tg_list, its alarm or kernel
+ * timer and pending work are cancelled, the sysfs attribute is removed and
+ * the memory is freed.
+ */
+static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par)
+{
+	const struct idletimer_tg_info_v1 *info = par->targinfo;
+
+	pr_debug("destroy targinfo %s\n", info->label);
+
+	mutex_lock(&list_mutex);
+
+	if (--info->timer->refcnt > 0) {
+		pr_debug("decreased refcnt of timer %s to %u\n",
+			 info->label, info->timer->refcnt);
+		mutex_unlock(&list_mutex);
+		return;
+	}
+
+	pr_debug("deleting timer %s\n", info->label);
+
+	list_del(&info->timer->entry);
+	/*
+	 * Once unlinked, the timer can no longer be found by label, so the
+	 * rest of the teardown runs without list_mutex. This matters because
+	 * idletimer_tg_show() takes list_mutex and sysfs_remove_file() waits
+	 * for running show() callbacks: holding the mutex across the removal
+	 * could deadlock against a concurrent sysfs read.
+	 */
+	mutex_unlock(&list_mutex);
+
+	if (info->timer->timer_type & XT_IDLETIMER_ALARM)
+		alarm_cancel(&info->timer->alarm);
+	else
+		del_timer_sync(&info->timer->timer);
+	cancel_work_sync(&info->timer->work);
+	sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+	kfree(info->timer->attr.attr.name);
+	kfree(info->timer);
+}
+
+
+static struct xt_target idletimer_tg[] __read_mostly = {
+ {
.name = "IDLETIMER",
.family = NFPROTO_UNSPEC,
.target = idletimer_tg_target,
@@ -256,6 +463,20 @@ static struct xt_target idletimer_tg __read_mostly = {
.checkentry = idletimer_tg_checkentry,
.destroy = idletimer_tg_destroy,
.me = THIS_MODULE,
+ },
+ {
+ .name = "IDLETIMER",
+ .family = NFPROTO_UNSPEC,
+ .revision = 1,
+ .target = idletimer_tg_target_v1,
+ .targetsize = sizeof(struct idletimer_tg_info_v1),
+ .usersize = offsetof(struct idletimer_tg_info_v1, timer),
+ .checkentry = idletimer_tg_checkentry_v1,
+ .destroy = idletimer_tg_destroy_v1,
+ .me = THIS_MODULE,
+ },
+
+
};
static struct class *idletimer_tg_class;
@@ -283,7 +504,8 @@ static int __init idletimer_tg_init(void)
idletimer_tg_kobj = &idletimer_tg_device->kobj;
- err = xt_register_target(&idletimer_tg);
+ err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg));
+
if (err < 0) {
pr_debug("couldn't register xt target\n");
goto out_dev;
@@ -300,7 +522,7 @@ out:
static void __exit idletimer_tg_exit(void)
{
- xt_unregister_target(&idletimer_tg);
+ xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg));
device_destroy(idletimer_tg_class, MKDEV(0, 0));
class_destroy(idletimer_tg_class);
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 2317721f3ecb..75625d13e976 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -21,8 +21,6 @@ MODULE_DESCRIPTION("Xtables: packet security mark modification");
MODULE_ALIAS("ipt_SECMARK");
MODULE_ALIAS("ip6t_SECMARK");
-#define PFX "SECMARK: "
-
static u8 mode;
static unsigned int
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index bccd47cd7190..9c5cfd74a0ee 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -36,6 +36,7 @@
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/mutex.h>
#include <linux/kernel.h>
+#include <linux/refcount.h>
#include <uapi/linux/netfilter/xt_hashlimit.h>
#define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \
@@ -114,7 +115,7 @@ struct dsthash_ent {
struct xt_hashlimit_htable {
struct hlist_node node; /* global list of all htables */
- int use;
+ refcount_t use;
u_int8_t family;
bool rnd_initialized;
@@ -131,7 +132,7 @@ struct xt_hashlimit_htable {
const char *name;
struct net *net;
- struct hlist_head hash[0]; /* hashtable itself */
+ struct hlist_head hash[]; /* hashtable itself */
};
static int
@@ -315,7 +316,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
for (i = 0; i < hinfo->cfg.size; i++)
INIT_HLIST_HEAD(&hinfo->hash[i]);
- hinfo->use = 1;
+ refcount_set(&hinfo->use, 1);
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
@@ -401,15 +402,6 @@ static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo)
remove_proc_entry(hinfo->name, parent);
}
-static void htable_destroy(struct xt_hashlimit_htable *hinfo)
-{
- cancel_delayed_work_sync(&hinfo->gc_work);
- htable_remove_proc_entry(hinfo);
- htable_selective_cleanup(hinfo, true);
- kfree(hinfo->name);
- vfree(hinfo);
-}
-
static struct xt_hashlimit_htable *htable_find_get(struct net *net,
const char *name,
u_int8_t family)
@@ -420,7 +412,7 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,
hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) {
if (!strcmp(name, hinfo->name) &&
hinfo->family == family) {
- hinfo->use++;
+ refcount_inc(&hinfo->use);
return hinfo;
}
}
@@ -429,12 +421,16 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,
static void htable_put(struct xt_hashlimit_htable *hinfo)
{
- mutex_lock(&hashlimit_mutex);
- if (--hinfo->use == 0) {
+ if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) {
hlist_del(&hinfo->node);
- htable_destroy(hinfo);
+ htable_remove_proc_entry(hinfo);
+ mutex_unlock(&hashlimit_mutex);
+
+ cancel_delayed_work_sync(&hinfo->gc_work);
+ htable_selective_cleanup(hinfo, true);
+ kfree(hinfo->name);
+ vfree(hinfo);
}
- mutex_unlock(&hashlimit_mutex);
}
/* The algorithm used is the Simple Token Bucket Filter (TBF)
@@ -837,6 +833,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3);
}
+#define HASHLIMIT_MAX_SIZE 1048576
+
static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
struct xt_hashlimit_htable **hinfo,
struct hashlimit_cfg3 *cfg,
@@ -847,6 +845,14 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par,
if (cfg->gc_interval == 0 || cfg->expire == 0)
return -EINVAL;
+ if (cfg->size > HASHLIMIT_MAX_SIZE) {
+ cfg->size = HASHLIMIT_MAX_SIZE;
+ pr_info_ratelimited("size too large, truncated to %u\n", cfg->size);
+ }
+ if (cfg->max > HASHLIMIT_MAX_SIZE) {
+ cfg->max = HASHLIMIT_MAX_SIZE;
+ pr_info_ratelimited("max too large, truncated to %u\n", cfg->max);
+ }
if (par->family == NFPROTO_IPV4) {
if (cfg->srcmask > 32 || cfg->dstmask > 32)
return -EINVAL;
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 0a9708004e20..19bef176145e 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -71,7 +71,7 @@ struct recent_entry {
u_int8_t ttl;
u_int8_t index;
u_int16_t nstamps;
- unsigned long stamps[0];
+ unsigned long stamps[];
};
struct recent_table {
@@ -82,7 +82,7 @@ struct recent_table {
unsigned int entries;
u8 nstamps_max_mask;
struct list_head lru_list;
- struct list_head iphash[0];
+ struct list_head iphash[];
};
struct recent_net {
@@ -492,12 +492,12 @@ static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos)
const struct recent_entry *e = v;
const struct list_head *head = e->list.next;
+ (*pos)++;
while (head == &t->iphash[st->bucket]) {
if (++st->bucket >= ip_list_hash_size)
return NULL;
head = t->iphash[st->bucket].next;
}
- (*pos)++;
return list_entry(head, struct recent_entry, list);
}
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index f5d34da0646e..a1f2320ecc16 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -143,7 +143,8 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain,
if (domain != NULL) {
bkt = netlbl_domhsh_hash(domain);
bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt];
- list_for_each_entry_rcu(iter, bkt_list, list)
+ list_for_each_entry_rcu(iter, bkt_list, list,
+ lockdep_is_held(&netlbl_domhsh_lock))
if (iter->valid &&
netlbl_family_match(iter->family, family) &&
strcmp(iter->domain, domain) == 0)
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index d2e4ab8d1cb1..77bb1bb22c3b 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -207,7 +207,8 @@ static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
bkt = netlbl_unlhsh_hash(ifindex);
bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt];
- list_for_each_entry_rcu(iter, bkt_list, list)
+ list_for_each_entry_rcu(iter, bkt_list, list,
+ lockdep_is_held(&netlbl_unlhsh_lock))
if (iter->valid && iter->ifindex == ifindex)
return iter;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 4e31721e7293..5ded01ca8b20 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -71,7 +71,7 @@
struct listeners {
struct rcu_head rcu;
- unsigned long masks[0];
+ unsigned long masks[];
};
/* state bits */
@@ -1014,7 +1014,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
if (nlk->netlink_bind && groups) {
int group;
- for (group = 0; group < nlk->ngroups; group++) {
+ /* nl_groups is a u32, so cap the maximum groups we can bind */
+ for (group = 0; group < BITS_PER_TYPE(u32); group++) {
if (!test_bit(group, &groups))
continue;
err = nlk->netlink_bind(net, group + 1);
@@ -1033,7 +1034,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
netlink_insert(sk, nladdr->nl_pid) :
netlink_autobind(sock);
if (err) {
- netlink_undo_bind(nlk->ngroups, groups, sk);
+ netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk);
goto unlock;
}
}
@@ -2391,19 +2392,14 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
if (nlk_has_extack && extack && extack->_msg)
tlvlen += nla_total_size(strlen(extack->_msg) + 1);
- if (err) {
- if (!(nlk->flags & NETLINK_F_CAP_ACK))
- payload += nlmsg_len(nlh);
- else
- flags |= NLM_F_CAPPED;
- if (nlk_has_extack && extack && extack->bad_attr)
- tlvlen += nla_total_size(sizeof(u32));
- } else {
+ if (err && !(nlk->flags & NETLINK_F_CAP_ACK))
+ payload += nlmsg_len(nlh);
+ else
flags |= NLM_F_CAPPED;
-
- if (nlk_has_extack && extack && extack->cookie_len)
- tlvlen += nla_total_size(extack->cookie_len);
- }
+ if (err && nlk_has_extack && extack && extack->bad_attr)
+ tlvlen += nla_total_size(sizeof(u32));
+ if (nlk_has_extack && extack && extack->cookie_len)
+ tlvlen += nla_total_size(extack->cookie_len);
if (tlvlen)
flags |= NLM_F_ACK_TLVS;
@@ -2426,20 +2422,16 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG,
extack->_msg));
}
- if (err) {
- if (extack->bad_attr &&
- !WARN_ON((u8 *)extack->bad_attr < in_skb->data ||
- (u8 *)extack->bad_attr >= in_skb->data +
- in_skb->len))
- WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
- (u8 *)extack->bad_attr -
- in_skb->data));
- } else {
- if (extack->cookie_len)
- WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
- extack->cookie_len,
- extack->cookie));
- }
+ if (err && extack->bad_attr &&
+ !WARN_ON((u8 *)extack->bad_attr < in_skb->data ||
+ (u8 *)extack->bad_attr >= in_skb->data +
+ in_skb->len))
+ WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
+ (u8 *)extack->bad_attr -
+ (u8 *)nlh));
+ if (extack->cookie_len)
+ WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
+ extack->cookie_len, extack->cookie));
}
nlmsg_end(skb, rep);
@@ -2582,6 +2574,7 @@ static void *__netlink_seq_next(struct seq_file *seq)
}
static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
+ __acquires(RCU)
{
struct nl_seq_iter *iter = seq->private;
void *obj = SEQ_START_TOKEN;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 0522b2b1fd95..9f357aa22b94 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -497,8 +497,9 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family,
err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
family->policy, validate, extack);
- if (err && parallel) {
- kfree(attrbuf);
+ if (err) {
+ if (parallel)
+ kfree(attrbuf);
return ERR_PTR(err);
}
return attrbuf;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 58d5373c513c..7b1a74f74aad 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1230,6 +1230,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
#ifdef CONFIG_PROC_FS
static void *nr_info_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&nr_list_lock)
{
spin_lock_bh(&nr_list_lock);
return seq_hlist_start_head(&nr_list, *pos);
@@ -1241,6 +1242,7 @@ static void *nr_info_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void nr_info_stop(struct seq_file *seq, void *v)
+ __releases(&nr_list_lock)
{
spin_unlock_bh(&nr_list_lock);
}
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
index d41335bad1f8..79f12d8c7b86 100644
--- a/net/netrom/nr_route.c
+++ b/net/netrom/nr_route.c
@@ -838,6 +838,7 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25)
#ifdef CONFIG_PROC_FS
static void *nr_node_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&nr_node_list_lock)
{
spin_lock_bh(&nr_node_list_lock);
return seq_hlist_start_head(&nr_node_list, *pos);
@@ -849,6 +850,7 @@ static void *nr_node_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void nr_node_stop(struct seq_file *seq, void *v)
+ __releases(&nr_node_list_lock)
{
spin_unlock_bh(&nr_node_list_lock);
}
@@ -893,6 +895,7 @@ const struct seq_operations nr_node_seqops = {
};
static void *nr_neigh_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&nr_neigh_list_lock)
{
spin_lock_bh(&nr_neigh_list_lock);
return seq_hlist_start_head(&nr_neigh_list, *pos);
@@ -904,6 +907,7 @@ static void *nr_neigh_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void nr_neigh_stop(struct seq_file *seq, void *v)
+ __releases(&nr_neigh_list_lock)
{
spin_unlock_bh(&nr_neigh_list_lock);
}
diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c
index 65aaa9d7c813..304b1a9bb18a 100644
--- a/net/nfc/digital_dep.c
+++ b/net/nfc/digital_dep.c
@@ -71,7 +71,7 @@ struct digital_atr_req {
u8 bs;
u8 br;
u8 pp;
- u8 gb[0];
+ u8 gb[];
} __packed;
struct digital_atr_res {
@@ -83,7 +83,7 @@ struct digital_atr_res {
u8 br;
u8 to;
u8 pp;
- u8 gb[0];
+ u8 gb[];
} __packed;
struct digital_psl_req {
diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c
index 6f1b096e601c..43811b5219b5 100644
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -181,13 +181,20 @@ exit:
void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
struct sk_buff *skb)
{
- u8 gate = hdev->pipes[pipe].gate;
u8 status = NFC_HCI_ANY_OK;
struct hci_create_pipe_resp *create_info;
struct hci_delete_pipe_noti *delete_info;
struct hci_all_pipe_cleared_noti *cleared_info;
+ u8 gate;
- pr_debug("from gate %x pipe %x cmd %x\n", gate, pipe, cmd);
+ pr_debug("from pipe %x cmd %x\n", pipe, cmd);
+
+ if (pipe >= NFC_HCI_MAX_PIPES) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+
+ gate = hdev->pipes[pipe].gate;
switch (cmd) {
case NFC_HCI_ADM_NOTIFY_PIPE_CREATED:
@@ -375,8 +382,14 @@ void nfc_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event,
struct sk_buff *skb)
{
int r = 0;
- u8 gate = hdev->pipes[pipe].gate;
+ u8 gate;
+
+ if (pipe >= NFC_HCI_MAX_PIPES) {
+ pr_err("Discarded event %x to invalid pipe %x\n", event, pipe);
+ goto exit;
+ }
+ gate = hdev->pipes[pipe].gate;
if (gate == NFC_HCI_INVALID_GATE) {
pr_err("Discarded event %x to unopened pipe %x\n", event, pipe);
goto exit;
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index eee0dddb7749..e894254c17d4 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -32,6 +32,7 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
[NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING,
.len = NFC_DEVICE_NAME_MAXSIZE },
[NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 },
+ [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 },
[NFC_ATTR_COMM_MODE] = { .type = NLA_U8 },
[NFC_ATTR_RF_MODE] = { .type = NLA_U8 },
[NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 },
@@ -43,7 +44,10 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = {
[NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED },
[NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING,
.len = NFC_FIRMWARE_NAME_MAXSIZE },
+ [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 },
[NFC_ATTR_SE_APDU] = { .type = NLA_BINARY },
+ [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 },
+ [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 },
[NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY },
};
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 7fbfe2adfffa..fc0efd8833c8 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -964,6 +964,25 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
return ovs_dp_upcall(dp, skb, key, &upcall, cutlen);
}
+/*
+ * Run the actions nested in a dec_ttl attribute when the TTL could not be
+ * decremented. If the leading argument attribute is empty, or nothing
+ * follows it, the skb is consumed and 0 is returned; otherwise the result
+ * of clone_execute() on the following actions is returned.
+ */
+static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb,
+				     struct sw_flow_key *key,
+				     const struct nlattr *attr, bool last)
+{
+	/* The first action is always 'OVS_DEC_TTL_ATTR_ARG'. */
+	struct nlattr *arg = nla_data(attr);
+	int remaining = nla_len(attr);
+	struct nlattr *actions;
+
+	if (!nla_len(arg))
+		goto consume;
+
+	actions = nla_next(arg, &remaining);
+	if (!actions)
+		goto consume;
+
+	return clone_execute(dp, skb, key, 0, actions, remaining,
+			     last, false);
+
+consume:
+	consume_skb(skb);
+	return 0;
+}
+
/* When 'last' is true, sample() should always consume the 'skb'.
* Otherwise, sample() should keep 'skb' intact regardless what
* actions are executed within sample().
@@ -1180,6 +1199,45 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
nla_len(actions), last, clone_flow_key);
}
+/*
+ * Decrement the IPv6 hop limit or IPv4 TTL of @skb by one and mirror the
+ * new value into key->ip.ttl. The IPv4 header checksum is adjusted for the
+ * changed TTL.
+ *
+ * Returns 0 on success, -EHOSTUNREACH when the current value is <= 1, or
+ * the error from skb_ensure_writable().
+ */
+static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	struct iphdr *ip4;
+	u8 ttl_before;
+	int err;
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6;
+
+		err = skb_ensure_writable(skb, skb_network_offset(skb) +
+					  sizeof(*ip6));
+		if (unlikely(err))
+			return err;
+
+		ip6 = ipv6_hdr(skb);
+		if (ip6->hop_limit <= 1)
+			return -EHOSTUNREACH;
+
+		ip6->hop_limit--;
+		key->ip.ttl = ip6->hop_limit;
+		return 0;
+	}
+
+	/* Every other protocol is handled as IPv4. */
+	err = skb_ensure_writable(skb, skb_network_offset(skb) +
+				  sizeof(*ip4));
+	if (unlikely(err))
+		return err;
+
+	ip4 = ip_hdr(skb);
+	if (ip4->ttl <= 1)
+		return -EHOSTUNREACH;
+
+	ttl_before = ip4->ttl;
+	ip4->ttl = ttl_before - 1;
+	/* TTL is the high byte of its 16-bit header word. */
+	csum_replace2(&ip4->check, htons(ttl_before << 8),
+		      htons(ip4->ttl << 8));
+	key->ip.ttl = ip4->ttl;
+
+	return 0;
+}
+
/* Execute a list of actions against 'skb'. */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
@@ -1365,6 +1423,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
break;
}
+
+ case OVS_ACTION_ATTR_DEC_TTL:
+ err = execute_dec_ttl(skb, key);
+ if (err == -EHOSTUNREACH) {
+ err = dec_ttl_exception_handler(dp, skb, key,
+ a, true);
+ return err;
+ }
+ break;
}
if (unlikely(err)) {
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 659c2a790fe7..d8ae541d22a8 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -179,7 +179,8 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no)
struct hlist_head *head;
head = vport_hash_bucket(dp, port_no);
- hlist_for_each_entry_rcu(vport, head, dp_hash_node) {
+ hlist_for_each_entry_rcu(vport, head, dp_hash_node,
+ lockdep_ovsl_is_held()) {
if (vport->port_no == port_no)
return vport;
}
@@ -304,7 +305,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
struct sk_buff *segs, *nskb;
int err;
- BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET);
segs = __skb_gso_segment(skb, NETIF_F_SG, false);
if (IS_ERR(segs))
return PTR_ERR(segs);
@@ -644,6 +645,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
[OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
+ [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 },
};
static const struct genl_ops dp_packet_genl_ops[] = {
@@ -2042,7 +2044,8 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp)
int i;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
- hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
+ lockdep_ovsl_is_held()) {
dev = vport->dev;
dev_headroom = netdev_get_fwd_headroom(dev);
if (dev_headroom > max_headroom)
@@ -2061,7 +2064,8 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom)
dp->max_headroom = new_headroom;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
- hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node,
+ lockdep_ovsl_is_held())
netdev_set_rx_headroom(vport->dev, new_headroom);
}
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 7da4230627f5..79252d4887ff 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -80,6 +80,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
case OVS_ACTION_ATTR_METER:
case OVS_ACTION_ATTR_CHECK_PKT_LEN:
case OVS_ACTION_ATTR_ADD_MPLS:
+ case OVS_ACTION_ATTR_DEC_TTL:
default:
return true;
}
@@ -2495,6 +2496,39 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr,
return 0;
}
+static int validate_and_copy_dec_ttl(struct net *net,
+ const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci,
+ u32 mpls_label_count, bool log)
+{
+ int start, err;
+ u32 nested = true;
+
+ if (!nla_len(attr))
+ return ovs_nla_add_action(sfa, OVS_ACTION_ATTR_DEC_TTL,
+ NULL, 0, log);
+
+ start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log);
+ if (start < 0)
+ return start;
+
+ err = ovs_nla_add_action(sfa, OVS_DEC_TTL_ATTR_ACTION, &nested,
+ sizeof(nested), log);
+
+ if (err)
+ return err;
+
+ err = __ovs_nla_copy_actions(net, attr, key, sfa, eth_type,
+ vlan_tci, mpls_label_count, log);
+ if (err)
+ return err;
+
+ add_nested_action_end(*sfa, start);
+ return 0;
+}
+
static int validate_and_copy_clone(struct net *net,
const struct nlattr *attr,
const struct sw_flow_key *key,
@@ -2708,10 +2742,6 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
switch (key_type) {
- const struct ovs_key_ipv4 *ipv4_key;
- const struct ovs_key_ipv6 *ipv6_key;
- int err;
-
case OVS_KEY_ATTR_PRIORITY:
case OVS_KEY_ATTR_SKB_MARK:
case OVS_KEY_ATTR_CT_MARK:
@@ -2723,7 +2753,9 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
break;
- case OVS_KEY_ATTR_TUNNEL:
+ case OVS_KEY_ATTR_TUNNEL: {
+ int err;
+
if (masked)
return -EINVAL; /* Masked tunnel set not supported. */
@@ -2732,8 +2764,10 @@ static int validate_set(const struct nlattr *a,
if (err)
return err;
break;
+ }
+ case OVS_KEY_ATTR_IPV4: {
+ const struct ovs_key_ipv4 *ipv4_key;
- case OVS_KEY_ATTR_IPV4:
if (eth_type != htons(ETH_P_IP))
return -EINVAL;
@@ -2753,8 +2787,10 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
}
break;
+ }
+ case OVS_KEY_ATTR_IPV6: {
+ const struct ovs_key_ipv6 *ipv6_key;
- case OVS_KEY_ATTR_IPV6:
if (eth_type != htons(ETH_P_IPV6))
return -EINVAL;
@@ -2781,7 +2817,7 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
break;
-
+ }
case OVS_KEY_ATTR_TCP:
if ((eth_type != htons(ETH_P_IP) &&
eth_type != htons(ETH_P_IPV6)) ||
@@ -3007,6 +3043,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
[OVS_ACTION_ATTR_CLONE] = (u32)-1,
[OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1,
[OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls),
+ [OVS_ACTION_ATTR_DEC_TTL] = (u32)-1,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
@@ -3267,6 +3304,15 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
break;
}
+ case OVS_ACTION_ATTR_DEC_TTL:
+ err = validate_and_copy_dec_ttl(net, a, key, sfa,
+ eth_type, vlan_tci,
+ mpls_label_count, log);
+ if (err)
+ return err;
+ skip_copy = true;
+ break;
+
default:
OVS_NLERR(log, "Unknown Action type %d", type);
return -EINVAL;
@@ -3438,6 +3484,26 @@ out:
return err;
}
+static int dec_ttl_action_to_attr(const struct nlattr *attr,
+ struct sk_buff *skb)
+{
+ int err = 0, rem = nla_len(attr);
+ struct nlattr *start;
+
+ start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL);
+
+ if (!start)
+ return -EMSGSIZE;
+
+ err = ovs_nla_put_actions(nla_data(attr), rem, skb);
+ if (err)
+ nla_nest_cancel(skb, start);
+ else
+ nla_nest_end(skb, start);
+
+ return err;
+}
+
static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
{
const struct nlattr *ovs_key = nla_data(a);
@@ -3538,6 +3604,12 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb)
return err;
break;
+ case OVS_ACTION_ATTR_DEC_TTL:
+ err = dec_ttl_action_to_attr(a, skb);
+ if (err)
+ return err;
+ break;
+
default:
if (nla_put(skb, type, nla_len(a), nla_data(a)))
return -EMSGSIZE;
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 5904e93e5765..fd8a01ca7a2d 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -585,7 +585,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
head = find_bucket(ti, hash);
(*n_mask_hit)++;
- hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
+ hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
+ lockdep_ovsl_is_held()) {
if (flow->mask == mask && flow->flow_table.hash == hash &&
flow_cmp_masked_key(flow, &masked_key, &mask->range))
return flow;
@@ -769,7 +770,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
hash = ufid_hash(ufid);
head = find_bucket(ti, hash);
- hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) {
+ hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
+ lockdep_ovsl_is_held()) {
if (flow->ufid_table.hash == hash &&
ovs_flow_cmp_ufid(flow, ufid))
return flow;
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
index 3323b79ff548..5010d1ddd4bd 100644
--- a/net/openvswitch/meter.c
+++ b/net/openvswitch/meter.c
@@ -61,7 +61,8 @@ static struct dp_meter *lookup_meter(const struct datapath *dp,
struct hlist_head *head;
head = meter_hash_bucket(dp, meter_id);
- hlist_for_each_entry_rcu(meter, head, dp_hash_node) {
+ hlist_for_each_entry_rcu(meter, head, dp_hash_node,
+ lockdep_ovsl_is_held()) {
if (meter->id == meter_id)
return meter;
}
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 5da9392b03d6..47febb4504f0 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -96,7 +96,8 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct hlist_head *bucket = hash_bucket(net, name);
struct vport *vport;
- hlist_for_each_entry_rcu(vport, bucket, hash_node)
+ hlist_for_each_entry_rcu(vport, bucket, hash_node,
+ lockdep_ovsl_is_held())
if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 30c6879d6774..29bd405adbbd 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2173,6 +2173,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct timespec64 ts;
__u32 ts_status;
bool is_drop_n_account = false;
+ unsigned int slot_id = 0;
bool do_vnet = false;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
@@ -2274,6 +2275,20 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
TP_STATUS_KERNEL, (macoff+snaplen));
if (!h.raw)
goto drop_n_account;
+
+ if (po->tp_version <= TPACKET_V2) {
+ slot_id = po->rx_ring.head;
+ if (test_bit(slot_id, po->rx_ring.rx_owner_map))
+ goto drop_n_account;
+ __set_bit(slot_id, po->rx_ring.rx_owner_map);
+ }
+
+ if (do_vnet &&
+ virtio_net_hdr_from_skb(skb, h.raw + macoff -
+ sizeof(struct virtio_net_hdr),
+ vio_le(), true, 0))
+ goto drop_n_account;
+
if (po->tp_version <= TPACKET_V2) {
packet_increment_rx_head(po, &po->rx_ring);
/*
@@ -2286,12 +2301,6 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
status |= TP_STATUS_LOSING;
}
- if (do_vnet &&
- virtio_net_hdr_from_skb(skb, h.raw + macoff -
- sizeof(struct virtio_net_hdr),
- vio_le(), true, 0))
- goto drop_n_account;
-
po->stats.stats1.tp_packets++;
if (copy_skb) {
status |= TP_STATUS_COPY;
@@ -2379,7 +2388,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
#endif
if (po->tp_version <= TPACKET_V2) {
+ spin_lock(&sk->sk_receive_queue.lock);
__packet_set_status(po, h.raw, status);
+ __clear_bit(slot_id, po->rx_ring.rx_owner_map);
+ spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
} else {
prb_clear_blk_fill_status(&po->rx_ring);
@@ -4276,6 +4288,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
+ unsigned long *rx_owner_map = NULL;
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
@@ -4361,6 +4374,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
}
break;
default:
+ if (!tx_ring) {
+ rx_owner_map = bitmap_alloc(req->tp_frame_nr,
+ GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
+ if (!rx_owner_map)
+ goto out_free_pg_vec;
+ }
break;
}
}
@@ -4390,6 +4409,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
err = 0;
spin_lock_bh(&rb_queue->lock);
swap(rb->pg_vec, pg_vec);
+ if (po->tp_version <= TPACKET_V2)
+ swap(rb->rx_owner_map, rx_owner_map);
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
@@ -4421,6 +4442,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
}
out_free_pg_vec:
+ bitmap_free(rx_owner_map);
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 82fb2b10f790..907f4cd2a718 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -70,7 +70,10 @@ struct packet_ring_buffer {
unsigned int __percpu *pending_refcnt;
- struct tpacket_kbdq_core prb_bdqc;
+ union {
+ unsigned long *rx_owner_map;
+ struct tpacket_kbdq_core prb_bdqc;
+ };
};
extern struct mutex fanout_mutex;
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
index 1c6d6c120fb7..32d4e923925d 100644
--- a/net/qrtr/Makefile
+++ b/net/qrtr/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_QRTR) := qrtr.o
+obj-$(CONFIG_QRTR) := qrtr.o ns.o
obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o
qrtr-smd-y := smd.o
diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c
new file mode 100644
index 000000000000..e7d0fe3f4330
--- /dev/null
+++ b/net/qrtr/ns.c
@@ -0,0 +1,757 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2015, Sony Mobile Communications Inc.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2020, Linaro Ltd.
+ */
+
+#include <linux/module.h>
+#include <linux/qrtr.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "qrtr.h"
+
+static RADIX_TREE(nodes, GFP_KERNEL);
+
+static struct {
+ struct socket *sock;
+ struct sockaddr_qrtr bcast_sq;
+ struct list_head lookups;
+ struct workqueue_struct *workqueue;
+ struct work_struct work;
+ int local_node;
+} qrtr_ns;
+
+static const char * const qrtr_ctrl_pkt_strings[] = {
+ [QRTR_TYPE_HELLO] = "hello",
+ [QRTR_TYPE_BYE] = "bye",
+ [QRTR_TYPE_NEW_SERVER] = "new-server",
+ [QRTR_TYPE_DEL_SERVER] = "del-server",
+ [QRTR_TYPE_DEL_CLIENT] = "del-client",
+ [QRTR_TYPE_RESUME_TX] = "resume-tx",
+ [QRTR_TYPE_EXIT] = "exit",
+ [QRTR_TYPE_PING] = "ping",
+ [QRTR_TYPE_NEW_LOOKUP] = "new-lookup",
+ [QRTR_TYPE_DEL_LOOKUP] = "del-lookup",
+};
+
+struct qrtr_server_filter {
+ unsigned int service;
+ unsigned int instance;
+ unsigned int ifilter;
+};
+
+struct qrtr_lookup {
+ unsigned int service;
+ unsigned int instance;
+
+ struct sockaddr_qrtr sq;
+ struct list_head li;
+};
+
+struct qrtr_server {
+ unsigned int service;
+ unsigned int instance;
+
+ unsigned int node;
+ unsigned int port;
+
+ struct list_head qli;
+};
+
+struct qrtr_node {
+ unsigned int id;
+ struct radix_tree_root servers;
+};
+
+static struct qrtr_node *node_get(unsigned int node_id)
+{
+ struct qrtr_node *node;
+
+ node = radix_tree_lookup(&nodes, node_id);
+ if (node)
+ return node;
+
+ /* If node didn't exist, allocate and insert it to the tree */
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return NULL;
+
+ node->id = node_id;
+
+ radix_tree_insert(&nodes, node_id, node);
+
+ return node;
+}
+
+static int server_match(const struct qrtr_server *srv,
+ const struct qrtr_server_filter *f)
+{
+ unsigned int ifilter = f->ifilter;
+
+ if (f->service != 0 && srv->service != f->service)
+ return 0;
+ if (!ifilter && f->instance)
+ ifilter = ~0;
+
+ return (srv->instance & ifilter) == f->instance;
+}
+
+static int service_announce_new(struct sockaddr_qrtr *dest,
+ struct qrtr_server *srv)
+{
+ struct qrtr_ctrl_pkt pkt;
+ struct msghdr msg = { };
+ struct kvec iv;
+
+ trace_printk("advertising new server [%d:%x]@[%d:%d]\n",
+ srv->service, srv->instance, srv->node, srv->port);
+
+ iv.iov_base = &pkt;
+ iv.iov_len = sizeof(pkt);
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.cmd = cpu_to_le32(QRTR_TYPE_NEW_SERVER);
+ pkt.server.service = cpu_to_le32(srv->service);
+ pkt.server.instance = cpu_to_le32(srv->instance);
+ pkt.server.node = cpu_to_le32(srv->node);
+ pkt.server.port = cpu_to_le32(srv->port);
+
+ msg.msg_name = (struct sockaddr *)dest;
+ msg.msg_namelen = sizeof(*dest);
+
+ return kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
+}
+
+static int service_announce_del(struct sockaddr_qrtr *dest,
+ struct qrtr_server *srv)
+{
+ struct qrtr_ctrl_pkt pkt;
+ struct msghdr msg = { };
+ struct kvec iv;
+ int ret;
+
+ trace_printk("advertising removal of server [%d:%x]@[%d:%d]\n",
+ srv->service, srv->instance, srv->node, srv->port);
+
+ iv.iov_base = &pkt;
+ iv.iov_len = sizeof(pkt);
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.cmd = cpu_to_le32(QRTR_TYPE_DEL_SERVER);
+ pkt.server.service = cpu_to_le32(srv->service);
+ pkt.server.instance = cpu_to_le32(srv->instance);
+ pkt.server.node = cpu_to_le32(srv->node);
+ pkt.server.port = cpu_to_le32(srv->port);
+
+ msg.msg_name = (struct sockaddr *)dest;
+ msg.msg_namelen = sizeof(*dest);
+
+ ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
+ if (ret < 0)
+ pr_err("failed to announce del service\n");
+
+ return ret;
+}
+
+static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv,
+ bool new)
+{
+ struct qrtr_ctrl_pkt pkt;
+ struct msghdr msg = { };
+ struct kvec iv;
+ int ret;
+
+ iv.iov_base = &pkt;
+ iv.iov_len = sizeof(pkt);
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.cmd = new ? cpu_to_le32(QRTR_TYPE_NEW_SERVER) :
+ cpu_to_le32(QRTR_TYPE_DEL_SERVER);
+ if (srv) {
+ pkt.server.service = cpu_to_le32(srv->service);
+ pkt.server.instance = cpu_to_le32(srv->instance);
+ pkt.server.node = cpu_to_le32(srv->node);
+ pkt.server.port = cpu_to_le32(srv->port);
+ }
+
+ msg.msg_name = (struct sockaddr *)to;
+ msg.msg_namelen = sizeof(*to);
+
+ ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
+ if (ret < 0)
+ pr_err("failed to send lookup notification\n");
+}
+
+static int announce_servers(struct sockaddr_qrtr *sq)
+{
+ struct radix_tree_iter iter;
+ struct qrtr_server *srv;
+ struct qrtr_node *node;
+ void __rcu **slot;
+ int ret;
+
+ node = node_get(qrtr_ns.local_node);
+ if (!node)
+ return 0;
+
+ /* Announce the list of servers registered in this node */
+ radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
+ srv = radix_tree_deref_slot(slot);
+
+ ret = service_announce_new(sq, srv);
+ if (ret < 0) {
+ pr_err("failed to announce new service\n");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static struct qrtr_server *server_add(unsigned int service,
+ unsigned int instance,
+ unsigned int node_id,
+ unsigned int port)
+{
+ struct qrtr_server *srv;
+ struct qrtr_server *old;
+ struct qrtr_node *node;
+
+ if (!service || !port)
+ return NULL;
+
+ srv = kzalloc(sizeof(*srv), GFP_KERNEL);
+ if (!srv)
+ return NULL;
+
+ srv->service = service;
+ srv->instance = instance;
+ srv->node = node_id;
+ srv->port = port;
+
+ node = node_get(node_id);
+ if (!node)
+ goto err;
+
+ /* Delete the old server on the same port */
+ old = radix_tree_lookup(&node->servers, port);
+ if (old) {
+ radix_tree_delete(&node->servers, port);
+ kfree(old);
+ }
+
+ radix_tree_insert(&node->servers, port, srv);
+
+ trace_printk("add server [%d:%x]@[%d:%d]\n", srv->service,
+ srv->instance, srv->node, srv->port);
+
+ return srv;
+
+err:
+ kfree(srv);
+ return NULL;
+}
+
+static int server_del(struct qrtr_node *node, unsigned int port)
+{
+ struct qrtr_lookup *lookup;
+ struct qrtr_server *srv;
+ struct list_head *li;
+
+ srv = radix_tree_lookup(&node->servers, port);
+ if (!srv)
+ return -ENOENT;
+
+ radix_tree_delete(&node->servers, port);
+
+ /* Broadcast the removal of local servers */
+ if (srv->node == qrtr_ns.local_node)
+ service_announce_del(&qrtr_ns.bcast_sq, srv);
+
+ /* Announce the service's disappearance to observers */
+ list_for_each(li, &qrtr_ns.lookups) {
+ lookup = container_of(li, struct qrtr_lookup, li);
+ if (lookup->service && lookup->service != srv->service)
+ continue;
+ if (lookup->instance && lookup->instance != srv->instance)
+ continue;
+
+ lookup_notify(&lookup->sq, srv, false);
+ }
+
+ kfree(srv);
+
+ return 0;
+}
+
+static int say_hello(struct sockaddr_qrtr *dest)
+{
+ struct qrtr_ctrl_pkt pkt;
+ struct msghdr msg = { };
+ struct kvec iv;
+ int ret;
+
+ iv.iov_base = &pkt;
+ iv.iov_len = sizeof(pkt);
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.cmd = cpu_to_le32(QRTR_TYPE_HELLO);
+
+ msg.msg_name = (struct sockaddr *)dest;
+ msg.msg_namelen = sizeof(*dest);
+
+ ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
+ if (ret < 0)
+ pr_err("failed to send hello msg\n");
+
+ return ret;
+}
+
+/* Announce the list of servers registered on the local node */
+static int ctrl_cmd_hello(struct sockaddr_qrtr *sq)
+{
+ int ret;
+
+ ret = say_hello(sq);
+ if (ret < 0)
+ return ret;
+
+ return announce_servers(sq);
+}
+
+static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
+{
+ struct qrtr_node *local_node;
+ struct radix_tree_iter iter;
+ struct qrtr_ctrl_pkt pkt;
+ struct qrtr_server *srv;
+ struct sockaddr_qrtr sq;
+ struct msghdr msg = { };
+ struct qrtr_node *node;
+ void __rcu **slot;
+ struct kvec iv;
+ int ret;
+
+ iv.iov_base = &pkt;
+ iv.iov_len = sizeof(pkt);
+
+ node = node_get(from->sq_node);
+ if (!node)
+ return 0;
+
+ /* Advertise removal of this client to all servers of remote node */
+ radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
+ srv = radix_tree_deref_slot(slot);
+ server_del(node, srv->port);
+ }
+
+ /* Advertise the removal of this client to all local servers */
+ local_node = node_get(qrtr_ns.local_node);
+ if (!local_node)
+ return 0;
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE);
+ pkt.client.node = cpu_to_le32(from->sq_node);
+
+ radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) {
+ srv = radix_tree_deref_slot(slot);
+
+ sq.sq_family = AF_QIPCRTR;
+ sq.sq_node = srv->node;
+ sq.sq_port = srv->port;
+
+ msg.msg_name = (struct sockaddr *)&sq;
+ msg.msg_namelen = sizeof(sq);
+
+ ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
+ if (ret < 0) {
+ pr_err("failed to send bye cmd\n");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
+ unsigned int node_id, unsigned int port)
+{
+ struct qrtr_node *local_node;
+ struct radix_tree_iter iter;
+ struct qrtr_lookup *lookup;
+ struct qrtr_ctrl_pkt pkt;
+ struct msghdr msg = { };
+ struct qrtr_server *srv;
+ struct sockaddr_qrtr sq;
+ struct qrtr_node *node;
+ struct list_head *tmp;
+ struct list_head *li;
+ void __rcu **slot;
+ struct kvec iv;
+ int ret;
+
+ iv.iov_base = &pkt;
+ iv.iov_len = sizeof(pkt);
+
+ /* Don't accept spoofed messages */
+ if (from->sq_node != node_id)
+ return -EINVAL;
+
+ /* Local DEL_CLIENT messages comes from the port being closed */
+ if (from->sq_node == qrtr_ns.local_node && from->sq_port != port)
+ return -EINVAL;
+
+ /* Remove any lookups by this client */
+ list_for_each_safe(li, tmp, &qrtr_ns.lookups) {
+ lookup = container_of(li, struct qrtr_lookup, li);
+ if (lookup->sq.sq_node != node_id)
+ continue;
+ if (lookup->sq.sq_port != port)
+ continue;
+
+ list_del(&lookup->li);
+ kfree(lookup);
+ }
+
+ /* Remove the server belonging to this port */
+ node = node_get(node_id);
+ if (node)
+ server_del(node, port);
+
+ /* Advertise the removal of this client to all local servers */
+ local_node = node_get(qrtr_ns.local_node);
+ if (!local_node)
+ return 0;
+
+ memset(&pkt, 0, sizeof(pkt));
+ pkt.cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT);
+ pkt.client.node = cpu_to_le32(node_id);
+ pkt.client.port = cpu_to_le32(port);
+
+ radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) {
+ srv = radix_tree_deref_slot(slot);
+
+ sq.sq_family = AF_QIPCRTR;
+ sq.sq_node = srv->node;
+ sq.sq_port = srv->port;
+
+ msg.msg_name = (struct sockaddr *)&sq;
+ msg.msg_namelen = sizeof(sq);
+
+ ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
+ if (ret < 0) {
+ pr_err("failed to send del client cmd\n");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int ctrl_cmd_new_server(struct sockaddr_qrtr *from,
+ unsigned int service, unsigned int instance,
+ unsigned int node_id, unsigned int port)
+{
+ struct qrtr_lookup *lookup;
+ struct qrtr_server *srv;
+ struct list_head *li;
+ int ret = 0;
+
+ /* Ignore specified node and port for local servers */
+ if (from->sq_node == qrtr_ns.local_node) {
+ node_id = from->sq_node;
+ port = from->sq_port;
+ }
+
+ /* Don't accept spoofed messages */
+ if (from->sq_node != node_id)
+ return -EINVAL;
+
+ srv = server_add(service, instance, node_id, port);
+ if (!srv)
+ return -EINVAL;
+
+ if (srv->node == qrtr_ns.local_node) {
+ ret = service_announce_new(&qrtr_ns.bcast_sq, srv);
+ if (ret < 0) {
+ pr_err("failed to announce new service\n");
+ return ret;
+ }
+ }
+
+ /* Notify any potential lookups about the new server */
+ list_for_each(li, &qrtr_ns.lookups) {
+ lookup = container_of(li, struct qrtr_lookup, li);
+ if (lookup->service && lookup->service != service)
+ continue;
+ if (lookup->instance && lookup->instance != instance)
+ continue;
+
+ lookup_notify(&lookup->sq, srv, true);
+ }
+
+ return ret;
+}
+
+static int ctrl_cmd_del_server(struct sockaddr_qrtr *from,
+ unsigned int service, unsigned int instance,
+ unsigned int node_id, unsigned int port)
+{
+ struct qrtr_node *node;
+
+ /* Ignore specified node and port for local servers*/
+ if (from->sq_node == qrtr_ns.local_node) {
+ node_id = from->sq_node;
+ port = from->sq_port;
+ }
+
+ /* Don't accept spoofed messages */
+ if (from->sq_node != node_id)
+ return -EINVAL;
+
+ /* Local servers may only unregister themselves */
+ if (from->sq_node == qrtr_ns.local_node && from->sq_port != port)
+ return -EINVAL;
+
+ node = node_get(node_id);
+ if (!node)
+ return -ENOENT;
+
+ return server_del(node, port);
+}
+
+static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from,
+ unsigned int service, unsigned int instance)
+{
+ struct radix_tree_iter node_iter;
+ struct qrtr_server_filter filter;
+ struct radix_tree_iter srv_iter;
+ struct qrtr_lookup *lookup;
+ struct qrtr_node *node;
+ void __rcu **node_slot;
+ void __rcu **srv_slot;
+
+ /* Accept only local observers */
+ if (from->sq_node != qrtr_ns.local_node)
+ return -EINVAL;
+
+ lookup = kzalloc(sizeof(*lookup), GFP_KERNEL);
+ if (!lookup)
+ return -ENOMEM;
+
+ lookup->sq = *from;
+ lookup->service = service;
+ lookup->instance = instance;
+ list_add_tail(&lookup->li, &qrtr_ns.lookups);
+
+ memset(&filter, 0, sizeof(filter));
+ filter.service = service;
+ filter.instance = instance;
+
+ radix_tree_for_each_slot(node_slot, &nodes, &node_iter, 0) {
+ node = radix_tree_deref_slot(node_slot);
+
+ radix_tree_for_each_slot(srv_slot, &node->servers,
+ &srv_iter, 0) {
+ struct qrtr_server *srv;
+
+ srv = radix_tree_deref_slot(srv_slot);
+ if (!server_match(srv, &filter))
+ continue;
+
+ lookup_notify(from, srv, true);
+ }
+ }
+
+ /* Empty notification, to indicate end of listing */
+ lookup_notify(from, NULL, true);
+
+ return 0;
+}
+
+static void ctrl_cmd_del_lookup(struct sockaddr_qrtr *from,
+ unsigned int service, unsigned int instance)
+{
+ struct qrtr_lookup *lookup;
+ struct list_head *tmp;
+ struct list_head *li;
+
+ list_for_each_safe(li, tmp, &qrtr_ns.lookups) {
+ lookup = container_of(li, struct qrtr_lookup, li);
+ if (lookup->sq.sq_node != from->sq_node)
+ continue;
+ if (lookup->sq.sq_port != from->sq_port)
+ continue;
+ if (lookup->service != service)
+ continue;
+ if (lookup->instance && lookup->instance != instance)
+ continue;
+
+ list_del(&lookup->li);
+ kfree(lookup);
+ }
+}
+
+static void qrtr_ns_worker(struct work_struct *work)
+{
+ const struct qrtr_ctrl_pkt *pkt;
+ size_t recv_buf_size = 4096;
+ struct sockaddr_qrtr sq;
+ struct msghdr msg = { };
+ unsigned int cmd;
+ ssize_t msglen;
+ void *recv_buf;
+ struct kvec iv;
+ int ret;
+
+ msg.msg_name = (struct sockaddr *)&sq;
+ msg.msg_namelen = sizeof(sq);
+
+ recv_buf = kzalloc(recv_buf_size, GFP_KERNEL);
+ if (!recv_buf)
+ return;
+
+ for (;;) {
+ iv.iov_base = recv_buf;
+ iv.iov_len = recv_buf_size;
+
+ msglen = kernel_recvmsg(qrtr_ns.sock, &msg, &iv, 1,
+ iv.iov_len, MSG_DONTWAIT);
+
+ if (msglen == -EAGAIN)
+ break;
+
+ if (msglen < 0) {
+ pr_err("error receiving packet: %zd\n", msglen);
+ break;
+ }
+
+ pkt = recv_buf;
+ cmd = le32_to_cpu(pkt->cmd);
+ if (cmd < ARRAY_SIZE(qrtr_ctrl_pkt_strings) &&
+ qrtr_ctrl_pkt_strings[cmd])
+ trace_printk("%s from %d:%d\n",
+ qrtr_ctrl_pkt_strings[cmd], sq.sq_node,
+ sq.sq_port);
+
+ ret = 0;
+ switch (cmd) {
+ case QRTR_TYPE_HELLO:
+ ret = ctrl_cmd_hello(&sq);
+ break;
+ case QRTR_TYPE_BYE:
+ ret = ctrl_cmd_bye(&sq);
+ break;
+ case QRTR_TYPE_DEL_CLIENT:
+ ret = ctrl_cmd_del_client(&sq,
+ le32_to_cpu(pkt->client.node),
+ le32_to_cpu(pkt->client.port));
+ break;
+ case QRTR_TYPE_NEW_SERVER:
+ ret = ctrl_cmd_new_server(&sq,
+ le32_to_cpu(pkt->server.service),
+ le32_to_cpu(pkt->server.instance),
+ le32_to_cpu(pkt->server.node),
+ le32_to_cpu(pkt->server.port));
+ break;
+ case QRTR_TYPE_DEL_SERVER:
+ ret = ctrl_cmd_del_server(&sq,
+ le32_to_cpu(pkt->server.service),
+ le32_to_cpu(pkt->server.instance),
+ le32_to_cpu(pkt->server.node),
+ le32_to_cpu(pkt->server.port));
+ break;
+ case QRTR_TYPE_EXIT:
+ case QRTR_TYPE_PING:
+ case QRTR_TYPE_RESUME_TX:
+ break;
+ case QRTR_TYPE_NEW_LOOKUP:
+ ret = ctrl_cmd_new_lookup(&sq,
+ le32_to_cpu(pkt->server.service),
+ le32_to_cpu(pkt->server.instance));
+ break;
+ case QRTR_TYPE_DEL_LOOKUP:
+ ctrl_cmd_del_lookup(&sq,
+ le32_to_cpu(pkt->server.service),
+ le32_to_cpu(pkt->server.instance));
+ break;
+ }
+
+ if (ret < 0)
+ pr_err("failed while handling packet from %d:%d",
+ sq.sq_node, sq.sq_port);
+ }
+
+ kfree(recv_buf);
+}
+
+static void qrtr_ns_data_ready(struct sock *sk)
+{
+ queue_work(qrtr_ns.workqueue, &qrtr_ns.work);
+}
+
+void qrtr_ns_init(void)
+{
+ struct sockaddr_qrtr sq;
+ int ret;
+
+ INIT_LIST_HEAD(&qrtr_ns.lookups);
+ INIT_WORK(&qrtr_ns.work, qrtr_ns_worker);
+
+ ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM,
+ PF_QIPCRTR, &qrtr_ns.sock);
+ if (ret < 0)
+ return;
+
+ ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq);
+ if (ret < 0) {
+ pr_err("failed to get socket name\n");
+ goto err_sock;
+ }
+
+ qrtr_ns.sock->sk->sk_data_ready = qrtr_ns_data_ready;
+
+ sq.sq_port = QRTR_PORT_CTRL;
+ qrtr_ns.local_node = sq.sq_node;
+
+ ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq));
+ if (ret < 0) {
+ pr_err("failed to bind to socket\n");
+ goto err_sock;
+ }
+
+ qrtr_ns.bcast_sq.sq_family = AF_QIPCRTR;
+ qrtr_ns.bcast_sq.sq_node = QRTR_NODE_BCAST;
+ qrtr_ns.bcast_sq.sq_port = QRTR_PORT_CTRL;
+
+ qrtr_ns.workqueue = alloc_workqueue("qrtr_ns_handler", WQ_UNBOUND, 1);
+ if (!qrtr_ns.workqueue)
+ goto err_sock;
+
+ ret = say_hello(&qrtr_ns.bcast_sq);
+ if (ret < 0)
+ goto err_wq;
+
+ return;
+
+err_wq:
+ destroy_workqueue(qrtr_ns.workqueue);
+err_sock:
+ sock_release(qrtr_ns.sock);
+}
+EXPORT_SYMBOL_GPL(qrtr_ns_init);
+
+void qrtr_ns_remove(void)
+{
+ cancel_work_sync(&qrtr_ns.work);
+ destroy_workqueue(qrtr_ns.workqueue);
+ sock_release(qrtr_ns.sock);
+}
+EXPORT_SYMBOL_GPL(qrtr_ns_remove);
+
+MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>");
+MODULE_DESCRIPTION("Qualcomm IPC Router Nameservice");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 5a8e42ad1504..e22092e4a783 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -7,7 +7,6 @@
#include <linux/netlink.h>
#include <linux/qrtr.h>
#include <linux/termios.h> /* For TIOCINQ/OUTQ */
-#include <linux/numa.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
@@ -96,7 +95,7 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk)
return container_of(sk, struct qrtr_sock, sk);
}
-static unsigned int qrtr_local_nid = NUMA_NO_NODE;
+static unsigned int qrtr_local_nid = 1;
/* for node ids */
static RADIX_TREE(qrtr_nodes, GFP_ATOMIC);
@@ -1241,38 +1240,6 @@ static int qrtr_create(struct net *net, struct socket *sock,
return 0;
}
-static const struct nla_policy qrtr_policy[IFA_MAX + 1] = {
- [IFA_LOCAL] = { .type = NLA_U32 },
-};
-
-static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
-{
- struct nlattr *tb[IFA_MAX + 1];
- struct ifaddrmsg *ifm;
- int rc;
-
- if (!netlink_capable(skb, CAP_NET_ADMIN))
- return -EPERM;
-
- if (!netlink_capable(skb, CAP_SYS_ADMIN))
- return -EPERM;
-
- ASSERT_RTNL();
-
- rc = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
- qrtr_policy, extack);
- if (rc < 0)
- return rc;
-
- ifm = nlmsg_data(nlh);
- if (!tb[IFA_LOCAL])
- return -EINVAL;
-
- qrtr_local_nid = nla_get_u32(tb[IFA_LOCAL]);
- return 0;
-}
-
static const struct net_proto_family qrtr_family = {
.owner = THIS_MODULE,
.family = AF_QIPCRTR,
@@ -1293,11 +1260,7 @@ static int __init qrtr_proto_init(void)
return rc;
}
- rc = rtnl_register_module(THIS_MODULE, PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, 0);
- if (rc) {
- sock_unregister(qrtr_family.family);
- proto_unregister(&qrtr_proto);
- }
+ qrtr_ns_init();
return rc;
}
@@ -1305,7 +1268,7 @@ postcore_initcall(qrtr_proto_init);
static void __exit qrtr_proto_fini(void)
{
- rtnl_unregister(PF_QIPCRTR, RTM_NEWADDR);
+ qrtr_ns_remove();
sock_unregister(qrtr_family.family);
proto_unregister(&qrtr_proto);
}
diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h
index b81e6953c04b..dc2b67f17927 100644
--- a/net/qrtr/qrtr.h
+++ b/net/qrtr/qrtr.h
@@ -29,4 +29,8 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep);
int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len);
+void qrtr_ns_init(void);
+
+void qrtr_ns_remove(void);
+
#endif
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 3341eee87bf9..585e6b3b69ce 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -162,10 +162,9 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
if (write)
gup_flags |= FOLL_WRITE;
- ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
+ ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
if (ret >= 0 && ret < nr_pages) {
- while (ret--)
- put_page(pages[ret]);
+ unpin_user_pages(pages, ret);
ret = -EFAULT;
}
@@ -300,8 +299,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* to release anything.
*/
if (!need_odp) {
- for (i = 0 ; i < nents; i++)
- put_page(sg_page(&sg[i]));
+ unpin_user_pages(pages, nr_pages);
kfree(sg);
}
ret = PTR_ERR(trans_private);
@@ -325,7 +323,12 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
if (cookie_ret)
*cookie_ret = cookie;
- if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
+ if (args->cookie_addr &&
+ put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) {
+ if (!need_odp) {
+ unpin_user_pages(pages, nr_pages);
+ kfree(sg);
+ }
ret = -EFAULT;
goto out;
}
@@ -496,9 +499,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
* is the case for a RDMA_READ which copies from remote
* to local memory
*/
- if (!ro->op_write)
- set_page_dirty(page);
- put_page(page);
+ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write);
}
}
@@ -515,8 +516,7 @@ void rds_atomic_free_op(struct rm_atomic_op *ao)
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
- set_page_dirty(page);
- put_page(page);
+ unpin_user_pages_dirty_lock(&page, 1, true);
kfree(ao->op_notifier);
ao->op_notifier = NULL;
@@ -944,7 +944,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
return ret;
err:
if (page)
- put_page(page);
+ unpin_user_page(page);
rm->atomic.op_active = 0;
kfree(rm->atomic.op_notifier);
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index fe42f986cd94..15ee92d79581 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -285,7 +285,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
gfp_t gfp,
rxrpc_notify_rx_t notify_rx,
bool upgrade,
- bool intr,
+ enum rxrpc_interruptibility interruptibility,
unsigned int debug_id)
{
struct rxrpc_conn_parameters cp;
@@ -310,7 +310,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
memset(&p, 0, sizeof(p));
p.user_call_ID = user_call_ID;
p.tx_total_len = tx_total_len;
- p.intr = intr;
+ p.interruptibility = interruptibility;
memset(&cp, 0, sizeof(cp));
cp.local = rx->local;
@@ -371,45 +371,18 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call);
* rxrpc_kernel_check_life - Check to see whether a call is still alive
* @sock: The socket the call is on
* @call: The call to check
- * @_life: Where to store the life value
*
- * Allow a kernel service to find out whether a call is still alive - ie. we're
- * getting ACKs from the server. Passes back in *_life a number representing
- * the life state which can be compared to that returned by a previous call and
- * return true if the call is still alive.
- *
- * If the life state stalls, rxrpc_kernel_probe_life() should be called and
- * then 2RTT waited.
+ * Allow a kernel service to find out whether a call is still alive -
+ * ie. whether it has not yet completed.
*/
bool rxrpc_kernel_check_life(const struct socket *sock,
- const struct rxrpc_call *call,
- u32 *_life)
+ const struct rxrpc_call *call)
{
- *_life = call->acks_latest;
return call->state != RXRPC_CALL_COMPLETE;
}
EXPORT_SYMBOL(rxrpc_kernel_check_life);
/**
- * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive
- * @sock: The socket the call is on
- * @call: The call to check
- *
- * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to
- * find out whether a call is still alive by pinging it. This should cause the
- * life state to be bumped in about 2*RTT.
- *
- * The must be called in TASK_RUNNING state on pain of might_sleep() objecting.
- */
-void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call)
-{
- rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false,
- rxrpc_propose_ack_ping_for_check_life);
- rxrpc_send_ack_packet(call, true, NULL);
-}
-EXPORT_SYMBOL(rxrpc_kernel_probe_life);
-
-/**
* rxrpc_kernel_get_epoch - Retrieve the epoch value from a call.
* @sock: The socket the call is on
* @call: The call to query
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 7d730c438404..3eb1ab40ca5c 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -489,7 +489,6 @@ enum rxrpc_call_flag {
RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
- RXRPC_CALL_IS_INTR, /* The call is interruptible */
RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */
};
@@ -598,6 +597,7 @@ struct rxrpc_call {
atomic_t usage;
u16 service_id; /* service ID */
u8 security_ix; /* Security type */
+ enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
int debug_id; /* debug ID for printks */
@@ -675,7 +675,6 @@ struct rxrpc_call {
/* transmission-phase ACK management */
ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
- rxrpc_serial_t acks_latest; /* serial number of latest ACK received */
rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */
rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */
@@ -721,7 +720,7 @@ struct rxrpc_call_params {
u32 normal; /* Max time since last call packet (msec) */
} timeouts;
u8 nr_timeouts; /* Number of timeouts specified */
- bool intr; /* The call is interruptible */
+ enum rxrpc_interruptibility interruptibility; /* How interruptible is the call? */
};
struct rxrpc_send_params {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index c9f34b0a11df..f07970207b54 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -237,8 +237,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
return call;
}
- if (p->intr)
- __set_bit(RXRPC_CALL_IS_INTR, &call->flags);
+ call->interruptibility = p->interruptibility;
call->tx_total_len = p->tx_total_len;
trace_rxrpc_call(call->debug_id, rxrpc_call_new_client,
atomic_read(&call->usage),
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index ea7d4c21f889..f2a1a5dbb5a7 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -655,13 +655,20 @@ static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp)
add_wait_queue_exclusive(&call->waitq, &myself);
for (;;) {
- if (test_bit(RXRPC_CALL_IS_INTR, &call->flags))
+ switch (call->interruptibility) {
+ case RXRPC_INTERRUPTIBLE:
+ case RXRPC_PREINTERRUPTIBLE:
set_current_state(TASK_INTERRUPTIBLE);
- else
+ break;
+ case RXRPC_UNINTERRUPTIBLE:
+ default:
set_current_state(TASK_UNINTERRUPTIBLE);
+ break;
+ }
if (call->call_id)
break;
- if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) &&
+ if ((call->interruptibility == RXRPC_INTERRUPTIBLE ||
+ call->interruptibility == RXRPC_PREINTERRUPTIBLE) &&
signal_pending(current)) {
ret = -ERESTARTSYS;
break;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index ef10fbf71b15..69e09d69c896 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -882,7 +882,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
before(prev_pkt, call->ackr_prev_seq))
goto out;
call->acks_latest_ts = skb->tstamp;
- call->acks_latest = sp->hdr.serial;
call->ackr_first_seq = first_soft_ack;
call->ackr_prev_seq = prev_pkt;
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 813fd6888142..0fcf157aa09f 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -18,6 +18,21 @@
#include "ar-internal.h"
/*
+ * Return true if there's sufficient Tx queue space.
+ */
+static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
+{
+ unsigned int win_size =
+ min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra);
+ rxrpc_seq_t tx_win = READ_ONCE(call->tx_hard_ack);
+
+ if (_tx_win)
+ *_tx_win = tx_win;
+ return call->tx_top - tx_win < win_size;
+}
+
+/*
* Wait for space to appear in the Tx queue or a signal to occur.
*/
static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
@@ -26,9 +41,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
{
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (call->tx_top - call->tx_hard_ack <
- min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra))
+ if (rxrpc_check_tx_space(call, NULL))
return 0;
if (call->state >= RXRPC_CALL_COMPLETE)
@@ -49,7 +62,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
* Wait for space to appear in the Tx queue uninterruptibly, but with
* a timeout of 2*RTT if no progress was made and a signal occurred.
*/
-static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
+static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
struct rxrpc_call *call)
{
rxrpc_seq_t tx_start, tx_win;
@@ -58,8 +71,8 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
rtt = READ_ONCE(call->peer->rtt);
rtt2 = nsecs_to_jiffies64(rtt) * 2;
- if (rtt2 < 1)
- rtt2 = 1;
+ if (rtt2 < 2)
+ rtt2 = 2;
timeout = rtt2;
tx_start = READ_ONCE(call->tx_hard_ack);
@@ -68,16 +81,13 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
set_current_state(TASK_UNINTERRUPTIBLE);
tx_win = READ_ONCE(call->tx_hard_ack);
- if (call->tx_top - tx_win <
- min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra))
+ if (rxrpc_check_tx_space(call, &tx_win))
return 0;
if (call->state >= RXRPC_CALL_COMPLETE)
return call->error;
- if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) &&
- timeout == 0 &&
+ if (timeout == 0 &&
tx_win == tx_start && signal_pending(current))
return -EINTR;
@@ -92,6 +102,26 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
}
/*
+ * Wait for space to appear in the Tx queue uninterruptibly.
+ */
+static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ long *timeo)
+{
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (rxrpc_check_tx_space(call, NULL))
+ return 0;
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return call->error;
+
+ trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ *timeo = schedule_timeout(*timeo);
+ }
+}
+
+/*
* wait for space to appear in the transmit/ACK window
* - caller holds the socket locked
*/
@@ -108,10 +138,19 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
add_wait_queue(&call->waitq, &myself);
- if (waitall)
- ret = rxrpc_wait_for_tx_window_nonintr(rx, call);
- else
- ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo);
+ switch (call->interruptibility) {
+ case RXRPC_INTERRUPTIBLE:
+ if (waitall)
+ ret = rxrpc_wait_for_tx_window_waitall(rx, call);
+ else
+ ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo);
+ break;
+ case RXRPC_PREINTERRUPTIBLE:
+ case RXRPC_UNINTERRUPTIBLE:
+ default:
+ ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo);
+ break;
+ }
remove_wait_queue(&call->waitq, &myself);
set_current_state(TASK_RUNNING);
@@ -302,9 +341,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
_debug("alloc");
- if (call->tx_top - call->tx_hard_ack >=
- min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra)) {
+ if (!rxrpc_check_tx_space(call, NULL)) {
ret = -EAGAIN;
if (msg->msg_flags & MSG_DONTWAIT)
goto maybe_error;
@@ -619,7 +656,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
.call.tx_total_len = -1,
.call.user_call_ID = 0,
.call.nr_timeouts = 0,
- .call.intr = true,
+ .call.interruptibility = RXRPC_INTERRUPTIBLE,
.abort_code = 0,
.command = RXRPC_CMD_SEND_DATA,
.exclusive = false,
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index edde0e519438..bfbefb7bff9d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -972,7 +972,7 @@ config NET_ACT_TUNNEL_KEY
config NET_ACT_CT
tristate "connection tracking tc action"
- depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT
+ depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT && NF_FLOW_TABLE
help
Say Y here to allow sending the packets to conntrack module.
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 90a31b15585f..df4560909157 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -185,7 +185,9 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
return nla_total_size(0) /* action number nested */
+ nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
+ cookie_len /* TCA_ACT_COOKIE */
+ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS */
+ nla_total_size(0) /* TCA_ACT_STATS nested */
+ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */
/* TCA_STATS_BASIC */
+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
/* TCA_STATS_PKT64 */
@@ -787,13 +789,20 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
}
rcu_read_unlock();
- if (a->tcfa_flags) {
- struct nla_bitfield32 flags = { a->tcfa_flags,
- a->tcfa_flags, };
+ if (a->hw_stats != TCA_ACT_HW_STATS_ANY &&
+ nla_put_bitfield32(skb, TCA_ACT_HW_STATS,
+ a->hw_stats, TCA_ACT_HW_STATS_ANY))
+ goto nla_put_failure;
- if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags))
- goto nla_put_failure;
- }
+ if (a->used_hw_stats_valid &&
+ nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS,
+ a->used_hw_stats, TCA_ACT_HW_STATS_ANY))
+ goto nla_put_failure;
+
+ if (a->tcfa_flags &&
+ nla_put_bitfield32(skb, TCA_ACT_FLAGS,
+ a->tcfa_flags, a->tcfa_flags))
+ goto nla_put_failure;
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
@@ -853,7 +862,23 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
return c;
}
+static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr)
+{
+ struct nla_bitfield32 hw_stats_bf;
+
+ /* If the user did not pass the attr, that means he does
+ * not care about the type. Return "any" in that case
+ * which is setting on all supported types.
+ */
+ if (!hw_stats_attr)
+ return TCA_ACT_HW_STATS_ANY;
+ hw_stats_bf = nla_get_bitfield32(hw_stats_attr);
+ return hw_stats_bf.value;
+}
+
static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS;
+static const u32 tca_act_hw_stats_allowed = TCA_ACT_HW_STATS_ANY;
+
static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = {
[TCA_ACT_KIND] = { .type = NLA_STRING },
[TCA_ACT_INDEX] = { .type = NLA_U32 },
@@ -862,6 +887,8 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = {
[TCA_ACT_OPTIONS] = { .type = NLA_NESTED },
[TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32,
.validation_data = &tca_act_flags_allowed },
+ [TCA_ACT_HW_STATS] = { .type = NLA_BITFIELD32,
+ .validation_data = &tca_act_hw_stats_allowed },
};
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
@@ -871,6 +898,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct netlink_ext_ack *extack)
{
struct nla_bitfield32 flags = { 0, 0 };
+ u8 hw_stats = TCA_ACT_HW_STATS_ANY;
struct tc_action *a;
struct tc_action_ops *a_o;
struct tc_cookie *cookie = NULL;
@@ -902,6 +930,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
goto err_out;
}
}
+ hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]);
if (tb[TCA_ACT_FLAGS])
flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]);
} else {
@@ -952,6 +981,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
if (!name && tb[TCA_ACT_COOKIE])
tcf_set_action_cookie(&a->act_cookie, cookie);
+ if (!name)
+ a->hw_stats = hw_stats;
+
/* module count goes up only when brand new policy is created
* if it exists and is only bound to in a_o->init() then
* ACT_P_CREATED is not returned (a zero is).
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 46f47e58b3be..54d5652cfe6c 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -12,6 +12,7 @@
#include <linux/bpf.h>
#include <net/netlink.h>
+#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -53,6 +54,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(filter, skb);
}
+ if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
+ skb_orphan(skb);
rcu_read_unlock();
/* A BPF program may overwrite the default action opcode.
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index f685c0d73708..1a766393be62 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -15,6 +15,7 @@
#include <linux/pkt_cls.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/rhashtable.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -24,6 +25,7 @@
#include <uapi/linux/tc_act/tc_ct.h>
#include <net/tc_act/tc_ct.h>
+#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
@@ -31,6 +33,523 @@
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <uapi/linux/netfilter/nf_nat.h>
+static struct workqueue_struct *act_ct_wq;
+static struct rhashtable zones_ht;
+static DEFINE_MUTEX(zones_mutex);
+
+struct tcf_ct_flow_table {
+ struct rhash_head node; /* In zones tables */
+
+ struct rcu_work rwork;
+ struct nf_flowtable nf_ft;
+ refcount_t ref;
+ u16 zone;
+
+ bool dying;
+};
+
+static const struct rhashtable_params zones_params = {
+ .head_offset = offsetof(struct tcf_ct_flow_table, node),
+ .key_offset = offsetof(struct tcf_ct_flow_table, zone),
+ .key_len = sizeof_field(struct tcf_ct_flow_table, zone),
+ .automatic_shrinking = true,
+};
+
+static struct flow_action_entry *
+tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
+{
+ int i = flow_action->num_entries++;
+
+ return &flow_action->entries[i];
+}
+
+static void tcf_ct_add_mangle_action(struct flow_action *action,
+ enum flow_action_mangle_base htype,
+ u32 offset,
+ u32 mask,
+ u32 val)
+{
+ struct flow_action_entry *entry;
+
+ entry = tcf_ct_flow_table_flow_action_get_next(action);
+ entry->id = FLOW_ACTION_MANGLE;
+ entry->mangle.htype = htype;
+ entry->mangle.mask = ~mask;
+ entry->mangle.offset = offset;
+ entry->mangle.val = val;
+}
+
+/* The following nat helper functions check if the inverted reverse tuple
+ * (target) is different than the current dir tuple - meaning nat for ports
+ * and/or ip is needed, and add the relevant mangle actions.
+ */
+static void
+tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
+ offsetof(struct iphdr, saddr),
+ 0xFFFFFFFF,
+ be32_to_cpu(target.src.u3.ip));
+ if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
+ offsetof(struct iphdr, daddr),
+ 0xFFFFFFFF,
+ be32_to_cpu(target.dst.u3.ip));
+}
+
+static void
+tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action,
+ union nf_inet_addr *addr,
+ u32 offset)
+{
+ int i;
+
+ for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
+ i * sizeof(u32) + offset,
+ 0xFFFFFFFF, be32_to_cpu(addr->ip6[i]));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
+ tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3,
+ offsetof(struct ipv6hdr,
+ saddr));
+ if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
+ tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3,
+ offsetof(struct ipv6hdr,
+ daddr));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ __be16 target_src = target.src.u.tcp.port;
+ __be16 target_dst = target.dst.u.tcp.port;
+
+ if (target_src != tuple->src.u.tcp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+ offsetof(struct tcphdr, source),
+ 0xFFFF, be16_to_cpu(target_src));
+ if (target_dst != tuple->dst.u.tcp.port)
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
+ offsetof(struct tcphdr, dest),
+ 0xFFFF, be16_to_cpu(target_dst));
+}
+
+static void
+tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
+ struct nf_conntrack_tuple target,
+ struct flow_action *action)
+{
+ __be16 target_src = target.src.u.udp.port;
+ __be16 target_dst = target.dst.u.udp.port;
+
+ if (target_src != tuple->src.u.udp.port) /* source port differs: mangle it */
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
+ offsetof(struct udphdr, source),
+ 0xFFFF, be16_to_cpu(target_src));
+ if (target_dst != tuple->dst.u.udp.port) /* destination port differs: mangle it */
+ tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
+ offsetof(struct udphdr, dest),
+ 0xFFFF, be16_to_cpu(target_dst));
+}
+
+static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct flow_action *action)
+{
+ struct nf_conn_labels *ct_labels;
+ struct flow_action_entry *entry;
+ enum ip_conntrack_info ctinfo;
+ u32 *act_ct_labels;
+
+ entry = tcf_ct_flow_table_flow_action_get_next(action);
+ entry->id = FLOW_ACTION_CT_METADATA;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ entry->ct_metadata.mark = ct->mark;
+#endif
+ ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
+ IP_CT_ESTABLISHED_REPLY;
+ /* aligns with the CT reference on the SKB nf_ct_set */
+ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
+
+ act_ct_labels = entry->ct_metadata.labels;
+ ct_labels = nf_ct_labels_find(ct);
+ if (ct_labels)
+ memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE);
+ else
+ memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE);
+}
+
+static int tcf_ct_flow_table_add_action_nat(struct net *net,
+ struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct flow_action *action)
+{
+ const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple target;
+
+ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
+
+ switch (tuple->src.l3num) {
+ case NFPROTO_IPV4:
+ tcf_ct_flow_table_add_action_nat_ipv4(tuple, target,
+ action);
+ break;
+ case NFPROTO_IPV6:
+ tcf_ct_flow_table_add_action_nat_ipv6(tuple, target,
+ action);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action);
+ break;
+ case IPPROTO_UDP:
+ tcf_ct_flow_table_add_action_nat_udp(tuple, target, action);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int tcf_ct_flow_table_fill_actions(struct net *net,
+ const struct flow_offload *flow,
+ enum flow_offload_tuple_dir tdir,
+ struct nf_flow_rule *flow_rule)
+{
+ struct flow_action *action = &flow_rule->rule->action;
+ int num_entries = action->num_entries;
+ struct nf_conn *ct = flow->ct;
+ enum ip_conntrack_dir dir;
+ int i, err;
+
+ switch (tdir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ dir = IP_CT_DIR_ORIGINAL;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ dir = IP_CT_DIR_REPLY;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action);
+ if (err)
+ goto err_nat;
+
+ tcf_ct_flow_table_add_action_meta(ct, dir, action);
+ return 0;
+
+err_nat:
+ /* Clear filled actions */
+ for (i = num_entries; i < action->num_entries; i++)
+ memset(&action->entries[i], 0, sizeof(action->entries[i]));
+ action->num_entries = num_entries;
+
+ return err;
+}
+
+static struct nf_flowtable_type flowtable_ct = {
+ .action = tcf_ct_flow_table_fill_actions,
+ .owner = THIS_MODULE,
+};
+
+static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
+{
+ struct tcf_ct_flow_table *ct_ft;
+ int err = -ENOMEM;
+
+ mutex_lock(&zones_mutex);
+ ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
+ if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
+ goto out_unlock;
+
+ ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
+ if (!ct_ft)
+ goto err_alloc;
+ refcount_set(&ct_ft->ref, 1);
+
+ ct_ft->zone = params->zone;
+ err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
+ if (err)
+ goto err_insert;
+
+ ct_ft->nf_ft.type = &flowtable_ct;
+ ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD;
+ err = nf_flow_table_init(&ct_ft->nf_ft);
+ if (err)
+ goto err_init;
+
+ __module_get(THIS_MODULE);
+out_unlock:
+ params->ct_ft = ct_ft;
+ params->nf_ft = &ct_ft->nf_ft;
+ mutex_unlock(&zones_mutex);
+
+ return 0;
+
+err_init:
+ rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+err_insert:
+ kfree(ct_ft);
+err_alloc:
+ mutex_unlock(&zones_mutex);
+ return err;
+}
+
+static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+{
+ struct tcf_ct_flow_table *ct_ft;
+
+ ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
+ rwork);
+ nf_flow_table_free(&ct_ft->nf_ft);
+ kfree(ct_ft);
+
+ module_put(THIS_MODULE);
+}
+
+static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
+{
+ struct tcf_ct_flow_table *ct_ft = params->ct_ft;
+
+ if (refcount_dec_and_test(&params->ct_ft->ref)) {
+ rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+ INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
+ queue_rcu_work(act_ct_wq, &ct_ft->rwork);
+ }
+}
+
+static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
+ struct nf_conn *ct,
+ bool tcp)
+{
+ struct flow_offload *entry;
+ int err;
+
+ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return;
+
+ entry = flow_offload_alloc(ct);
+ if (!entry) {
+ WARN_ON_ONCE(1);
+ goto err_alloc;
+ }
+
+ if (tcp) {
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ }
+
+ err = flow_offload_add(&ct_ft->nf_ft, entry);
+ if (err)
+ goto err_add;
+
+ return;
+
+err_add:
+ flow_offload_free(entry);
+err_alloc:
+ clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+}
+
+static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ bool tcp = false;
+
+ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
+ return;
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+ tcp = true;
+ if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+ return;
+ break;
+ case IPPROTO_UDP:
+ break;
+ default:
+ return;
+ }
+
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+ ct->status & IPS_SEQ_ADJUST)
+ return;
+
+ tcf_ct_flow_table_add(ct_ft, ct, tcp);
+}
+
+static bool
+tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct tcphdr **tcph)
+{
+ struct flow_ports *ports;
+ unsigned int thoff;
+ struct iphdr *iph;
+
+ if (!pskb_network_may_pull(skb, sizeof(*iph)))
+ return false;
+
+ iph = ip_hdr(skb);
+ thoff = iph->ihl * 4;
+
+ if (ip_is_fragment(iph) ||
+ unlikely(thoff != sizeof(struct iphdr)))
+ return false;
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP)
+ return false;
+
+ if (iph->ttl <= 1)
+ return false;
+
+ if (!pskb_network_may_pull(skb, iph->protocol == IPPROTO_TCP ?
+ thoff + sizeof(struct tcphdr) :
+ thoff + sizeof(*ports)))
+ return false;
+
+ iph = ip_hdr(skb);
+ if (iph->protocol == IPPROTO_TCP)
+ *tcph = (void *)(skb_network_header(skb) + thoff);
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_v4.s_addr = iph->saddr;
+ tuple->dst_v4.s_addr = iph->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET;
+ tuple->l4proto = iph->protocol;
+
+ return true;
+}
+
+static bool
+tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct tcphdr **tcph)
+{
+ struct flow_ports *ports;
+ struct ipv6hdr *ip6h;
+ unsigned int thoff;
+
+ if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->nexthdr != IPPROTO_TCP &&
+ ip6h->nexthdr != IPPROTO_UDP)
+ return false;
+
+ if (ip6h->hop_limit <= 1)
+ return false;
+
+ thoff = sizeof(*ip6h);
+ if (!pskb_network_may_pull(skb, ip6h->nexthdr == IPPROTO_TCP ?
+ thoff + sizeof(struct tcphdr) :
+ thoff + sizeof(*ports)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+ if (ip6h->nexthdr == IPPROTO_TCP)
+ *tcph = (void *)(skb_network_header(skb) + thoff);
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+ tuple->src_v6 = ip6h->saddr;
+ tuple->dst_v6 = ip6h->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET6;
+ tuple->l4proto = ip6h->nexthdr;
+
+ return true;
+}
+
+static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+ struct sk_buff *skb,
+ u8 family)
+{
+ struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct flow_offload_tuple tuple = {};
+ enum ip_conntrack_info ctinfo;
+ struct tcphdr *tcph = NULL;
+ struct flow_offload *flow;
+ struct nf_conn *ct;
+ u8 dir;
+
+ /* Previously seen or loopback */
+ ct = nf_ct_get(skb, &ctinfo);
+ if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
+ return false;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
+ return false;
+ break;
+ case NFPROTO_IPV6:
+ if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ tuplehash = flow_offload_lookup(nf_ft, &tuple);
+ if (!tuplehash)
+ return false;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ ct = flow->ct;
+
+ if (tcph && (unlikely(tcph->fin || tcph->rst))) {
+ flow_offload_teardown(flow);
+ return false;
+ }
+
+ ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
+ IP_CT_ESTABLISHED_REPLY;
+
+ flow_offload_refresh(nf_ft, flow);
+ nf_conntrack_get(&ct->ct_general);
+ nf_ct_set(skb, ct, ctinfo);
+
+ return true;
+}
+
+static int tcf_ct_flow_tables_init(void)
+{
+ return rhashtable_init(&zones_ht, &zones_params);
+}
+
+static void tcf_ct_flow_tables_uninit(void)
+{
+ rhashtable_destroy(&zones_ht);
+}
+
static struct tc_action_ops act_ct_ops;
static unsigned int ct_net_id;
@@ -207,6 +726,8 @@ static void tcf_ct_params_free(struct rcu_head *head)
struct tcf_ct_params *params = container_of(head,
struct tcf_ct_params, rcu);
+ tcf_ct_flow_table_put(params);
+
if (params->tmpl)
nf_conntrack_put(&params->tmpl->ct_general);
kfree(params);
@@ -387,6 +908,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
struct nf_hook_state state;
int nh_ofs, err, retval;
struct tcf_ct_params *p;
+ bool skip_add = false;
struct nf_conn *ct;
u8 family;
@@ -436,6 +958,11 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
*/
cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
if (!cached) {
+ if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
+ skip_add = true;
+ goto do_nat;
+ }
+
/* Associate skb with specified zone. */
if (tmpl) {
ct = nf_ct_get(skb, &ctinfo);
@@ -453,6 +980,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
goto out_push;
}
+do_nat:
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
goto out_push;
@@ -470,6 +998,8 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
* even if the connection is already confirmed.
*/
nf_conntrack_confirm(skb);
+ } else if (!skip_add) {
+ tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
}
out_push:
@@ -730,6 +1260,10 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
if (err)
goto cleanup;
+ err = tcf_ct_flow_table_get(params);
+ if (err)
+ goto cleanup;
+
spin_lock_bh(&c->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
params = rcu_replace_pointer(c->params, params,
@@ -739,7 +1273,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
if (params)
- kfree_rcu(params, rcu);
+ call_rcu(&params->rcu, tcf_ct_params_free);
if (res == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
@@ -974,13 +1508,46 @@ static struct pernet_operations ct_net_ops = {
static int __init ct_init_module(void)
{
- return tcf_register_action(&act_ct_ops, &ct_net_ops);
+ int err;
+
+ act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
+ if (!act_ct_wq)
+ return -ENOMEM;
+
+ err = tcf_ct_flow_tables_init();
+ if (err)
+ goto err_tbl_init;
+
+ err = tcf_register_action(&act_ct_ops, &ct_net_ops);
+ if (err)
+ goto err_register;
+
+ return 0;
+
+err_register: /* unwind in reverse order of setup: zones table first */
+ tcf_ct_flow_tables_uninit();
+err_tbl_init: /* table init failed (or fell through): only the workqueue remains */
+ destroy_workqueue(act_ct_wq);
+ return err;
}
static void __exit ct_cleanup_module(void)
{
tcf_unregister_action(&act_ct_ops, &ct_net_ops);
+ tcf_ct_flow_tables_uninit();
+ destroy_workqueue(act_ct_wq);
+}
+
+void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie)
+{
+ enum ip_conntrack_info ctinfo = cookie & NFCT_INFOMASK;
+ struct nf_conn *ct;
+
+ ct = (struct nf_conn *)(cookie & NFCT_PTRMASK);
+ nf_conntrack_get(&ct->ct_general);
+ nf_ct_set(skb, ct, ctinfo);
}
+EXPORT_SYMBOL_GPL(tcf_ct_flow_table_restore_skb);
module_init(ct_init_module);
module_exit(ct_cleanup_module);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 1ad300e6dbc0..83dd82fc9f40 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -284,10 +284,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
/* mirror is always swallowed */
if (is_redirect) {
- skb2->tc_redirected = 1;
- skb2->tc_from_ingress = skb2->tc_at_ingress;
- if (skb2->tc_from_ingress)
- skb2->tstamp = 0;
+ skb_set_redirected(skb2, skb2->tc_at_ingress);
+
/* let's the caller reinsert the packet, if possible */
if (use_reinsert) {
res->ingress = want_ingress;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 3ad718576304..d41d6200d9de 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -409,6 +409,16 @@ done:
return p->tcf_action;
}
+static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse, bool hw)
+{ /* Installed as act_pedit_ops.stats_update. */
+ struct tcf_pedit *d = to_pedit(a);
+ struct tcf_t *tm = &d->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, false, hw); /* forward byte/packet counts; fourth argument is fixed to false */
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse); /* lastuse never moves backwards */
+}
+
static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
@@ -485,6 +495,7 @@ static struct tc_action_ops act_pedit_ops = {
.id = TCA_ID_PEDIT,
.owner = THIS_MODULE,
.act = tcf_pedit_act,
+ .stats_update = tcf_pedit_stats_update,
.dump = tcf_pedit_dump,
.cleanup = tcf_pedit_cleanup,
.init = tcf_pedit_init,
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index ce948c1e24dc..5e2df590bb58 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -267,14 +267,12 @@ tcf_sample_get_group(const struct tc_action *a,
struct tcf_sample *s = to_sample(a);
struct psample_group *group;
- spin_lock_bh(&s->tcf_lock);
group = rcu_dereference_protected(s->psample_group,
lockdep_is_held(&s->tcf_lock));
if (group) {
psample_group_take(group);
*destructor = tcf_psample_group_put;
}
- spin_unlock_bh(&s->tcf_lock);
return group;
}
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index e857424c387c..b125b2be4467 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -73,6 +73,16 @@ err:
return TC_ACT_SHOT;
}
+static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes,
+ u32 packets, u64 lastuse, bool hw)
+{ /* Installed as act_skbedit_ops.stats_update. */
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_t *tm = &d->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, false, hw); /* forward byte/packet counts; fourth argument is fixed to false */
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse); /* lastuse never moves backwards */
+}
+
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) },
[TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
@@ -323,6 +333,7 @@ static struct tc_action_ops act_skbedit_ops = {
.id = TCA_ID_SKBEDIT,
.owner = THIS_MODULE,
.act = tcf_skbedit_act,
+ .stats_update = tcf_skbedit_stats_update,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
.cleanup = tcf_skbedit_cleanup,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index c2cdd0fc2e70..f6a3b969ead0 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -22,6 +22,7 @@
#include <linux/idr.h>
#include <linux/rhashtable.h>
#include <linux/jhash.h>
+#include <linux/rculist.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -354,7 +355,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
chain = kzalloc(sizeof(*chain), GFP_KERNEL);
if (!chain)
return NULL;
- list_add_tail(&chain->list, &block->chain_list);
+ list_add_tail_rcu(&chain->list, &block->chain_list);
mutex_init(&chain->filter_chain_lock);
chain->block = block;
chain->index = chain_index;
@@ -394,7 +395,7 @@ static bool tcf_chain_detach(struct tcf_chain *chain)
ASSERT_BLOCK_LOCKED(block);
- list_del(&chain->list);
+ list_del_rcu(&chain->list);
if (!chain->index)
block->chain0.chain = NULL;
@@ -453,6 +454,20 @@ static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
return NULL;
}
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block,
+ u32 chain_index)
+{ /* Return the chain on @block's chain_list whose index is @chain_index, or NULL. */
+ struct tcf_chain *chain;
+
+ list_for_each_entry_rcu(chain, &block->chain_list, list) { /* RCU list walk; NOTE(review): caller presumably holds an RCU read-side lock -- confirm */
+ if (chain->index == chain_index)
+ return chain;
+ }
+ return NULL;
+}
+#endif /* CONFIG_NET_TC_SKB_EXT */
+
static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
u32 seq, u16 flags, int event, bool unicast);
@@ -693,7 +708,7 @@ static void tc_indr_block_call(struct tcf_block *block,
};
INIT_LIST_HEAD(&bo.cb_list);
- flow_indr_block_call(dev, &bo, command);
+ flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK);
tcf_block_setup(block, &bo);
}
@@ -1559,12 +1574,15 @@ static int tcf_block_setup(struct tcf_block *block,
* to this qdisc, (optionally) tests for protocol and asks
* specific classifiers.
*/
-int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
- struct tcf_result *res, bool compat_mode)
+static inline int __tcf_classify(struct sk_buff *skb,
+ const struct tcf_proto *tp,
+ const struct tcf_proto *orig_tp,
+ struct tcf_result *res,
+ bool compat_mode,
+ u32 *last_executed_chain)
{
#ifdef CONFIG_NET_CLS_ACT
const int max_reclassify_loop = 4;
- const struct tcf_proto *orig_tp = tp;
const struct tcf_proto *first_tp;
int limit = 0;
@@ -1582,21 +1600,11 @@ reclassify:
#ifdef CONFIG_NET_CLS_ACT
if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) {
first_tp = orig_tp;
+ *last_executed_chain = first_tp->chain->index;
goto reset;
} else if (unlikely(TC_ACT_EXT_CMP(err, TC_ACT_GOTO_CHAIN))) {
first_tp = res->goto_tp;
-
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- {
- struct tc_skb_ext *ext;
-
- ext = skb_ext_add(skb, TC_SKB_EXT);
- if (WARN_ON_ONCE(!ext))
- return TC_ACT_SHOT;
-
- ext->chain = err & TC_ACT_EXT_VAL_MASK;
- }
-#endif
+ *last_executed_chain = err & TC_ACT_EXT_VAL_MASK;
goto reset;
}
#endif
@@ -1619,8 +1627,64 @@ reset:
goto reclassify;
#endif
}
+
+int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+ struct tcf_result *res, bool compat_mode)
+{ /* Wrapper around __tcf_classify() that does not use the reported last executed chain. */
+ u32 last_executed_chain = 0; /* written by __tcf_classify(), never read here */
+
+ return __tcf_classify(skb, tp, tp, res, compat_mode, /* tp is passed as both tp and orig_tp */
+ &last_executed_chain);
+}
EXPORT_SYMBOL(tcf_classify);
+int tcf_classify_ingress(struct sk_buff *skb,
+ const struct tcf_block *ingress_block,
+ const struct tcf_proto *tp,
+ struct tcf_result *res, bool compat_mode)
+{ /* Ingress classification that can start from a chain index stored in the skb's TC_SKB_EXT extension. */
+#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ u32 last_executed_chain = 0; /* written by __tcf_classify(), unused in this branch */
+
+ return __tcf_classify(skb, tp, tp, res, compat_mode,
+ &last_executed_chain);
+#else
+ u32 last_executed_chain = tp ? tp->chain->index : 0; /* default: chain index of @tp, or 0 when @tp is NULL */
+ const struct tcf_proto *orig_tp = tp; /* the caller's tp is still passed as orig_tp after tp is redirected below */
+ struct tc_skb_ext *ext;
+ int ret;
+
+ ext = skb_ext_find(skb, TC_SKB_EXT);
+
+ if (ext && ext->chain) { /* the skb carries a non-zero chain index */
+ struct tcf_chain *fchain;
+
+ fchain = tcf_chain_lookup_rcu(ingress_block, ext->chain);
+ if (!fchain)
+ return TC_ACT_SHOT; /* that chain does not exist on @ingress_block */
+
+ /* Consume, so cloned/redirect skbs won't inherit ext */
+ skb_ext_del(skb, TC_SKB_EXT);
+
+ tp = rcu_dereference_bh(fchain->filter_chain); /* classify starting from that chain's filters */
+ }
+
+ ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode,
+ &last_executed_chain);
+
+ /* If we missed on some chain */
+ if (ret == TC_ACT_UNSPEC && last_executed_chain) {
+ ext = skb_ext_add(skb, TC_SKB_EXT);
+ if (WARN_ON_ONCE(!ext))
+ return TC_ACT_SHOT;
+ ext->chain = last_executed_chain; /* record the chain index in the skb extension */
+ }
+
+ return ret;
+#endif
+}
+EXPORT_SYMBOL(tcf_classify_ingress);
+
struct tcf_chain_info {
struct tcf_proto __rcu **pprev;
struct tcf_proto __rcu *next;
@@ -3382,14 +3446,40 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp,
}
EXPORT_SYMBOL(tc_setup_cb_reoffload);
+static int tcf_act_get_cookie(struct flow_action_entry *entry,
+ const struct tc_action *act)
+{ /* Create @entry->cookie from @act's cookie data, if @act has one; returns 0 or -ENOMEM. */
+ struct tc_cookie *cookie;
+ int err = 0;
+
+ rcu_read_lock();
+ cookie = rcu_dereference(act->act_cookie);
+ if (cookie) { /* without a cookie, entry->cookie is left untouched and 0 is returned */
+ entry->cookie = flow_action_cookie_create(cookie->data,
+ cookie->len,
+ GFP_ATOMIC);
+ if (!entry->cookie)
+ err = -ENOMEM;
+ }
+ rcu_read_unlock();
+ return err;
+}
+
+static void tcf_act_put_cookie(struct flow_action_entry *entry)
+{ /* Counterpart of tcf_act_get_cookie(): destroys @entry->cookie. */
+ flow_action_cookie_destroy(entry->cookie);
+}
+
void tc_cleanup_flow_action(struct flow_action *flow_action)
{
struct flow_action_entry *entry;
int i;
- flow_action_for_each(i, entry, flow_action)
+ flow_action_for_each(i, entry, flow_action) { /* braces needed now that the loop body has two statements */
+ tcf_act_put_cookie(entry); /* NOTE(review): runs for every entry, with or without a cookie -- assumes flow_action_cookie_destroy() accepts NULL; confirm */
if (entry->destructor)
entry->destructor(entry->destructor_priv);
+ }
}
EXPORT_SYMBOL(tc_cleanup_flow_action);
@@ -3433,22 +3523,30 @@ static void tcf_sample_get_group(struct flow_action_entry *entry,
}
int tc_setup_flow_action(struct flow_action *flow_action,
- const struct tcf_exts *exts, bool rtnl_held)
+ const struct tcf_exts *exts)
{
- const struct tc_action *act;
+ struct tc_action *act;
int i, j, k, err = 0;
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE);
+ BUILD_BUG_ON(TCA_ACT_HW_STATS_DELAYED != FLOW_ACTION_HW_STATS_DELAYED);
+
if (!exts)
return 0;
- if (!rtnl_held)
- rtnl_lock();
-
j = 0;
tcf_exts_for_each_action(i, act, exts) {
struct flow_action_entry *entry;
entry = &flow_action->entries[j];
+ spin_lock_bh(&act->tcfa_lock);
+ err = tcf_act_get_cookie(entry, act);
+ if (err)
+ goto err_out_locked;
+
+ entry->hw_stats = act->hw_stats;
+
if (is_tcf_gact_ok(act)) {
entry->id = FLOW_ACTION_ACCEPT;
} else if (is_tcf_gact_shot(act)) {
@@ -3489,13 +3587,13 @@ int tc_setup_flow_action(struct flow_action *flow_action,
break;
default:
err = -EOPNOTSUPP;
- goto err_out;
+ goto err_out_locked;
}
} else if (is_tcf_tunnel_set(act)) {
entry->id = FLOW_ACTION_TUNNEL_ENCAP;
err = tcf_tunnel_encap_get_tunnel(entry, act);
if (err)
- goto err_out;
+ goto err_out_locked;
} else if (is_tcf_tunnel_release(act)) {
entry->id = FLOW_ACTION_TUNNEL_DECAP;
} else if (is_tcf_pedit(act)) {
@@ -3509,12 +3607,13 @@ int tc_setup_flow_action(struct flow_action *flow_action,
break;
default:
err = -EOPNOTSUPP;
- goto err_out;
+ goto err_out_locked;
}
entry->mangle.htype = tcf_pedit_htype(act, k);
entry->mangle.mask = tcf_pedit_mask(act, k);
entry->mangle.val = tcf_pedit_val(act, k);
entry->mangle.offset = tcf_pedit_offset(act, k);
+ entry->hw_stats = act->hw_stats;
entry = &flow_action->entries[++j];
}
} else if (is_tcf_csum(act)) {
@@ -3538,6 +3637,7 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->id = FLOW_ACTION_CT;
entry->ct.action = tcf_ct_action(act);
entry->ct.zone = tcf_ct_zone(act);
+ entry->ct.flow_table = tcf_ct_ft(act);
} else if (is_tcf_mpls(act)) {
switch (tcf_mpls_action(act)) {
case TCA_MPLS_ACT_PUSH:
@@ -3560,28 +3660,32 @@ int tc_setup_flow_action(struct flow_action *flow_action,
entry->mpls_mangle.ttl = tcf_mpls_ttl(act);
break;
default:
- goto err_out;
+ goto err_out_locked;
}
} else if (is_tcf_skbedit_ptype(act)) {
entry->id = FLOW_ACTION_PTYPE;
entry->ptype = tcf_skbedit_ptype(act);
+ } else if (is_tcf_skbedit_priority(act)) {
+ entry->id = FLOW_ACTION_PRIORITY;
+ entry->priority = tcf_skbedit_priority(act);
} else {
err = -EOPNOTSUPP;
- goto err_out;
+ goto err_out_locked;
}
+ spin_unlock_bh(&act->tcfa_lock);
if (!is_tcf_pedit(act))
j++;
}
err_out:
- if (!rtnl_held)
- rtnl_unlock();
-
if (err)
tc_cleanup_flow_action(flow_action);
return err;
+err_out_locked:
+ spin_unlock_bh(&act->tcfa_lock);
+ goto err_out;
}
EXPORT_SYMBOL(tc_setup_flow_action);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index f9c0d1e8d380..74a0febcafb8 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -305,6 +305,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct cls_fl_filter *f;
list_for_each_entry_rcu(mask, &head->masks, list) {
+ flow_dissector_init_keys(&skb_key.control, &skb_key.basic);
fl_clear_masked_range(&skb_key, mask);
skb_flow_dissect_meta(skb, &mask->dissector, &skb_key);
@@ -449,8 +450,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
cls_flower.rule->match.key = &f->mkey;
cls_flower.classid = f->res.classid;
- err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
- rtnl_held);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
if (err) {
kfree(cls_flower.rule);
if (skip_sw) {
@@ -492,7 +492,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
cls_flower.stats.pkts,
- cls_flower.stats.lastused);
+ cls_flower.stats.lastused,
+ cls_flower.stats.used_hw_stats,
+ cls_flower.stats.used_hw_stats_valid);
}
static void __fl_put(struct cls_fl_filter *f)
@@ -691,6 +693,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
.len = 128 / BITS_PER_BYTE },
[TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY,
.len = 128 / BITS_PER_BYTE },
+ [TCA_FLOWER_FLAGS] = { .type = NLA_U32 },
};
static const struct nla_policy
@@ -737,7 +740,8 @@ static void fl_set_key_val(struct nlattr **tb,
}
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
- struct fl_flow_key *mask)
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
{
fl_set_key_val(tb, &key->tp_range.tp_min.dst,
TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst,
@@ -752,20 +756,30 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src,
TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
- if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
- htons(key->tp_range.tp_max.dst) <=
- htons(key->tp_range.tp_min.dst)) ||
- (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
- htons(key->tp_range.tp_max.src) <=
- htons(key->tp_range.tp_min.src)))
+ if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
+ htons(key->tp_range.tp_max.dst) <=
+ htons(key->tp_range.tp_min.dst)) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_PORT_DST_MIN],
+ "Invalid destination port range (min must be strictly smaller than max)");
return -EINVAL;
+ }
+ if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
+ htons(key->tp_range.tp_max.src) <=
+ htons(key->tp_range.tp_min.src)) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_PORT_SRC_MIN],
+ "Invalid source port range (min must be strictly smaller than max)");
+ return -EINVAL;
+ }
return 0;
}
static int fl_set_key_mpls(struct nlattr **tb,
struct flow_dissector_key_mpls *key_val,
- struct flow_dissector_key_mpls *key_mask)
+ struct flow_dissector_key_mpls *key_mask,
+ struct netlink_ext_ack *extack)
{
if (tb[TCA_FLOWER_KEY_MPLS_TTL]) {
key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]);
@@ -774,24 +788,36 @@ static int fl_set_key_mpls(struct nlattr **tb,
if (tb[TCA_FLOWER_KEY_MPLS_BOS]) {
u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]);
- if (bos & ~MPLS_BOS_MASK)
+ if (bos & ~MPLS_BOS_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_BOS],
+ "Bottom Of Stack (BOS) must be 0 or 1");
return -EINVAL;
+ }
key_val->mpls_bos = bos;
key_mask->mpls_bos = MPLS_BOS_MASK;
}
if (tb[TCA_FLOWER_KEY_MPLS_TC]) {
u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]);
- if (tc & ~MPLS_TC_MASK)
+ if (tc & ~MPLS_TC_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_TC],
+ "Traffic Class (TC) must be between 0 and 7");
return -EINVAL;
+ }
key_val->mpls_tc = tc;
key_mask->mpls_tc = MPLS_TC_MASK;
}
if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) {
u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]);
- if (label & ~MPLS_LABEL_MASK)
+ if (label & ~MPLS_LABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_LABEL],
+ "Label must be between 0 and 1048575");
return -EINVAL;
+ }
key_val->mpls_label = label;
key_mask->mpls_label = MPLS_LABEL_MASK;
}
@@ -832,14 +858,16 @@ static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
}
}
-static int fl_set_key_flags(struct nlattr **tb,
- u32 *flags_key, u32 *flags_mask)
+static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key,
+ u32 *flags_mask, struct netlink_ext_ack *extack)
{
u32 key, mask;
/* mask is mandatory for flags */
- if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+ if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) {
+ NL_SET_ERR_MSG(extack, "Missing flags mask");
return -EINVAL;
+ }
key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
@@ -1363,7 +1391,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
sizeof(key->icmp.code));
} else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) ||
key->basic.n_proto == htons(ETH_P_MPLS_MC)) {
- ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls);
+ ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls, extack);
if (ret)
return ret;
} else if (key->basic.n_proto == htons(ETH_P_ARP) ||
@@ -1388,7 +1416,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (key->basic.ip_proto == IPPROTO_TCP ||
key->basic.ip_proto == IPPROTO_UDP ||
key->basic.ip_proto == IPPROTO_SCTP) {
- ret = fl_set_key_port_range(tb, key, mask);
+ ret = fl_set_key_port_range(tb, key, mask, extack);
if (ret)
return ret;
}
@@ -1450,7 +1478,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
return ret;
if (tb[TCA_FLOWER_KEY_FLAGS])
- ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+ ret = fl_set_key_flags(tb, &key->control.flags,
+ &mask->control.flags, extack);
return ret;
}
@@ -1999,8 +2028,7 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
cls_flower.rule->match.mask = &f->mask->key;
cls_flower.rule->match.key = &f->mkey;
- err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts,
- true);
+ err = tc_setup_flow_action(&cls_flower.rule->action, &f->exts);
if (err) {
kfree(cls_flower.rule);
if (tc_skip_sw(f->flags)) {
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 039cc86974f4..8d39dbcf1746 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -97,7 +97,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
cls_mall.command = TC_CLSMATCHALL_REPLACE;
cls_mall.cookie = cookie;
- err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
if (err) {
kfree(cls_mall.rule);
mall_destroy_hw_filter(tp, head, cookie, NULL);
@@ -157,6 +157,7 @@ static void *mall_get(struct tcf_proto *tp, u32 handle)
static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = {
[TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC },
[TCA_MATCHALL_CLASSID] = { .type = NLA_U32 },
+ [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 },
};
static int mall_set_parms(struct net *net, struct tcf_proto *tp,
@@ -301,7 +302,7 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb,
TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
cls_mall.cookie = (unsigned long)head;
- err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts, true);
+ err = tc_setup_flow_action(&cls_mall.rule->action, &head->exts);
if (err) {
kfree(cls_mall.rule);
if (add && tc_skip_sw(head->flags)) {
@@ -337,7 +338,9 @@ static void mall_stats_hw_filter(struct tcf_proto *tp,
tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes,
- cls_mall.stats.pkts, cls_mall.stats.lastused);
+ cls_mall.stats.pkts, cls_mall.stats.lastused,
+ cls_mall.stats.used_hw_stats,
+ cls_mall.stats.used_hw_stats_valid);
}
static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 6f8786b06bde..5efa3e7ace15 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -534,8 +534,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
fp = &b->ht[h];
for (pfp = rtnl_dereference(*fp); pfp;
fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
- if (pfp == f) {
- *fp = f->next;
+ if (pfp == fold) {
+ rcu_assign_pointer(*fp, fold->next);
break;
}
}
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 09b7dc5fe7e0..9904299424a1 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -261,8 +261,10 @@ static void tcindex_partial_destroy_work(struct work_struct *work)
struct tcindex_data,
rwork);
+ rtnl_lock();
kfree(p->perfect);
kfree(p);
+ rtnl_unlock();
}
static void tcindex_free_perfect_hash(struct tcindex_data *cp)
@@ -357,6 +359,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
if (tcindex_alloc_perfect_hash(net, cp) < 0)
goto errout;
+ cp->alloc_hash = cp->hash;
for (i = 0; i < min(cp->hash, p->hash); i++)
cp->perfect[i].res = p->perfect[i].res;
balloc = 1;
diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
index 9fff6480acc6..eecfe072c508 100644
--- a/net/sched/em_ipt.c
+++ b/net/sched/em_ipt.c
@@ -22,7 +22,7 @@ struct em_ipt_match {
const struct xt_match *match;
u32 hook;
u8 nfproto;
- u8 match_data[0] __aligned(8);
+ u8 match_data[] __aligned(8);
};
struct em_ipt_xt_match {
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 88c7ce42df7e..2c1192a2ee5e 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -16,7 +16,7 @@
struct nbyte_data {
struct tcf_em_nbyte hdr;
- char pattern[0];
+ char pattern[]; /* flexible array member replacing the zero-length array */
};
static int em_nbyte_change(struct net *net, void *data, int data_len,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 50794125bf02..0d99df1e764d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -618,21 +618,28 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
}
EXPORT_SYMBOL(qdisc_watchdog_init);
-void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
+void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
+ u64 delta_ns) /* @delta_ns: range handed on to hrtimer_start_range_ns() */
{
if (test_bit(__QDISC_STATE_DEACTIVATED,
&qdisc_root_sleeping(wd->qdisc)->state))
return;
- if (wd->last_expires == expires)
- return;
+ if (hrtimer_is_queued(&wd->timer)) { /* last_expires is only consulted while the timer is queued */
+ /* If timer is already set in [expires, expires + delta_ns],
+ * do not reprogram it.
+ */
+ if (wd->last_expires - expires <= delta_ns)
+ return;
+ }
wd->last_expires = expires;
- hrtimer_start(&wd->timer,
- ns_to_ktime(expires),
- HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_range_ns(&wd->timer, /* replaces the plain hrtimer_start() call */
+ ns_to_ktime(expires),
+ delta_ns,
+ HRTIMER_MODE_ABS_PINNED);
}
-EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
+EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index f4f9b8cdbffb..ee12ca9f55b4 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -58,7 +58,7 @@ struct atm_flow_data {
struct atm_flow_data *excess; /* flow for excess traffic;
NULL to set CLP instead */
int hdr_len;
- unsigned char hdr[0]; /* header data; MUST BE LAST */
+ unsigned char hdr[]; /* header data; MUST BE LAST */
};
struct atm_qdisc_data {
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index b2905b03a432..2eaac2ff380f 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -181,6 +181,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
s64 credits;
int len;
+ /* The previous packet is still being sent */
+ if (now < q->last) {
+ qdisc_watchdog_schedule_ns(&q->watchdog, q->last);
+ return NULL;
+ }
if (q->credits < 0) {
credits = timediff_to_credits(now - q->last, q->idleslope);
@@ -212,7 +217,12 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
credits += q->credits;
q->credits = max_t(s64, credits, q->locredit);
- q->last = now;
+ /* Estimate of the transmission of the last byte of the packet in ns */
+ if (unlikely(atomic64_read(&q->port_rate) == 0))
+ q->last = now;
+ else
+ q->last = now + div64_s64(len * NSEC_PER_SEC,
+ atomic64_read(&q->port_rate));
return skb;
}
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 37c8aa75d70c..a579a4131d22 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -12,6 +12,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
/* 1 band FIFO pseudo-"scheduler" */
@@ -51,8 +52,49 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return NET_XMIT_CN;
}
-static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
+static void fifo_offload_init(struct Qdisc *sch)
+{ /* Send a TC_FIFO_REPLACE command for @sch to the device's ndo_setup_tc, when offload is possible. */
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_fifo_qopt_offload qopt; /* only command, handle and parent are filled in below */
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return; /* device cannot offload or has no ndo_setup_tc: nothing to do */
+
+ qopt.command = TC_FIFO_REPLACE;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); /* return value is ignored */
+}
+
+static void fifo_offload_destroy(struct Qdisc *sch)
+{ /* Send a TC_FIFO_DESTROY command for @sch to the device's ndo_setup_tc, when offload is possible. */
+ struct net_device *dev = qdisc_dev(sch);
+ struct tc_fifo_qopt_offload qopt; /* only command, handle and parent are filled in below */
+
+ if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+ return; /* device cannot offload or has no ndo_setup_tc: nothing to do */
+
+ qopt.command = TC_FIFO_DESTROY;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_FIFO, &qopt); /* return value is ignored */
+}
+
+static int fifo_offload_dump(struct Qdisc *sch)
+{ /* Build a TC_FIFO_STATS request for @sch and pass it to qdisc_offload_dump_helper(). */
+ struct tc_fifo_qopt_offload qopt;
+
+ qopt.command = TC_FIFO_STATS;
+ qopt.handle = sch->handle;
+ qopt.parent = sch->parent;
+ qopt.stats.bstats = &sch->bstats; /* the request points at the qdisc's own bstats ... */
+ qopt.stats.qstats = &sch->qstats; /* ... and qstats */
+
+ return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_FIFO, &qopt);
+}
+
+static int __fifo_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
{
bool bypass;
bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
@@ -82,10 +124,35 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
sch->flags |= TCQ_F_CAN_BYPASS;
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
+
return 0;
}
-static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+static int fifo_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{ /* __fifo_init() followed by the offload notification; used as .init and .change of pfifo and bfifo. */
+ int err;
+
+ err = __fifo_init(sch, opt, extack);
+ if (err)
+ return err; /* no offload call when the software setup failed */
+
+ fifo_offload_init(sch);
+ return 0;
+}
+
+static int fifo_hd_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{ /* .init/.change of pfifo_head_drop: __fifo_init() only, without the offload call. */
+ return __fifo_init(sch, opt, extack);
+}
+
+static void fifo_destroy(struct Qdisc *sch)
+{ /* .destroy of pfifo and bfifo: only sends the offload destroy command. */
+ fifo_offload_destroy(sch);
+}
+
+static int __fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct tc_fifo_qopt opt = { .limit = sch->limit };
@@ -97,6 +164,22 @@ nla_put_failure:
return -1;
}
+static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
+{ /* .dump of pfifo and bfifo: run the offload stats request, then the regular dump. */
+ int err;
+
+ err = fifo_offload_dump(sch);
+ if (err)
+ return err; /* an offload dump error aborts the dump */
+
+ return __fifo_dump(sch, skb);
+}
+
+static int fifo_hd_dump(struct Qdisc *sch, struct sk_buff *skb)
+{ /* .dump of pfifo_head_drop: __fifo_dump() only, without the offload stats request. */
+ return __fifo_dump(sch, skb);
+}
+
struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.id = "pfifo",
.priv_size = 0,
@@ -104,6 +187,7 @@ struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.init = fifo_init,
+ .destroy = fifo_destroy,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
@@ -118,6 +202,7 @@ struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.init = fifo_init,
+ .destroy = fifo_destroy,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
@@ -131,10 +216,10 @@ struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
.enqueue = pfifo_tail_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
- .init = fifo_init,
+ .init = fifo_hd_init,
.reset = qdisc_reset_queue,
- .change = fifo_init,
- .dump = fifo_dump,
+ .change = fifo_hd_init,
+ .dump = fifo_hd_dump,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index a5a295477ecc..4c060134c736 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -121,6 +121,8 @@ struct fq_sched_data {
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
+
+ u32 timer_slack; /* hrtimer slack in ns */
struct qdisc_watchdog watchdog;
};
@@ -504,8 +506,9 @@ begin:
head = &q->old_flows;
if (!head->first) {
if (q->time_next_delayed_flow != ~0ULL)
- qdisc_watchdog_schedule_ns(&q->watchdog,
- q->time_next_delayed_flow);
+ qdisc_watchdog_schedule_range_ns(&q->watchdog,
+ q->time_next_delayed_flow,
+ q->timer_slack);
return NULL;
}
}
@@ -735,6 +738,8 @@ static int fq_resize(struct Qdisc *sch, u32 log)
}
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
+ [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK },
+
[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
@@ -744,8 +749,10 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
+ [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 },
[TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 },
[TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 },
+ [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
@@ -832,6 +839,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
q->ce_threshold = (u64)NSEC_PER_USEC *
nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]);
+ if (tb[TCA_FQ_TIMER_SLACK])
+ q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]);
+
if (!err) {
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
@@ -883,6 +893,8 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->orphan_mask = 1024 - 1;
q->low_rate_threshold = 550000 / 8;
+ q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */
+
/* Default ce_threshold of 4294 seconds */
q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
@@ -923,7 +935,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD,
q->low_rate_threshold) ||
nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) ||
- nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
+ nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) ||
+ nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack))
goto nla_put_failure;
return nla_nest_end(skb, opts);
@@ -946,7 +959,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.flows_plimit = q->stat_flows_plimit;
st.pkts_too_long = q->stat_pkts_too_long;
st.allocation_errors = q->stat_allocation_errors;
- st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns();
+ st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack -
+ ktime_get_ns();
st.flows = q->flows;
st.inactive_flows = q->inactive_flows;
st.throttled_flows = q->throttled_flows;
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index 214657eb3dfd..a9da8776bf5b 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -189,7 +189,6 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
out:
q->stats.dropped++;
sel_flow->vars.accu_prob = 0;
- sel_flow->vars.accu_prob_overflows = 0;
__qdisc_drop(skb, to_free);
qdisc_qstats_drop(sch);
return NET_XMIT_CN;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6c9595f1048a..2efd5b61acef 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1391,6 +1391,14 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);
+void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
+ struct tcf_block *block)
+{ /* Store @block in both mini qdiscs of the pair. */
+ miniqp->miniq1.block = block;
+ miniqp->miniq2.block = block;
+}
+EXPORT_SYMBOL(mini_qdisc_pair_block_init);
+
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
struct mini_Qdisc __rcu **p_miniq)
{
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index bf56aa519797..84838128b9c5 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -78,6 +78,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
{
struct ingress_sched_data *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ int err;
net_inc_ingress_queue();
@@ -87,7 +88,13 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
q->block_info.chain_head_change = clsact_chain_head_change;
q->block_info.chain_head_change_priv = &q->miniqp;
- return tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
+ err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
+ if (err)
+ return err;
+
+ mini_qdisc_pair_block_init(&q->miniqp, q->block);
+
+ return 0;
}
static void ingress_destroy(struct Qdisc *sch)
@@ -226,6 +233,8 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
if (err)
return err;
+ mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block);
+
mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 42e557d48e4e..84f82771cdf5 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -66,7 +66,7 @@
struct disttable {
u32 size;
- s16 table[0];
+ s16 table[];
};
struct netem_sched_data {
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index 915bcdb59a9f..c65077f0c0f3 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -31,7 +31,7 @@ struct pie_sched_data {
};
bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
- struct pie_vars *vars, u32 qlen, u32 packet_size)
+ struct pie_vars *vars, u32 backlog, u32 packet_size)
{
u64 rnd;
u64 local_prob = vars->prob;
@@ -51,7 +51,7 @@ bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
/* If we have fewer than 2 mtu-sized packets, disable pie_drop_early,
* similar to min_th in RED
*/
- if (qlen < 2 * mtu)
+ if (backlog < 2 * mtu)
return false;
/* If bytemode is turned on, use packet size to compute new
@@ -62,27 +62,19 @@ bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
else
local_prob = vars->prob;
- if (local_prob == 0) {
+ if (local_prob == 0)
vars->accu_prob = 0;
- vars->accu_prob_overflows = 0;
- }
-
- if (local_prob > MAX_PROB - vars->accu_prob)
- vars->accu_prob_overflows++;
-
- vars->accu_prob += local_prob;
+ else
+ vars->accu_prob += local_prob;
- if (vars->accu_prob_overflows == 0 &&
- vars->accu_prob < (MAX_PROB / 100) * 85)
+ if (vars->accu_prob < (MAX_PROB / 100) * 85)
return false;
- if (vars->accu_prob_overflows == 8 &&
- vars->accu_prob >= MAX_PROB / 2)
+ if (vars->accu_prob >= (MAX_PROB / 2) * 17)
return true;
prandom_bytes(&rnd, 8);
- if (rnd < local_prob) {
+ if ((rnd >> BITS_PER_BYTE) < local_prob) {
vars->accu_prob = 0;
- vars->accu_prob_overflows = 0;
return true;
}
@@ -129,7 +121,6 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
out:
q->stats.dropped++;
q->vars.accu_prob = 0;
- q->vars.accu_prob_overflows = 0;
return qdisc_drop(skb, sch, to_free);
}
@@ -215,7 +206,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt,
}
void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
- struct pie_vars *vars, u32 qlen)
+ struct pie_vars *vars, u32 backlog)
{
psched_time_t now = psched_get_time();
u32 dtime = 0;
@@ -231,7 +222,7 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
vars->dq_tstamp = now;
- if (qlen == 0)
+ if (backlog == 0)
vars->qdelay = 0;
if (dtime == 0)
@@ -244,7 +235,7 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
* we have enough packets to calculate the drain rate. Save
* current time as dq_tstamp and start measurement cycle.
*/
- if (qlen >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) {
+ if (backlog >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) {
vars->dq_tstamp = psched_get_time();
vars->dq_count = 0;
}
@@ -283,7 +274,7 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
* dq_count to 0 to re-enter the if block when the next
* packet is dequeued
*/
- if (qlen < QUEUE_THRESHOLD) {
+ if (backlog < QUEUE_THRESHOLD) {
vars->dq_count = DQCOUNT_INVALID;
} else {
vars->dq_count = 0;
@@ -307,7 +298,7 @@ burst_allowance_reduction:
EXPORT_SYMBOL_GPL(pie_process_dequeue);
void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
- u32 qlen)
+ u32 backlog)
{
psched_time_t qdelay = 0; /* in pschedtime */
psched_time_t qdelay_old = 0; /* in pschedtime */
@@ -322,7 +313,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
vars->qdelay_old = vars->qdelay;
if (vars->avg_dq_rate > 0)
- qdelay = (qlen << PIE_SCALE) / vars->avg_dq_rate;
+ qdelay = (backlog << PIE_SCALE) / vars->avg_dq_rate;
else
qdelay = 0;
} else {
@@ -330,10 +321,10 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
qdelay_old = vars->qdelay_old;
}
- /* If qdelay is zero and qlen is not, it means qlen is very small,
+ /* If qdelay is zero and backlog is not, it means backlog is very small,
* so we do not update probabilty in this round.
*/
- if (qdelay == 0 && qlen != 0)
+ if (qdelay == 0 && backlog != 0)
update_prob = false;
/* In the algorithm, alpha and beta are between 0 and 2 with typical
@@ -363,8 +354,8 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
}
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
- delta += alpha * (u64)(qdelay - params->target);
- delta += beta * (u64)(qdelay - qdelay_old);
+ delta += alpha * (qdelay - params->target);
+ delta += beta * (qdelay - qdelay_old);
oldprob = vars->prob;
@@ -409,7 +400,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
vars->prob -= vars->prob / 64;
vars->qdelay = qdelay;
- vars->qlen_old = qlen;
+ vars->backlog_old = backlog;
/* We restart the measurement cycle if the following conditions are met
* 1. If the delay has been low for 2 consecutive Tupdate periods
@@ -502,7 +493,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
- .prob = q->vars.prob,
+ .prob = q->vars.prob << BITS_PER_BYTE,
.delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
.packets_in = q->stats.packets_in,
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 1695421333e3..c7de47c942e3 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -35,7 +35,11 @@
struct red_sched_data {
u32 limit; /* HARD maximal queue length */
+
unsigned char flags;
+ /* Non-flags in tc_red_qopt.flags. */
+ unsigned char userbits;
+
struct timer_list adapt_timer;
struct Qdisc *sch;
struct red_parms parms;
@@ -44,6 +48,8 @@ struct red_sched_data {
struct Qdisc *qdisc;
};
+static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS | TC_RED_NODROP;
+
static inline int red_use_ecn(struct red_sched_data *q)
{
return q->flags & TC_RED_ECN;
@@ -54,6 +60,11 @@ static inline int red_use_harddrop(struct red_sched_data *q)
return q->flags & TC_RED_HARDDROP;
}
+static int red_use_nodrop(struct red_sched_data *q)
+{
+ return q->flags & TC_RED_NODROP;
+}
+
static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -74,23 +85,36 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
- if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
+ if (!red_use_ecn(q)) {
q->stats.prob_drop++;
goto congestion_drop;
}
- q->stats.prob_mark++;
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.prob_mark++;
+ } else if (!red_use_nodrop(q)) {
+ q->stats.prob_drop++;
+ goto congestion_drop;
+ }
+
+ /* Non-ECT packet in ECN nodrop mode: queue it. */
break;
case RED_HARD_MARK:
qdisc_qstats_overlimit(sch);
- if (red_use_harddrop(q) || !red_use_ecn(q) ||
- !INET_ECN_set_ce(skb)) {
+ if (red_use_harddrop(q) || !red_use_ecn(q)) {
+ q->stats.forced_drop++;
+ goto congestion_drop;
+ }
+
+ if (INET_ECN_set_ce(skb)) {
+ q->stats.forced_mark++;
+ } else if (!red_use_nodrop(q)) {
q->stats.forced_drop++;
goto congestion_drop;
}
- q->stats.forced_mark++;
+ /* Non-ECT packet in ECN nodrop mode: queue it. */
break;
}
@@ -165,6 +189,7 @@ static int red_offload(struct Qdisc *sch, bool enable)
opt.set.limit = q->limit;
opt.set.is_ecn = red_use_ecn(q);
opt.set.is_harddrop = red_use_harddrop(q);
+ opt.set.is_nodrop = red_use_nodrop(q);
opt.set.qstats = &sch->qstats;
} else {
opt.command = TC_RED_DESTROY;
@@ -183,9 +208,12 @@ static void red_destroy(struct Qdisc *sch)
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
+ [TCA_RED_UNSPEC] = { .strict_start_type = TCA_RED_FLAGS },
[TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_RED_STAB] = { .len = RED_STAB_SIZE },
[TCA_RED_MAX_P] = { .type = NLA_U32 },
+ [TCA_RED_FLAGS] = { .type = NLA_BITFIELD32,
+ .validation_data = &red_supported_flags },
};
static int red_change(struct Qdisc *sch, struct nlattr *opt,
@@ -194,7 +222,10 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
struct Qdisc *old_child = NULL, *child = NULL;
struct red_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_RED_MAX + 1];
+ struct nla_bitfield32 flags_bf;
struct tc_red_qopt *ctl;
+ unsigned char userbits;
+ unsigned char flags;
int err;
u32 max_P;
@@ -216,6 +247,12 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
return -EINVAL;
+ err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS,
+ tb[TCA_RED_FLAGS], red_supported_flags,
+ &flags_bf, &userbits, extack);
+ if (err)
+ return err;
+
if (ctl->limit > 0) {
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit,
extack);
@@ -227,7 +264,14 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
}
sch_tree_lock(sch);
- q->flags = ctl->flags;
+
+ flags = (q->flags & ~flags_bf.selector) | flags_bf.value;
+ err = red_validate_flags(flags, extack);
+ if (err)
+ goto unlock_out;
+
+ q->flags = flags;
+ q->userbits = userbits;
q->limit = ctl->limit;
if (child) {
qdisc_tree_flush_backlog(q->qdisc);
@@ -256,6 +300,12 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
if (old_child)
qdisc_put(old_child);
return 0;
+
+unlock_out:
+ sch_tree_unlock(sch);
+ if (child)
+ qdisc_put(child);
+ return err;
}
static inline void red_adaptative_timer(struct timer_list *t)
@@ -302,7 +352,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *opts = NULL;
struct tc_red_qopt opt = {
.limit = q->limit,
- .flags = q->flags,
+ .flags = (q->flags & TC_RED_HISTORIC_FLAGS) |
+ q->userbits,
.qth_min = q->parms.qth_min >> q->parms.Wlog,
.qth_max = q->parms.qth_max >> q->parms.Wlog,
.Wlog = q->parms.Wlog,
@@ -319,7 +370,9 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
- nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
+ nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) ||
+ nla_put_bitfield32(skb, TCA_RED_FLAGS,
+ q->flags, red_supported_flags))
goto nla_put_failure;
return nla_nest_end(skb, opts);
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 660fc45ee40f..b1eb12d33b9a 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -564,8 +564,10 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch)
prio = skb->priority;
tc = netdev_get_prio_tc_map(dev, prio);
- if (!(gate_mask & BIT(tc)))
+ if (!(gate_mask & BIT(tc))) {
+ skb = NULL;
continue;
+ }
len = qdisc_pkt_len(skb);
guard = ktime_add_ns(taprio_get_time(q),
@@ -575,13 +577,17 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch)
* guard band ...
*/
if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- ktime_after(guard, entry->close_time))
+ ktime_after(guard, entry->close_time)) {
+ skb = NULL;
continue;
+ }
/* ... and no budget. */
if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- atomic_sub_return(len, &entry->budget) < 0)
+ atomic_sub_return(len, &entry->budget) < 0) {
+ skb = NULL;
continue;
+ }
skb = child->ops->dequeue(child);
if (unlikely(!skb))
@@ -768,6 +774,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = { .type = NLA_S64 },
[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
[TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 },
+ [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 },
};
static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index 8a15146faaeb..493fc01e5d2b 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -237,15 +237,11 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc)
addrcnt++;
return nla_total_size(sizeof(struct sctp_info))
- + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
- + nla_total_size(1) /* INET_DIAG_TOS */
- + nla_total_size(1) /* INET_DIAG_TCLASS */
- + nla_total_size(4) /* INET_DIAG_MARK */
- + nla_total_size(4) /* INET_DIAG_CLASS_ID */
+ nla_total_size(addrlen * asoc->peer.transport_count)
+ nla_total_size(addrlen * addrcnt)
- + nla_total_size(sizeof(struct inet_diag_meminfo))
+ nla_total_size(sizeof(struct inet_diag_msg))
+ + inet_diag_msg_attrs_size()
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ 64;
}
@@ -432,11 +428,12 @@ static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo);
}
-static int sctp_diag_dump_one(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+static int sctp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
+ struct sk_buff *in_skb = cb->skb;
struct net *net = sock_net(in_skb->sk);
+ const struct nlmsghdr *nlh = cb->nlh;
union sctp_addr laddr, paddr;
struct sctp_comm_param commp = {
.skb = in_skb,
@@ -470,7 +467,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb,
}
static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
u32 idiag_states = r->idiag_states;
struct net *net = sock_net(skb->sk);
diff --git a/net/sctp/input.c b/net/sctp/input.c
index efaaefc3bb1c..55d4fc6f371d 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -548,6 +548,7 @@ out:
/* Common cleanup code for icmp/icmpv6 error handler. */
void sctp_err_finish(struct sock *sk, struct sctp_transport *t)
+ __releases(&((__sk)->sk_lock.slock))
{
bh_unlock_sock(sk);
sctp_transport_put(t);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index bc734cfaa29e..c87af430107a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -228,7 +228,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
{
struct sctp_association *asoc = t->asoc;
struct dst_entry *dst = NULL;
- struct flowi6 *fl6 = &fl->u.ip6;
+ struct flowi _fl;
+ struct flowi6 *fl6 = &_fl.u.ip6;
struct sctp_bind_addr *bp;
struct ipv6_pinfo *np = inet6_sk(sk);
struct sctp_sockaddr_entry *laddr;
@@ -238,7 +239,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
enum sctp_scope scope;
__u8 matchlen = 0;
- memset(fl6, 0, sizeof(struct flowi6));
+ memset(&_fl, 0, sizeof(_fl));
fl6->daddr = daddr->v6.sin6_addr;
fl6->fl6_dport = daddr->v6.sin6_port;
fl6->flowi6_proto = IPPROTO_SCTP;
@@ -276,8 +277,11 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
rcu_read_unlock();
dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
- if (!asoc || saddr)
+ if (!asoc || saddr) {
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
goto out;
+ }
bp = &asoc->base.bind_addr;
scope = sctp_scope(daddr);
@@ -300,6 +304,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
if ((laddr->a.sa.sa_family == AF_INET6) &&
(sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) {
rcu_read_unlock();
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
goto out;
}
}
@@ -338,6 +344,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
if (!IS_ERR_OR_NULL(dst))
dst_release(dst);
dst = bdst;
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
break;
}
@@ -351,6 +359,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
dst_release(dst);
dst = bdst;
matchlen = bmatchlen;
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
}
rcu_read_unlock();
@@ -359,14 +369,12 @@ out:
struct rt6_info *rt;
rt = (struct rt6_info *)dst;
- t->dst = dst;
t->dst_cookie = rt6_get_cookie(rt);
pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
&rt->rt6i_dst.addr, rt->rt6i_dst.plen,
- &fl6->saddr);
+ &fl->u.ip6.saddr);
} else {
t->dst = NULL;
-
pr_debug("no route\n");
}
}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 78af2fcf90cc..092d1afdee0d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -409,7 +409,8 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
{
struct sctp_association *asoc = t->asoc;
struct rtable *rt;
- struct flowi4 *fl4 = &fl->u.ip4;
+ struct flowi _fl;
+ struct flowi4 *fl4 = &_fl.u.ip4;
struct sctp_bind_addr *bp;
struct sctp_sockaddr_entry *laddr;
struct dst_entry *dst = NULL;
@@ -419,7 +420,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
if (t->dscp & SCTP_DSCP_SET_MASK)
tos = t->dscp & SCTP_DSCP_VAL_MASK;
- memset(fl4, 0x0, sizeof(struct flowi4));
+ memset(&_fl, 0x0, sizeof(_fl));
fl4->daddr = daddr->v4.sin_addr.s_addr;
fl4->fl4_dport = daddr->v4.sin_port;
fl4->flowi4_proto = IPPROTO_SCTP;
@@ -438,8 +439,11 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
&fl4->saddr);
rt = ip_route_output_key(sock_net(sk), fl4);
- if (!IS_ERR(rt))
+ if (!IS_ERR(rt)) {
dst = &rt->dst;
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
+ }
/* If there is no association or if a source address is passed, no
* more validation is required.
@@ -502,27 +506,33 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
false);
if (!odev || odev->ifindex != fl4->flowi4_oif) {
- if (!dst)
+ if (!dst) {
dst = &rt->dst;
- else
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
+ } else {
dst_release(&rt->dst);
+ }
continue;
}
dst_release(dst);
dst = &rt->dst;
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
break;
}
out_unlock:
rcu_read_unlock();
out:
- t->dst = dst;
- if (dst)
+ if (dst) {
pr_debug("rt_dst:%pI4, rt_src:%pI4\n",
- &fl4->daddr, &fl4->saddr);
- else
+ &fl->u.ip4.daddr, &fl->u.ip4.saddr);
+ } else {
+ t->dst = NULL;
pr_debug("no route\n");
+ }
}
/* For v4, the source address is cached in the route entry(dst). So no need
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 748e3b19ec1d..6a16af4b1ef6 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -170,6 +170,16 @@ static inline bool sctp_chunk_length_valid(struct sctp_chunk *chunk,
return true;
}
+/* Check for format error in an ABORT chunk */
+static inline bool sctp_err_chunk_valid(struct sctp_chunk *chunk)
+{
+ struct sctp_errhdr *err;
+
+ sctp_walk_errors(err, chunk->chunk_hdr);
+
+ return (void *)err == (void *)chunk->chunk_end;
+}
+
/**********************************************************
* These are the state functions for handling chunk events.
**********************************************************/
@@ -2255,6 +2265,9 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort(
sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+ if (!sctp_err_chunk_valid(chunk))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
}
@@ -2298,6 +2311,9 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort(
sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+ if (!sctp_err_chunk_valid(chunk))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
/* Stop the T2-shutdown timer. */
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
@@ -2565,6 +2581,9 @@ enum sctp_disposition sctp_sf_do_9_1_abort(
sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest))
return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands);
+ if (!sctp_err_chunk_valid(chunk))
+ return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+
return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands);
}
@@ -2582,16 +2601,8 @@ static enum sctp_disposition __sctp_sf_do_9_1_abort(
/* See if we have an error cause code in the chunk. */
len = ntohs(chunk->chunk_hdr->length);
- if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) {
- struct sctp_errhdr *err;
-
- sctp_walk_errors(err, chunk->chunk_hdr);
- if ((void *)err != (void *)chunk->chunk_end)
- return sctp_sf_pdiscard(net, ep, asoc, type, arg,
- commands);
-
+ if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
error = ((struct sctp_errhdr *)chunk->skb->data)->cause;
- }
sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET));
/* ASSOC_FAILED will DELETE_TCB. */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 1b56fc440606..827a9903ee28 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -147,29 +147,44 @@ static void sctp_clear_owner_w(struct sctp_chunk *chunk)
skb_orphan(chunk->skb);
}
+#define traverse_and_process() \
+do { \
+ msg = chunk->msg; \
+ if (msg == prev_msg) \
+ continue; \
+ list_for_each_entry(c, &msg->chunks, frag_list) { \
+ if ((clear && asoc->base.sk == c->skb->sk) || \
+ (!clear && asoc->base.sk != c->skb->sk)) \
+ cb(c); \
+ } \
+ prev_msg = msg; \
+} while (0)
+
static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
+ bool clear,
void (*cb)(struct sctp_chunk *))
{
+ struct sctp_datamsg *msg, *prev_msg = NULL;
struct sctp_outq *q = &asoc->outqueue;
+ struct sctp_chunk *chunk, *c;
struct sctp_transport *t;
- struct sctp_chunk *chunk;
list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
list_for_each_entry(chunk, &t->transmitted, transmitted_list)
- cb(chunk);
+ traverse_and_process();
list_for_each_entry(chunk, &q->retransmit, transmitted_list)
- cb(chunk);
+ traverse_and_process();
list_for_each_entry(chunk, &q->sacked, transmitted_list)
- cb(chunk);
+ traverse_and_process();
list_for_each_entry(chunk, &q->abandoned, transmitted_list)
- cb(chunk);
+ traverse_and_process();
list_for_each_entry(chunk, &q->out_chunk_list, list)
- cb(chunk);
+ traverse_and_process();
}
static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk,
@@ -5333,14 +5348,14 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
EXPORT_SYMBOL_GPL(sctp_get_sctp_info);
/* use callback to avoid exporting the core structure */
-void sctp_transport_walk_start(struct rhashtable_iter *iter)
+void sctp_transport_walk_start(struct rhashtable_iter *iter) __acquires(RCU)
{
rhltable_walk_enter(&sctp_transport_hashtable, iter);
rhashtable_walk_start(iter);
}
-void sctp_transport_walk_stop(struct rhashtable_iter *iter)
+void sctp_transport_walk_stop(struct rhashtable_iter *iter) __releases(RCU)
{
rhashtable_walk_stop(iter);
rhashtable_walk_exit(iter);
@@ -9574,9 +9589,9 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
* paths won't try to lock it and then oldsk.
*/
lock_sock_nested(newsk, SINGLE_DEPTH_NESTING);
- sctp_for_each_tx_datachunk(assoc, sctp_clear_owner_w);
+ sctp_for_each_tx_datachunk(assoc, true, sctp_clear_owner_w);
sctp_assoc_migrate(assoc, newsk);
- sctp_for_each_tx_datachunk(assoc, sctp_set_owner_w);
+ sctp_for_each_tx_datachunk(assoc, false, sctp_set_owner_w);
/* If the association on the newsk is already closed before accept()
* is called, set RCV_SHUTDOWN flag.
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index cee5bf4a9bb9..6fd44bdb0fc3 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -470,6 +470,8 @@ static void smc_switch_to_fallback(struct smc_sock *smc)
if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
smc->clcsock->file = smc->sk.sk_socket->file;
smc->clcsock->file->private_data = smc->clcsock;
+ smc->clcsock->wq.fasync_list =
+ smc->sk.sk_socket->wq.fasync_list;
}
}
@@ -510,15 +512,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
static int smc_connect_abort(struct smc_sock *smc, int reason_code,
int local_contact)
{
+ bool is_smcd = smc->conn.lgr->is_smcd;
+
if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(smc->conn.lgr);
- if (smc->conn.lgr->is_smcd)
+ smc_lgr_cleanup_early(&smc->conn);
+ else
+ smc_conn_free(&smc->conn);
+ if (is_smcd)
/* there is only one lgr role for SMC-D; use server lock */
mutex_unlock(&smc_server_lgr_pending);
else
mutex_unlock(&smc_client_lgr_pending);
- smc_conn_free(&smc->conn);
smc->connect_nonblock = 0;
return reason_code;
}
@@ -1089,7 +1094,6 @@ static void smc_listen_out_err(struct smc_sock *new_smc)
if (newsmcsk->sk_state == SMC_INIT)
sock_put(&new_smc->sk); /* passive closing */
newsmcsk->sk_state = SMC_CLOSED;
- smc_conn_free(&new_smc->conn);
smc_listen_out(new_smc);
}
@@ -1100,12 +1104,13 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
{
/* RDMA setup failed, switch back to TCP */
if (local_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
if (reason_code < 0) { /* error, no fallback possible */
smc_listen_out_err(new_smc);
return;
}
- smc_conn_free(&new_smc->conn);
smc_switch_to_fallback(new_smc);
new_smc->fallback_rsn = reason_code;
if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
@@ -1168,16 +1173,18 @@ static int smc_listen_ism_init(struct smc_sock *new_smc,
new_smc->conn.lgr->vlan_id,
new_smc->conn.lgr->smcd)) {
if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
- smc_conn_free(&new_smc->conn);
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
return SMC_CLC_DECL_SMCDNOTALK;
}
/* Create send and receive buffers */
if (smc_buf_create(new_smc, true)) {
if (ini->cln_first_contact == SMC_FIRST_CONTACT)
- smc_lgr_forget(new_smc->conn.lgr);
- smc_conn_free(&new_smc->conn);
+ smc_lgr_cleanup_early(&new_smc->conn);
+ else
+ smc_conn_free(&new_smc->conn);
return SMC_CLC_DECL_MEM;
}
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 0879f7bed967..ea0068f0173c 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -349,7 +349,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
smc->conn.lgr->sync_err = 1;
- smc_lgr_terminate(smc->conn.lgr, true);
+ smc_lgr_terminate_sched(smc->conn.lgr);
}
}
@@ -372,7 +372,10 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
dclc.hdr.version = SMC_CLC_V1;
dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0;
- memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
+ if ((!smc->conn.lgr || !smc->conn.lgr->is_smcd) &&
+ smc_ib_is_valid_local_systemid())
+ memcpy(dclc.id_for_peer, local_systemid,
+ sizeof(local_systemid));
dclc.peer_diagnosis = htonl(peer_diag_info);
memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2249de5379ee..824c5211b027 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -46,6 +46,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
struct smc_buf_desc *buf_desc);
+static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
/* return head of link group list and its lock for a given link group */
static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
@@ -162,6 +163,18 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
conn->lgr = NULL;
}
+void smc_lgr_cleanup_early(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ if (!lgr)
+ return;
+
+ smc_conn_free(conn);
+ smc_lgr_forget(lgr);
+ smc_lgr_schedule_free_work_fast(lgr);
+}
+
/* Send delete link, either as client to request the initiation
* of the DELETE LINK sequence from server; or as server to
* initiate the delete processing. See smc_llc_rx_delete_link().
@@ -229,7 +242,7 @@ static void smc_lgr_terminate_work(struct work_struct *work)
struct smc_link_group *lgr = container_of(work, struct smc_link_group,
terminate_work);
- smc_lgr_terminate(lgr, true);
+ __smc_lgr_terminate(lgr, true);
}
/* create a new SMC link group */
@@ -576,15 +589,15 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr)
} else {
struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
- wake_up(&lnk->wr_reg_wait);
- if (lnk->state != SMC_LNK_INACTIVE) {
- smc_link_send_delete(lnk, false);
+ if (lnk->state != SMC_LNK_INACTIVE)
smc_llc_link_inactive(lnk);
- }
}
}
-/* terminate link group */
+/* terminate link group
+ * @soft: true if link group shutdown can take its time
+ * false if immediate link group shutdown is required
+ */
static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
{
struct smc_connection *conn;
@@ -622,25 +635,20 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
smc_lgr_free(lgr);
}
-/* unlink and terminate link group
- * @soft: true if link group shutdown can take its time
- * false if immediate link group shutdown is required
- */
-void smc_lgr_terminate(struct smc_link_group *lgr, bool soft)
+/* unlink link group and schedule termination */
+void smc_lgr_terminate_sched(struct smc_link_group *lgr)
{
spinlock_t *lgr_lock;
smc_lgr_list_head(lgr, &lgr_lock);
spin_lock_bh(lgr_lock);
- if (lgr->terminating) {
+ if (list_empty(&lgr->list) || lgr->terminating || lgr->freeing) {
spin_unlock_bh(lgr_lock);
return; /* lgr already terminating */
}
- if (!soft)
- lgr->freeing = 1;
list_del_init(&lgr->list);
spin_unlock_bh(lgr_lock);
- __smc_lgr_terminate(lgr, soft);
+ schedule_work(&lgr->terminate_work);
}
/* Called when IB port is terminated */
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index c472e12951d1..8041db20c753 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -285,18 +285,13 @@ static inline struct smc_connection *smc_lgr_find_conn(
return res;
}
-static inline void smc_lgr_terminate_sched(struct smc_link_group *lgr)
-{
- if (!lgr->terminating && !lgr->freeing)
- schedule_work(&lgr->terminate_work);
-}
-
struct smc_sock;
struct smc_clc_msg_accept_confirm;
struct smc_clc_msg_local;
void smc_lgr_forget(struct smc_link_group *lgr);
-void smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
+void smc_lgr_cleanup_early(struct smc_connection *conn);
+void smc_lgr_terminate_sched(struct smc_link_group *lgr);
void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
unsigned short vlan);
@@ -316,7 +311,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
void smc_conn_free(struct smc_connection *conn);
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
-void smcd_conn_free(struct smc_connection *conn);
void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
int smc_core_init(void);
void smc_core_exit(void);
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index f38727ecf8b2..e1f64f4ba236 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -39,16 +39,15 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
+ memset(r, 0, sizeof(*r));
r->diag_family = sk->sk_family;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
if (!smc->clcsock)
return;
r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
r->id.idiag_dport = smc->clcsock->sk->sk_dport;
r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
- sock_diag_save_cookie(sk, r->id.idiag_cookie);
if (sk->sk_protocol == SMCPROTO_SMC) {
- memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
- memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 548632621f4b..04b6fefb8bce 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -37,11 +37,7 @@ struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
.list = LIST_HEAD_INIT(smc_ib_devices.list),
};
-#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
-
-u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
- * identifier
- */
+u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
@@ -168,6 +164,15 @@ static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
{
memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
sizeof(smcibdev->mac[ibport - 1]));
+}
+
+bool smc_ib_is_valid_local_systemid(void)
+{
+ return !is_zero_ether_addr(&local_systemid[2]);
+}
+
+static void smc_ib_init_local_systemid(void)
+{
get_random_bytes(&local_systemid[0], 2);
}
@@ -224,8 +229,7 @@ static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
rc = smc_ib_fill_mac(smcibdev, ibport);
if (rc)
goto out;
- if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
- sizeof(local_systemid)) &&
+ if (!smc_ib_is_valid_local_systemid() &&
smc_ib_port_active(smcibdev, ibport))
/* create unique system identifier */
smc_ib_define_local_systemid(smcibdev, ibport);
@@ -257,6 +261,7 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
struct ib_event *ibevent)
{
struct smc_ib_device *smcibdev;
+ bool schedule = false;
u8 port_idx;
smcibdev = container_of(handler, struct smc_ib_device, event_handler);
@@ -266,22 +271,35 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler,
/* terminate all ports on device */
for (port_idx = 0; port_idx < SMC_MAX_PORTS; port_idx++) {
set_bit(port_idx, &smcibdev->port_event_mask);
- set_bit(port_idx, smcibdev->ports_going_away);
+ if (!test_and_set_bit(port_idx,
+ smcibdev->ports_going_away))
+ schedule = true;
}
- schedule_work(&smcibdev->port_event_work);
+ if (schedule)
+ schedule_work(&smcibdev->port_event_work);
break;
- case IB_EVENT_PORT_ERR:
case IB_EVENT_PORT_ACTIVE:
- case IB_EVENT_GID_CHANGE:
port_idx = ibevent->element.port_num - 1;
- if (port_idx < SMC_MAX_PORTS) {
- set_bit(port_idx, &smcibdev->port_event_mask);
- if (ibevent->event == IB_EVENT_PORT_ERR)
- set_bit(port_idx, smcibdev->ports_going_away);
- else if (ibevent->event == IB_EVENT_PORT_ACTIVE)
- clear_bit(port_idx, smcibdev->ports_going_away);
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (test_and_clear_bit(port_idx, smcibdev->ports_going_away))
schedule_work(&smcibdev->port_event_work);
- }
+ break;
+ case IB_EVENT_PORT_ERR:
+ port_idx = ibevent->element.port_num - 1;
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ case IB_EVENT_GID_CHANGE:
+ port_idx = ibevent->element.port_num - 1;
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
break;
default:
break;
@@ -316,11 +334,11 @@ static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
case IB_EVENT_QP_FATAL:
case IB_EVENT_QP_ACCESS_ERR:
port_idx = ibevent->element.qp->port - 1;
- if (port_idx < SMC_MAX_PORTS) {
- set_bit(port_idx, &smcibdev->port_event_mask);
- set_bit(port_idx, smcibdev->ports_going_away);
+ if (port_idx >= SMC_MAX_PORTS)
+ break;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ if (!test_and_set_bit(port_idx, smcibdev->ports_going_away))
schedule_work(&smcibdev->port_event_work);
- }
break;
default:
break;
@@ -573,6 +591,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
struct smc_ib_device *smcibdev;
smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+ if (!smcibdev || smcibdev->ibdev != ibdev)
+ return;
ib_set_client_data(ibdev, &smc_ib_client, NULL);
spin_lock(&smc_ib_devices.lock);
list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
@@ -580,6 +600,7 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
smc_smcr_terminate_all(smcibdev);
smc_ib_cleanup_per_ibdev(smcibdev);
ib_unregister_event_handler(&smcibdev->event_handler);
+ cancel_work_sync(&smcibdev->port_event_work);
kfree(smcibdev);
}
@@ -591,6 +612,7 @@ static struct ib_client smc_ib_client = {
int __init smc_ib_register_client(void)
{
+ smc_ib_init_local_systemid();
return ib_register_client(&smc_ib_client);
}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 255db87547d3..5c2b115d36da 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -84,4 +84,5 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
enum dma_data_direction data_direction);
int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
unsigned short vlan_id, u8 gid[], u8 *sgid_index);
+bool smc_ib_is_valid_local_systemid(void);
#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index a9f6431dd69a..0e52aab53d97 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -614,7 +614,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
SMC_LLC_WAIT_TIME);
if (rc <= 0) {
- smc_lgr_terminate(smc_get_lgr(link), true);
+ smc_lgr_terminate_sched(smc_get_lgr(link));
return;
}
next_interval = link->llc_testlink_time;
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 0d42e7716b91..9f1ade86d70e 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -284,7 +284,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL);
if (rc)
- smc_lgr_terminate(lgr, true);
+ smc_lgr_terminate_sched(lgr);
return rc;
}
diff --git a/net/socket.c b/net/socket.c
index b79a05de7c6e..2dd739fba866 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1707,7 +1707,8 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
- int __user *upeer_addrlen, int flags)
+ int __user *upeer_addrlen, int flags,
+ unsigned long nofile)
{
struct socket *sock, *newsock;
struct file *newfile;
@@ -1738,7 +1739,7 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
*/
__module_get(newsock->ops->owner);
- newfd = get_unused_fd_flags(flags);
+ newfd = __get_unused_fd_flags(flags, nofile);
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
@@ -1807,7 +1808,8 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
f = fdget(fd);
if (f.file) {
ret = __sys_accept4_file(f.file, 0, upeer_sockaddr,
- upeer_addrlen, flags);
+ upeer_addrlen, flags,
+ rlimit(RLIMIT_NOFILE));
if (f.flags)
fput(f.file);
}
@@ -2226,10 +2228,10 @@ struct used_address {
unsigned int name_len;
};
-static int copy_msghdr_from_user(struct msghdr *kmsg,
- struct user_msghdr __user *umsg,
- struct sockaddr __user **save_addr,
- struct iovec **iov)
+int __copy_msghdr_from_user(struct msghdr *kmsg,
+ struct user_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec __user **uiov, size_t *nsegs)
{
struct user_msghdr msg;
ssize_t err;
@@ -2271,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
+ *uiov = msg.msg_iov;
+ *nsegs = msg.msg_iovlen;
+ return 0;
+}
+
+static int copy_msghdr_from_user(struct msghdr *kmsg,
+ struct user_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec **iov)
+{
+ struct user_msghdr msg;
+ ssize_t err;
+
+ err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
+ &msg.msg_iovlen);
+ if (err)
+ return err;
err = import_iovec(save_addr ? READ : WRITE,
msg.msg_iov, msg.msg_iovlen,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 095be887753e..125297c9aa3e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -288,8 +288,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct ib_reg_wr *reg_wr;
+ int i, n, dma_nents;
struct ib_mr *ibmr;
- int i, n;
u8 key;
if (nsegs > ia->ri_max_frwr_depth)
@@ -313,15 +313,16 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
break;
}
mr->mr_dir = rpcrdma_data_dir(writing);
+ mr->mr_nents = i;
- mr->mr_nents =
- ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
- if (!mr->mr_nents)
+ dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents,
+ mr->mr_dir);
+ if (!dma_nents)
goto out_dmamap_err;
ibmr = mr->frwr.fr_mr;
- n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
- if (unlikely(n != mr->mr_nents))
+ n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
+ if (n != dma_nents)
goto out_mapmr_err;
ibmr->iova &= 0x00000000ffffffff;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 3a1d428c1336..f25604d68337 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -29,7 +29,7 @@ struct switchdev_deferred_item {
struct list_head list;
struct net_device *dev;
switchdev_deferred_func_t *func;
- unsigned long data[0];
+ unsigned long data[];
};
static struct switchdev_deferred_item *switchdev_deferred_dequeue(void)
@@ -475,6 +475,9 @@ static int __switchdev_handle_port_obj_add(struct net_device *dev,
* necessary to go through this helper.
*/
netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ if (netif_is_bridge_master(lower_dev))
+ continue;
+
err = __switchdev_handle_port_obj_add(lower_dev, port_obj_info,
check_cb, add_cb);
if (err && err != -EOPNOTSUPP)
@@ -526,6 +529,9 @@ static int __switchdev_handle_port_obj_del(struct net_device *dev,
* necessary to go through this helper.
*/
netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ if (netif_is_bridge_master(lower_dev))
+ continue;
+
err = __switchdev_handle_port_obj_del(lower_dev, port_obj_info,
check_cb, del_cb);
if (err && err != -EOPNOTSUPP)
@@ -576,6 +582,9 @@ static int __switchdev_handle_port_attr_set(struct net_device *dev,
* necessary to go through this helper.
*/
netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ if (netif_is_bridge_master(lower_dev))
+ continue;
+
err = __switchdev_handle_port_attr_set(lower_dev, port_attr_info,
check_cb, set_cb);
if (err && err != -EOPNOTSUPP)
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 58708b4c7719..6dce2abf436e 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -322,9 +322,13 @@ static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head)
void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id)
{
struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
- struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *self;
struct tipc_peer *peer, *prev, *head;
+ if (!mon)
+ return;
+
+ self = get_self(net, bearer_id);
write_lock_bh(&mon->lock);
peer = get_peer(mon, addr);
if (!peer)
@@ -407,11 +411,15 @@ exit:
void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id)
{
struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
- struct tipc_peer *self = get_self(net, bearer_id);
+ struct tipc_peer *self;
struct tipc_peer *peer, *head;
struct tipc_mon_domain *dom;
int applied;
+ if (!mon)
+ return;
+
+ self = get_self(net, bearer_id);
write_lock_bh(&mon->lock);
peer = get_peer(mon, addr);
if (!peer) {
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 0d515d20b056..4d0e0bdd997b 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -736,9 +736,6 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
msg_set_destport(msg, dport);
*err = TIPC_OK;
- if (!skb_cloned(skb))
- return true;
-
return true;
}
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 6d466ebdb64f..871feadbbc19 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -394,6 +394,11 @@ static inline u32 msg_connected(struct tipc_msg *m)
return msg_type(m) == TIPC_CONN_MSG;
}
+static inline u32 msg_direct(struct tipc_msg *m)
+{
+ return msg_type(m) == TIPC_DIRECT_MSG;
+}
+
static inline u32 msg_errcode(struct tipc_msg *m)
{
return msg_bits(m, 1, 25, 0xf);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 7c35094c20b8..bb9862410e68 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -116,6 +116,7 @@ const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
[TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 },
[TIPC_NLA_PROP_TOL] = { .type = NLA_U32 },
[TIPC_NLA_PROP_WIN] = { .type = NLA_U32 },
+ [TIPC_NLA_PROP_MTU] = { .type = NLA_U32 },
[TIPC_NLA_PROP_BROADCAST] = { .type = NLA_U32 },
[TIPC_NLA_PROP_BROADCAST_RATIO] = { .type = NLA_U32 }
};
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 99b28b69fc17..10292c942384 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -278,7 +278,7 @@ struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos)
}
#endif
-void tipc_node_free(struct rcu_head *rp)
+static void tipc_node_free(struct rcu_head *rp)
{
struct tipc_node *n = container_of(rp, struct tipc_node, rcu);
@@ -1586,7 +1586,8 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list)
case TIPC_MEDIUM_IMPORTANCE:
case TIPC_HIGH_IMPORTANCE:
case TIPC_CRITICAL_IMPORTANCE:
- if (msg_connected(hdr) || msg_named(hdr)) {
+ if (msg_connected(hdr) || msg_named(hdr) ||
+ msg_direct(hdr)) {
tipc_loopback_trace(peer_net, list);
spin_lock_init(&list->lock);
tipc_sk_rcv(peer_net, list);
@@ -2798,7 +2799,7 @@ static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id)
return 0;
}
-int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
+static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1];
struct net *net = sock_net(skb->sk);
@@ -2875,7 +2876,8 @@ int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info)
return err;
}
-int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info)
+static int __tipc_nl_node_flush_key(struct sk_buff *skb,
+ struct genl_info *info)
{
struct net *net = sock_net(skb->sk);
struct tipc_net *tn = tipc_net(net);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index f9b4fb92c0b1..87466607097f 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1461,7 +1461,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
}
__skb_queue_head_init(&pkts);
- mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false);
+ mtu = tipc_node_get_mtu(net, dnode, tsk->portid, true);
rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
if (unlikely(rc != dlen))
return rc;
@@ -2441,6 +2441,8 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p)
return -ETIMEDOUT;
if (signal_pending(current))
return sock_intr_errno(*timeo_p);
+ if (sk->sk_state == TIPC_DISCONNECTING)
+ break;
add_wait_queue(sk_sleep(sk), &wait);
done = sk_wait_event(sk, timeo_p, tipc_sk_connected(sk),
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 1ba5a92832bb..a562ebaaa33c 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -366,7 +366,7 @@ static int tls_do_allocation(struct sock *sk,
if (!offload_ctx->open_record) {
if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
sk->sk_allocation))) {
- sk->sk_prot->enter_memory_pressure(sk);
+ READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
return -ENOMEM;
}
@@ -593,7 +593,7 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
u32 seq, u64 *p_record_sn)
{
u64 record_sn = context->hint_record_sn;
- struct tls_record_info *info;
+ struct tls_record_info *info, *last;
info = context->retransmit_hint;
if (!info ||
@@ -605,6 +605,24 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
struct tls_record_info, list);
if (!info)
return NULL;
+ /* send the start_marker record if seq number is before the
+ * tls offload start marker sequence number. This record is
+ * required to handle TCP packets which are before TLS offload
+ * started.
+ * And if it's not start marker, look if this seq number
+ * belongs to the list.
+ */
+ if (likely(!tls_record_is_start_marker(info))) {
+ /* we have the first record, get the last record to see
+ * if this seq number belongs to the list.
+ */
+ last = list_last_entry(&context->records_list,
+ struct tls_record_info, list);
+
+ if (!between(seq, tls_record_start_seq(info),
+ last->end_seq))
+ return NULL;
+ }
record_sn = context->unacked_record_sn;
}
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 94774c0e5ff3..156efce50dbd 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -63,13 +63,14 @@ static DEFINE_MUTEX(tcpv4_prot_mutex);
static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
static struct proto_ops tls_sw_proto_ops;
static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
- struct proto *base);
+ const struct proto *base);
void update_sk_prot(struct sock *sk, struct tls_context *ctx)
{
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
- sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf];
+ WRITE_ONCE(sk->sk_prot,
+ &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf]);
}
int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -312,7 +313,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
write_lock_bh(&sk->sk_callback_lock);
if (free_ctx)
rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
- sk->sk_prot = ctx->sk_proto;
+ WRITE_ONCE(sk->sk_prot, ctx->sk_proto);
if (sk->sk_write_space == tls_write_space)
sk->sk_write_space = ctx->sk_write_space;
write_unlock_bh(&sk->sk_callback_lock);
@@ -621,38 +622,39 @@ struct tls_context *tls_ctx_create(struct sock *sk)
mutex_init(&ctx->tx_lock);
rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
- ctx->sk_proto = sk->sk_prot;
+ ctx->sk_proto = READ_ONCE(sk->sk_prot);
return ctx;
}
static void tls_build_proto(struct sock *sk)
{
int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
+ const struct proto *prot = READ_ONCE(sk->sk_prot);
/* Build IPv6 TLS whenever the address of tcpv6 _prot changes */
if (ip_ver == TLSV6 &&
- unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
+ unlikely(prot != smp_load_acquire(&saved_tcpv6_prot))) {
mutex_lock(&tcpv6_prot_mutex);
- if (likely(sk->sk_prot != saved_tcpv6_prot)) {
- build_protos(tls_prots[TLSV6], sk->sk_prot);
- smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
+ if (likely(prot != saved_tcpv6_prot)) {
+ build_protos(tls_prots[TLSV6], prot);
+ smp_store_release(&saved_tcpv6_prot, prot);
}
mutex_unlock(&tcpv6_prot_mutex);
}
if (ip_ver == TLSV4 &&
- unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) {
+ unlikely(prot != smp_load_acquire(&saved_tcpv4_prot))) {
mutex_lock(&tcpv4_prot_mutex);
- if (likely(sk->sk_prot != saved_tcpv4_prot)) {
- build_protos(tls_prots[TLSV4], sk->sk_prot);
- smp_store_release(&saved_tcpv4_prot, sk->sk_prot);
+ if (likely(prot != saved_tcpv4_prot)) {
+ build_protos(tls_prots[TLSV4], prot);
+ smp_store_release(&saved_tcpv4_prot, prot);
}
mutex_unlock(&tcpv4_prot_mutex);
}
}
static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
- struct proto *base)
+ const struct proto *base)
{
prot[TLS_BASE][TLS_BASE] = *base;
prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt;
@@ -742,7 +744,8 @@ static void tls_update(struct sock *sk, struct proto *p,
ctx->sk_write_space = write_space;
ctx->sk_proto = p;
} else {
- sk->sk_prot = p;
+ /* Pairs with lockless read in sk_clone_lock(). */
+ WRITE_ONCE(sk->sk_prot, p);
sk->sk_write_space = write_space;
}
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 62c12cb5763e..3385a7a0b231 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -682,6 +682,7 @@ static int unix_set_peek_off(struct sock *sk, int val)
return 0;
}
+#ifdef CONFIG_PROC_FS
static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -689,9 +690,13 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
if (sk) {
u = unix_sk(sock->sk);
- seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds));
+ seq_printf(m, "scm_fds: %u\n",
+ atomic_read(&u->scm_stat.nr_fds));
}
}
+#else
+#define unix_show_fdinfo NULL
+#endif
static const struct proto_ops unix_stream_ops = {
.family = PF_UNIX,
@@ -1207,6 +1212,7 @@ out:
}
static long unix_wait_for_peer(struct sock *other, long timeo)
+ __releases(&unix_sk(other)->lock)
{
struct unix_sock *u = unix_sk(other);
int sched;
@@ -1597,10 +1603,8 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
- lockdep_assert_held(&sk->sk_receive_queue.lock);
-
if (unlikely(fp && fp->count))
- u->scm_stat.nr_fds += fp->count;
+ atomic_add(fp->count, &u->scm_stat.nr_fds);
}
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
@@ -1608,10 +1612,8 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
- lockdep_assert_held(&sk->sk_receive_queue.lock);
-
if (unlikely(fp && fp->count))
- u->scm_stat.nr_fds -= fp->count;
+ atomic_sub(fp->count, &u->scm_stat.nr_fds);
}
/*
@@ -1800,10 +1802,8 @@ restart_locked:
if (sock_flag(other, SOCK_RCVTSTAMP))
__net_timestamp(skb);
maybe_add_creds(skb, sock, other);
- spin_lock(&other->sk_receive_queue.lock);
scm_stat_add(other, skb);
- __skb_queue_tail(&other->sk_receive_queue, skb);
- spin_unlock(&other->sk_receive_queue.lock);
+ skb_queue_tail(&other->sk_receive_queue, skb);
unix_state_unlock(other);
other->sk_data_ready(other);
sock_put(other);
@@ -1905,10 +1905,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
goto pipe_err_free;
maybe_add_creds(skb, sock, other);
- spin_lock(&other->sk_receive_queue.lock);
scm_stat_add(other, skb);
- __skb_queue_tail(&other->sk_receive_queue, skb);
- spin_unlock(&other->sk_receive_queue.lock);
+ skb_queue_tail(&other->sk_receive_queue, skb);
unix_state_unlock(other);
other->sk_data_ready(other);
sent += size;
@@ -2108,9 +2106,12 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
skip = sk_peek_offset(sk, flags);
skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
- scm_stat_del, &skip, &err, &last);
- if (skb)
+ &skip, &err, &last);
+ if (skb) {
+ if (!(flags & MSG_PEEK))
+ scm_stat_del(sk, skb);
break;
+ }
mutex_unlock(&u->iolock);
@@ -2404,9 +2405,7 @@ unlock:
sk_peek_offset_bwd(sk, chunk);
if (UNIXCB(skb).fp) {
- spin_lock(&sk->sk_receive_queue.lock);
scm_stat_del(sk, skb);
- spin_unlock(&sk->sk_receive_queue.lock);
unix_detach_fds(&scm, skb);
}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 9c5b2a91baad..a5f28708e0e7 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -451,6 +451,12 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
if (vsk->transport == new_transport)
return 0;
+ /* transport->release() must be called with sock lock acquired.
+ * This path can only be taken during vsock_stream_connect(),
+ * where we have already held the sock lock.
+ * In the other cases, this function is called on a new socket
+ * which is not assigned to any transport.
+ */
vsk->transport->release(vsk);
vsock_deassign_transport(vsk);
}
@@ -753,20 +759,18 @@ static void __vsock_release(struct sock *sk, int level)
vsk = vsock_sk(sk);
pending = NULL; /* Compiler warning. */
- /* The release call is supposed to use lock_sock_nested()
- * rather than lock_sock(), if a sock lock should be acquired.
- */
- if (vsk->transport)
- vsk->transport->release(vsk);
- else if (sk->sk_type == SOCK_STREAM)
- vsock_remove_sock(vsk);
-
/* When "level" is SINGLE_DEPTH_NESTING, use the nested
* version to avoid the warning "possible recursive locking
* detected". When "level" is 0, lock_sock_nested(sk, level)
* is the same as lock_sock(sk).
*/
lock_sock_nested(sk, level);
+
+ if (vsk->transport)
+ vsk->transport->release(vsk);
+ else if (sk->sk_type == SOCK_STREAM)
+ vsock_remove_sock(vsk);
+
sock_orphan(sk);
sk->sk_shutdown = SHUTDOWN_MASK;
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 3492c021925f..630b851f8150 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -526,12 +526,9 @@ static bool hvs_close_lock_held(struct vsock_sock *vsk)
static void hvs_release(struct vsock_sock *vsk)
{
- struct sock *sk = sk_vsock(vsk);
bool remove_sock;
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
remove_sock = hvs_close_lock_held(vsk);
- release_sock(sk);
if (remove_sock)
vsock_remove_sock(vsk);
}
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index d9f0c9c5425a..709038a4783e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -829,7 +829,6 @@ void virtio_transport_release(struct vsock_sock *vsk)
struct sock *sk = &vsk->sk;
bool remove_sock = true;
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
if (sk->sk_type == SOCK_STREAM)
remove_sock = virtio_transport_close(vsk);
@@ -837,7 +836,6 @@ void virtio_transport_release(struct vsock_sock *vsk)
list_del(&pkt->list);
virtio_transport_free_pkt(pkt);
}
- release_sock(sk);
if (remove_sock)
vsock_remove_sock(vsk);
@@ -1153,6 +1151,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
virtio_transport_free_pkt(pkt);
break;
default:
+ (void)virtio_transport_reset_no_sock(t, pkt);
virtio_transport_free_pkt(pkt);
break;
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 3e25229a059d..341402b4f178 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -693,8 +693,14 @@ int wiphy_register(struct wiphy *wiphy)
~(BIT(NL80211_PREAMBLE_LEGACY) |
BIT(NL80211_PREAMBLE_HT) |
BIT(NL80211_PREAMBLE_VHT) |
+ BIT(NL80211_PREAMBLE_HE) |
BIT(NL80211_PREAMBLE_DMG))))
return -EINVAL;
+ if (WARN_ON((wiphy->pmsr_capa->ftm.trigger_based ||
+ wiphy->pmsr_capa->ftm.non_trigger_based) &&
+ !(wiphy->pmsr_capa->ftm.preambles &
+ BIT(NL80211_PREAMBLE_HE))))
+ return -EINVAL;
if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths &
~(BIT(NL80211_CHAN_WIDTH_20_NOHT) |
BIT(NL80211_CHAN_WIDTH_20) |
diff --git a/net/wireless/core.h b/net/wireless/core.h
index ed487e324571..bb897a803ffe 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -385,7 +385,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev,
struct net_device *dev);
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid,
u16 frame_type, const u8 *match_data,
- int match_len);
+ int match_len, struct netlink_ext_ack *extack);
void cfg80211_mlme_unreg_wk(struct work_struct *wk);
void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid);
void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev);
diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c
index a9c0f368db5d..24e18405cdb4 100644
--- a/net/wireless/ethtool.c
+++ b/net/wireless/ethtool.c
@@ -7,9 +7,13 @@
void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct device *pdev = wiphy_dev(wdev->wiphy);
- strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name,
- sizeof(info->driver));
+ if (pdev->driver)
+ strlcpy(info->driver, pdev->driver->name,
+ sizeof(info->driver));
+ else
+ strlcpy(info->driver, "N/A", sizeof(info->driver));
strlcpy(info->version, init_utsname()->release, sizeof(info->version));
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index f9462010575f..e4805a3bd310 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -4,6 +4,7 @@
*
* Copyright (c) 2009, Jouni Malinen <j@w1.fi>
* Copyright (c) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2019 Intel Corporation
*/
#include <linux/kernel.h>
@@ -470,7 +471,7 @@ void cfg80211_mlme_unreg_wk(struct work_struct *wk)
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
u16 frame_type, const u8 *match_data,
- int match_len)
+ int match_len, struct netlink_ext_ack *extack)
{
struct wiphy *wiphy = wdev->wiphy;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
@@ -481,15 +482,38 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
if (!wdev->wiphy->mgmt_stypes)
return -EOPNOTSUPP;
- if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT)
+ if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) {
+ NL_SET_ERR_MSG(extack, "frame type not management");
return -EINVAL;
+ }
- if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE))
+ if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) {
+ NL_SET_ERR_MSG(extack, "Invalid frame type");
return -EINVAL;
+ }
mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4;
- if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type)))
+ if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) {
+ NL_SET_ERR_MSG(extack,
+ "Registration to specific type not supported");
+ return -EINVAL;
+ }
+
+ /*
+ * To support Pre Association Security Negotiation (PASN), registration
+ * for authentication frames should be supported. However, as some
+ * versions of the user space daemons wrongly register to all types of
+ * authentication frames (which might result in unexpected behavior)
+ * allow such registration if the request is for a specific
+ * authentication algorithm number.
+ */
+ if (wdev->iftype == NL80211_IFTYPE_STATION &&
+ (frame_type & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_AUTH &&
+ !(match_data && match_len >= 2)) {
+ NL_SET_ERR_MSG(extack,
+ "Authentication algorithm number required");
return -EINVAL;
+ }
nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL);
if (!nreg)
@@ -504,6 +528,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
continue;
if (memcmp(reg->match, match_data, mlen) == 0) {
+ NL_SET_ERR_MSG(extack, "Match already configured");
err = -EALREADY;
break;
}
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 123b8d720a59..5fa402144cda 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5,7 +5,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2019 Intel Corporation
+ * Copyright (C) 2018-2020 Intel Corporation
*/
#include <linux/if.h>
@@ -20,6 +20,7 @@
#include <linux/netlink.h>
#include <linux/nospec.h>
#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
#include <net/net_namespace.h>
#include <net/genetlink.h>
#include <net/cfg80211.h>
@@ -275,6 +276,8 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = {
[NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES] = { .type = NLA_U8 },
[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI] = { .type = NLA_FLAG },
[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG },
+ [NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG },
+ [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG },
};
static const struct nla_policy
@@ -321,6 +324,29 @@ he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = {
NLA_POLICY_RANGE(NLA_U8, 1, 20),
};
+static const struct nla_policy
+he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = {
+ [NL80211_HE_BSS_COLOR_ATTR_COLOR] = NLA_POLICY_RANGE(NLA_U8, 1, 63),
+ [NL80211_HE_BSS_COLOR_ATTR_DISABLED] = { .type = NLA_FLAG },
+ [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG },
+};
+
+static const struct nla_policy
+nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = {
+ [NL80211_TID_CONFIG_ATTR_VIF_SUPP] = { .type = NLA_U64 },
+ [NL80211_TID_CONFIG_ATTR_PEER_SUPP] = { .type = NLA_U64 },
+ [NL80211_TID_CONFIG_ATTR_OVERRIDE] = { .type = NLA_FLAG },
+ [NL80211_TID_CONFIG_ATTR_TIDS] = NLA_POLICY_RANGE(NLA_U16, 1, 0xff),
+ [NL80211_TID_CONFIG_ATTR_NOACK] =
+ NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE),
+ [NL80211_TID_CONFIG_ATTR_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1),
+ [NL80211_TID_CONFIG_ATTR_RETRY_LONG] = NLA_POLICY_MIN(NLA_U8, 1),
+ [NL80211_TID_CONFIG_ATTR_AMPDU_CTRL] =
+ NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE),
+ [NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL] =
+ NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE),
+};
+
const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD },
[NL80211_ATTR_WIPHY] = { .type = NLA_U32 },
@@ -361,7 +387,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_KEY] = { .type = NLA_NESTED, },
[NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY,
.len = WLAN_MAX_KEY_LEN },
- [NL80211_ATTR_KEY_IDX] = NLA_POLICY_MAX(NLA_U8, 5),
+ [NL80211_ATTR_KEY_IDX] = NLA_POLICY_MAX(NLA_U8, 7),
[NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 },
[NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG },
[NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 16 },
@@ -437,6 +463,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
[NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG },
[NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
+ [NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 },
[NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
[NL80211_ATTR_PID] = { .type = NLA_U32 },
@@ -468,6 +495,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_WOWLAN_TRIGGERS] = { .type = NLA_NESTED },
[NL80211_ATTR_STA_PLINK_STATE] =
NLA_POLICY_MAX(NLA_U8, NUM_NL80211_PLINK_STATES - 1),
+ [NL80211_ATTR_MEASUREMENT_DURATION] = { .type = NLA_U16 },
+ [NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY] = { .type = NLA_FLAG },
[NL80211_ATTR_MESH_PEER_AID] =
NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
[NL80211_ATTR_SCHED_SCAN_INTERVAL] = { .type = NLA_U32 },
@@ -529,6 +558,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_MDID] = { .type = NLA_U16 },
[NL80211_ATTR_IE_RIC] = { .type = NLA_BINARY,
.len = IEEE80211_MAX_DATA_LEN },
+ [NL80211_ATTR_CRIT_PROT_ID] = { .type = NLA_U16 },
+ [NL80211_ATTR_MAX_CRIT_PROT_DURATION] = { .type = NLA_U16 },
[NL80211_ATTR_PEER_AID] =
NLA_POLICY_RANGE(NLA_U16, 1, IEEE80211_MAX_AID),
[NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 },
@@ -559,6 +590,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
NLA_POLICY_MAX(NLA_U8, IEEE80211_NUM_UPS - 1),
[NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 },
[NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 },
+ [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 },
[NL80211_ATTR_MAC_MASK] = {
.type = NLA_EXACT_LEN_WARN,
.len = ETH_ALEN
@@ -625,6 +657,12 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_TWT_RESPONDER] = { .type = NLA_FLAG },
[NL80211_ATTR_HE_OBSS_PD] = NLA_POLICY_NESTED(he_obss_pd_policy),
[NL80211_ATTR_VLAN_ID] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2),
+ [NL80211_ATTR_HE_BSS_COLOR] = NLA_POLICY_NESTED(he_bss_color_policy),
+ [NL80211_ATTR_TID_CONFIG] =
+ NLA_POLICY_NESTED_ARRAY(nl80211_tid_config_attr_policy),
+ [NL80211_ATTR_CONTROL_PORT_NO_PREAUTH] = { .type = NLA_FLAG },
+ [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100),
};
/* policy for the key attributes */
@@ -964,6 +1002,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
if ((chan->flags & IEEE80211_CHAN_NO_10MHZ) &&
nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_10MHZ))
goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_HE) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE))
+ goto nla_put_failure;
}
if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
@@ -1025,7 +1066,7 @@ struct key_parse {
struct key_params p;
int idx;
int type;
- bool def, defmgmt;
+ bool def, defmgmt, defbeacon;
bool def_uni, def_multi;
};
@@ -1041,12 +1082,13 @@ static int nl80211_parse_key_new(struct genl_info *info, struct nlattr *key,
k->def = !!tb[NL80211_KEY_DEFAULT];
k->defmgmt = !!tb[NL80211_KEY_DEFAULT_MGMT];
+ k->defbeacon = !!tb[NL80211_KEY_DEFAULT_BEACON];
if (k->def) {
k->def_uni = true;
k->def_multi = true;
}
- if (k->defmgmt)
+ if (k->defmgmt || k->defbeacon)
k->def_multi = true;
if (tb[NL80211_KEY_IDX])
@@ -1153,14 +1195,17 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
if (err)
return err;
- if (k->def && k->defmgmt) {
- GENL_SET_ERR_MSG(info, "key with def && defmgmt is invalid");
+ if ((k->def ? 1 : 0) + (k->defmgmt ? 1 : 0) +
+ (k->defbeacon ? 1 : 0) > 1) {
+ GENL_SET_ERR_MSG(info,
+ "key with multiple default flags is invalid");
return -EINVAL;
}
- if (k->defmgmt) {
+ if (k->defmgmt || k->defbeacon) {
if (k->def_uni || !k->def_multi) {
- GENL_SET_ERR_MSG(info, "defmgmt key must be mcast");
+ GENL_SET_ERR_MSG(info,
+ "defmgmt/defbeacon key must be mcast");
return -EINVAL;
}
}
@@ -1172,14 +1217,20 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k)
"defmgmt key idx not 4 or 5");
return -EINVAL;
}
+ } else if (k->defbeacon) {
+ if (k->idx < 6 || k->idx > 7) {
+ GENL_SET_ERR_MSG(info,
+ "defbeacon key idx not 6 or 7");
+ return -EINVAL;
+ }
} else if (k->def) {
if (k->idx < 0 || k->idx > 3) {
GENL_SET_ERR_MSG(info, "def key idx not 0-3");
return -EINVAL;
}
} else {
- if (k->idx < 0 || k->idx > 5) {
- GENL_SET_ERR_MSG(info, "key idx not 0-5");
+ if (k->idx < 0 || k->idx > 7) {
+ GENL_SET_ERR_MSG(info, "key idx not 0-7");
return -EINVAL;
}
}
@@ -1838,6 +1889,12 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap,
nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST,
cap->ftm.max_ftms_per_burst))
return -ENOBUFS;
+ if (cap->ftm.trigger_based &&
+ nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED))
+ return -ENOBUFS;
+ if (cap->ftm.non_trigger_based &&
+ nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED))
+ return -ENOBUFS;
nla_nest_end(msg, ftm);
return 0;
@@ -1885,6 +1942,88 @@ static int nl80211_send_pmsr_capa(struct cfg80211_registered_device *rdev,
return 0;
}
+/*
+ * Add the NL80211_ATTR_IFTYPE_AKM_SUITES nest to @msg: one sub-nest per
+ * entry of rdev->wiphy.iftype_akm_suites (numbered from 1), each holding
+ * the interface type mask and the array of u32 AKM suites.
+ *
+ * Returns 0 on success or when the wiphy has no such entries, -ENOBUFS
+ * when adding an attribute fails.
+ * NOTE(review): the error paths leave the opened nests unterminated;
+ * presumably the caller discards or trims the message - confirm.
+ */
+static int
+nl80211_put_iftype_akm_suites(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ int i;
+ struct nlattr *nested, *nested_akms;
+ const struct wiphy_iftype_akm_suites *iftype_akms;
+
+ /* nothing to advertise: not an error, just emit nothing */
+ if (!rdev->wiphy.num_iftype_akm_suites ||
+ !rdev->wiphy.iftype_akm_suites)
+ return 0;
+
+ nested = nla_nest_start(msg, NL80211_ATTR_IFTYPE_AKM_SUITES);
+ if (!nested)
+ return -ENOBUFS;
+
+ for (i = 0; i < rdev->wiphy.num_iftype_akm_suites; i++) {
+ /* the sub-nest type is the 1-based index of the entry */
+ nested_akms = nla_nest_start(msg, i + 1);
+ if (!nested_akms)
+ return -ENOBUFS;
+
+ iftype_akms = &rdev->wiphy.iftype_akm_suites[i];
+
+ if (nl80211_put_iftypes(msg, NL80211_IFTYPE_AKM_ATTR_IFTYPES,
+ iftype_akms->iftypes_mask))
+ return -ENOBUFS;
+
+ /* the suites are copied as one raw array of u32 values */
+ if (nla_put(msg, NL80211_IFTYPE_AKM_ATTR_SUITES,
+ sizeof(u32) * iftype_akms->n_akm_suites,
+ iftype_akms->akm_suites)) {
+ return -ENOBUFS;
+ }
+ nla_nest_end(msg, nested_akms);
+ }
+
+ nla_nest_end(msg, nested);
+
+ return 0;
+}
+
+/*
+ * Advertise the TID configuration capabilities of the wiphy by adding an
+ * NL80211_ATTR_TID_CONFIG nest to @msg. The per-vif and per-peer support
+ * masks are only included when non-zero; tid_config_support.max_retry is
+ * emitted as both the RETRY_SHORT and the RETRY_LONG value.
+ *
+ * Returns 0 on success or when neither mask is set, -ENOBUFS when the
+ * attributes cannot be added (a partially built nest is cancelled).
+ */
+static int
+nl80211_put_tid_config_support(struct cfg80211_registered_device *rdev,
+ struct sk_buff *msg)
+{
+ struct nlattr *supp;
+
+ if (!rdev->wiphy.tid_config_support.vif &&
+ !rdev->wiphy.tid_config_support.peer)
+ return 0;
+
+ supp = nla_nest_start(msg, NL80211_ATTR_TID_CONFIG);
+ /* same error code as the fail path below and the other put helpers */
+ if (!supp)
+ return -ENOBUFS;
+
+ if (rdev->wiphy.tid_config_support.vif &&
+ nla_put_u64_64bit(msg, NL80211_TID_CONFIG_ATTR_VIF_SUPP,
+ rdev->wiphy.tid_config_support.vif,
+ NL80211_TID_CONFIG_ATTR_PAD))
+ goto fail;
+
+ if (rdev->wiphy.tid_config_support.peer &&
+ nla_put_u64_64bit(msg, NL80211_TID_CONFIG_ATTR_PEER_SUPP,
+ rdev->wiphy.tid_config_support.peer,
+ NL80211_TID_CONFIG_ATTR_PAD))
+ goto fail;
+
+ /* for now we just use the same value ... makes more sense */
+ if (nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_RETRY_SHORT,
+ rdev->wiphy.tid_config_support.max_retry))
+ goto fail;
+ if (nla_put_u8(msg, NL80211_TID_CONFIG_ATTR_RETRY_LONG,
+ rdev->wiphy.tid_config_support.max_retry))
+ goto fail;
+
+ nla_nest_end(msg, supp);
+
+ return 0;
+fail:
+ nla_nest_cancel(msg, supp);
+ return -ENOBUFS;
+}
+
struct nl80211_dump_wiphy_state {
s64 filter_wiphy;
long start;
@@ -2443,6 +2582,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
rdev->wiphy.akm_suites))
goto nla_put_failure;
+ if (nl80211_put_iftype_akm_suites(rdev, msg))
+ goto nla_put_failure;
+
+ if (nl80211_put_tid_config_support(rdev, msg))
+ goto nla_put_failure;
+
/* done */
state->split_start = 0;
break;
@@ -3474,7 +3619,7 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev,
enum nl80211_iftype iftype)
{
if (!use_4addr) {
- if (netdev && (netdev->priv_flags & IFF_BRIDGE_PORT))
+ if (netdev && netif_is_bridge_port(netdev))
return -EBUSY;
return 0;
}
@@ -3762,8 +3907,14 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info)
void *hdr;
struct sk_buff *msg;
- if (info->attrs[NL80211_ATTR_KEY_IDX])
+ if (info->attrs[NL80211_ATTR_KEY_IDX]) {
key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]);
+ if (key_idx > 5 &&
+ !wiphy_ext_feature_isset(
+ &rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_PROTECTION))
+ return -EINVAL;
+ }
if (info->attrs[NL80211_ATTR_MAC])
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
@@ -3839,7 +3990,7 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
/* Only support setting default key and
* Extended Key ID action NL80211_KEY_SET_TX.
*/
- if (!key.def && !key.defmgmt &&
+ if (!key.def && !key.defmgmt && !key.defbeacon &&
!(key.p.mode == NL80211_KEY_SET_TX))
return -EINVAL;
@@ -3886,6 +4037,24 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info)
#ifdef CONFIG_CFG80211_WEXT
dev->ieee80211_ptr->wext.default_mgmt_key = key.idx;
#endif
+ } else if (key.defbeacon) {
+ if (key.def_uni || !key.def_multi) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!rdev->ops->set_default_beacon_key) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = nl80211_key_allowed(dev->ieee80211_ptr);
+ if (err)
+ goto out;
+
+ err = rdev_set_default_beacon_key(rdev, dev, key.idx);
+ if (err)
+ goto out;
} else if (key.p.mode == NL80211_KEY_SET_TX &&
wiphy_ext_feature_isset(&rdev->wiphy,
NL80211_EXT_FEATURE_EXT_KEY_ID)) {
@@ -3923,8 +4092,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
if (err)
return err;
- if (!key.p.key)
+ if (!key.p.key) {
+ GENL_SET_ERR_MSG(info, "no key");
return -EINVAL;
+ }
if (info->attrs[NL80211_ATTR_MAC])
mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]);
@@ -3938,8 +4109,10 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
/* for now */
if (key.type != NL80211_KEYTYPE_PAIRWISE &&
- key.type != NL80211_KEYTYPE_GROUP)
+ key.type != NL80211_KEYTYPE_GROUP) {
+ GENL_SET_ERR_MSG(info, "key type not pairwise or group");
return -EINVAL;
+ }
if (key.type == NL80211_KEYTYPE_GROUP &&
info->attrs[NL80211_ATTR_VLAN_ID])
@@ -3950,15 +4123,22 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info)
if (cfg80211_validate_key_settings(rdev, &key.p, key.idx,
key.type == NL80211_KEYTYPE_PAIRWISE,
- mac_addr))
+ mac_addr)) {
+ GENL_SET_ERR_MSG(info, "key setting validation failed");
return -EINVAL;
+ }
wdev_lock(dev->ieee80211_ptr);
err = nl80211_key_allowed(dev->ieee80211_ptr);
- if (!err)
+ if (err)
+ GENL_SET_ERR_MSG(info, "key not allowed");
+ if (!err) {
err = rdev_add_key(rdev, dev, key.idx,
key.type == NL80211_KEYTYPE_PAIRWISE,
mac_addr, &key.p);
+ if (err)
+ GENL_SET_ERR_MSG(info, "key addition failed");
+ }
wdev_unlock(dev->ieee80211_ptr);
return err;
@@ -4511,6 +4691,30 @@ static int nl80211_parse_he_obss_pd(struct nlattr *attrs,
return 0;
}
+/*
+ * Parse the nested NL80211_ATTR_HE_BSS_COLOR attribute @attrs into
+ * @he_bss_color. The color attribute is mandatory; the "disabled" and
+ * "partial" flags are optional and read as false when absent.
+ *
+ * Returns 0 on success, -EINVAL if the color is missing, or the error
+ * from nla_parse_nested().
+ */
+static int nl80211_parse_he_bss_color(struct nlattr *attrs,
+ struct cfg80211_he_bss_color *he_bss_color)
+{
+ struct nlattr *tb[NL80211_HE_BSS_COLOR_ATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NL80211_HE_BSS_COLOR_ATTR_MAX, attrs,
+ he_bss_color_policy, NULL);
+ if (err)
+ return err;
+
+ if (!tb[NL80211_HE_BSS_COLOR_ATTR_COLOR])
+ return -EINVAL;
+
+ he_bss_color->color =
+ nla_get_u8(tb[NL80211_HE_BSS_COLOR_ATTR_COLOR]);
+ he_bss_color->disabled =
+ nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_DISABLED]);
+ he_bss_color->partial =
+ nla_get_flag(tb[NL80211_HE_BSS_COLOR_ATTR_PARTIAL]);
+
+ return 0;
+}
+
static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params,
const u8 *rates)
{
@@ -4555,6 +4759,9 @@ static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len);
if (cap && cap[1] >= sizeof(*params->he_cap) + 1)
params->he_cap = (void *)(cap + 3);
+ cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ies, ies_len);
+ if (cap && cap[1] >= sizeof(*params->he_oper) + 1)
+ params->he_oper = (void *)(cap + 3);
}
static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev,
@@ -4799,6 +5006,13 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
err = nl80211_parse_he_obss_pd(
info->attrs[NL80211_ATTR_HE_OBSS_PD],
&params.he_obss_pd);
+ goto out;
+ }
+
+ if (info->attrs[NL80211_ATTR_HE_BSS_COLOR]) {
+ err = nl80211_parse_he_bss_color(
+ info->attrs[NL80211_ATTR_HE_BSS_COLOR],
+ &params.he_bss_color);
if (err)
return err;
}
@@ -4822,6 +5036,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
}
wdev_unlock(wdev);
+out:
kfree(params.acl);
return err;
@@ -6070,11 +6285,22 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_MAC])
params.mac = nla_data(info->attrs[NL80211_ATTR_MAC]);
- if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
- dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN &&
- dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT &&
- dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ switch (dev->ieee80211_ptr->iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_P2P_GO:
+ /* always accept these */
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ /* conditionally accept */
+ if (wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_DEL_IBSS_STA))
+ break;
return -EINVAL;
+ default:
+ return -EINVAL;
+ }
if (!rdev->ops->del_station)
return -EOPNOTSUPP;
@@ -9105,6 +9331,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
return r;
settings->control_port_over_nl80211 = true;
+
+ if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_PREAUTH])
+ settings->control_port_no_preauth = true;
}
if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) {
@@ -10293,6 +10522,15 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info)
pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]);
}
+ if (info->attrs[NL80211_ATTR_PMK_LIFETIME])
+ pmksa.pmk_lifetime =
+ nla_get_u32(info->attrs[NL80211_ATTR_PMK_LIFETIME]);
+
+ if (info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD])
+ pmksa.pmk_reauth_threshold =
+ nla_get_u8(
+ info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]);
+
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT &&
!(dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP &&
@@ -10538,8 +10776,9 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
return -EOPNOTSUPP;
return cfg80211_mlme_register_mgmt(wdev, info->snd_portid, frame_type,
- nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]),
- nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]));
+ nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]),
+ nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]),
+ info->extack);
}
static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
@@ -13801,6 +14040,141 @@ static int nl80211_probe_mesh_link(struct sk_buff *skb, struct genl_info *info)
return rdev_probe_mesh_link(rdev, dev, dest, buf, len);
}
+/*
+ * Fill @tid_conf from one parsed NL80211_ATTR_TID_CONFIG entry (@attrs).
+ *
+ * NL80211_TID_CONFIG_ATTR_TIDS is mandatory. With the OVERRIDE flag set
+ * the configuration of these TIDs is first reset through the driver;
+ * when a @peer is given nothing else is processed after the reset.
+ * Every further attribute that is present sets the matching bit in
+ * tid_conf->mask, and that mask must be covered by the per-peer or
+ * per-vif support mask of the wiphy.
+ *
+ * Returns 0 on success, -EINVAL for a missing TIDS attribute, a missing
+ * reset_tid_config driver op or an out-of-range retry count, the error
+ * from the reset, or -EOPNOTSUPP for an unsupported configuration.
+ */
+static int parse_tid_conf(struct cfg80211_registered_device *rdev,
+ struct nlattr *attrs[], struct net_device *dev,
+ struct cfg80211_tid_cfg *tid_conf,
+ struct genl_info *info, const u8 *peer)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ u64 mask;
+ int err;
+
+ if (!attrs[NL80211_TID_CONFIG_ATTR_TIDS])
+ return -EINVAL;
+
+ tid_conf->config_override =
+ nla_get_flag(attrs[NL80211_TID_CONFIG_ATTR_OVERRIDE]);
+ tid_conf->tids = nla_get_u16(attrs[NL80211_TID_CONFIG_ATTR_TIDS]);
+
+ if (tid_conf->config_override) {
+ if (rdev->ops->reset_tid_config) {
+ err = rdev_reset_tid_config(rdev, dev, peer,
+ tid_conf->tids);
+ /* If peer is there no other configuration will be
+ * allowed
+ */
+ if (err || peer)
+ return err;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ if (attrs[NL80211_TID_CONFIG_ATTR_NOACK]) {
+ tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_NOACK);
+ tid_conf->noack =
+ nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_NOACK]);
+ }
+
+ /* NOTE(review): the retry limits below are checked against
+ * wiphy.max_data_retry_count while nl80211_put_tid_config_support()
+ * emits tid_config_support.max_retry - confirm both are kept in sync.
+ */
+ if (attrs[NL80211_TID_CONFIG_ATTR_RETRY_SHORT]) {
+ tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RETRY_SHORT);
+ tid_conf->retry_short =
+ nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RETRY_SHORT]);
+
+ if (tid_conf->retry_short > rdev->wiphy.max_data_retry_count)
+ return -EINVAL;
+ }
+
+ if (attrs[NL80211_TID_CONFIG_ATTR_RETRY_LONG]) {
+ tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RETRY_LONG);
+ tid_conf->retry_long =
+ nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RETRY_LONG]);
+
+ if (tid_conf->retry_long > rdev->wiphy.max_data_retry_count)
+ return -EINVAL;
+ }
+
+ if (attrs[NL80211_TID_CONFIG_ATTR_AMPDU_CTRL]) {
+ tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMPDU_CTRL);
+ tid_conf->ampdu =
+ nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMPDU_CTRL]);
+ }
+
+ if (attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]) {
+ tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL);
+ tid_conf->rtscts =
+ nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]);
+ }
+
+ /* the requested settings must be a subset of what is supported */
+ if (peer)
+ mask = rdev->wiphy.tid_config_support.peer;
+ else
+ mask = rdev->wiphy.tid_config_support.vif;
+
+ if (tid_conf->mask & ~mask) {
+ NL_SET_ERR_MSG(extack, "unsupported TID configuration");
+ /* ENOTSUPP is kernel-internal and has no userspace errno */
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+/*
+ * Handler for NL80211_CMD_SET_TID_CONFIG: count the entries nested in
+ * NL80211_ATTR_TID_CONFIG, allocate a cfg80211_tid_config with room for
+ * all of them, parse each entry with parse_tid_conf() and pass the
+ * result to the driver. NL80211_ATTR_MAC, if present, selects the peer.
+ *
+ * Returns -EINVAL without the attribute, -EOPNOTSUPP without the
+ * set_tid_config driver op, -ENOMEM on allocation failure, otherwise
+ * the parse error or the driver's return value.
+ */
+static int nl80211_set_tid_config(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct nlattr *attrs[NL80211_TID_CONFIG_ATTR_MAX + 1];
+ struct net_device *dev = info->user_ptr[1];
+ struct cfg80211_tid_config *tid_config;
+ struct nlattr *tid;
+ int conf_idx = 0, rem_conf;
+ int ret = -EINVAL;
+ u32 num_conf = 0;
+
+ if (!info->attrs[NL80211_ATTR_TID_CONFIG])
+ return -EINVAL;
+
+ if (!rdev->ops->set_tid_config)
+ return -EOPNOTSUPP;
+
+ /* first pass: only count the entries to size the allocation */
+ nla_for_each_nested(tid, info->attrs[NL80211_ATTR_TID_CONFIG],
+ rem_conf)
+ num_conf++;
+
+ tid_config = kzalloc(struct_size(tid_config, tid_conf, num_conf),
+ GFP_KERNEL);
+ if (!tid_config)
+ return -ENOMEM;
+
+ tid_config->n_tid_conf = num_conf;
+
+ if (info->attrs[NL80211_ATTR_MAC])
+ tid_config->peer = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+ /* second pass: parse every entry into its slot of the array */
+ nla_for_each_nested(tid, info->attrs[NL80211_ATTR_TID_CONFIG],
+ rem_conf) {
+ ret = nla_parse_nested(attrs, NL80211_TID_CONFIG_ATTR_MAX,
+ tid, NULL, NULL);
+
+ if (ret)
+ goto bad_tid_conf;
+
+ ret = parse_tid_conf(rdev, attrs, dev,
+ &tid_config->tid_conf[conf_idx],
+ info, tid_config->peer);
+ if (ret)
+ goto bad_tid_conf;
+
+ conf_idx++;
+ }
+
+ ret = rdev_set_tid_config(rdev, dev, tid_config);
+
+ /* reached on success as well: the config is freed on every path */
+bad_tid_conf:
+ kfree(tid_config);
+ return ret;
+}
+
#define NL80211_FLAG_NEED_WIPHY 0x01
#define NL80211_FLAG_NEED_NETDEV 0x02
#define NL80211_FLAG_NEED_RTNL 0x04
@@ -14755,6 +15129,13 @@ static const struct genl_ops nl80211_ops[] = {
.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
NL80211_FLAG_NEED_RTNL,
},
+ {
+ .cmd = NL80211_CMD_SET_TID_CONFIG,
+ .doit = nl80211_set_tid_config,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_NETDEV |
+ NL80211_FLAG_NEED_RTNL,
+ },
};
static struct genl_family nl80211_fam __ro_after_init = {
@@ -16409,7 +16790,7 @@ void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac,
goto nla_put_failure;
if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) &&
- nla_put_u8(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw))
+ nla_put_u32(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw))
goto nla_put_failure;
if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) &&
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index c09fbf09549d..63dc8023447f 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -126,6 +126,38 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
"FTM: civic location request not supported");
}
+ out->ftm.trigger_based =
+ !!tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED];
+ if (out->ftm.trigger_based && !capa->ftm.trigger_based) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED],
+ "FTM: trigger based ranging is not supported");
+ return -EINVAL;
+ }
+
+ out->ftm.non_trigger_based =
+ !!tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED];
+ if (out->ftm.non_trigger_based && !capa->ftm.non_trigger_based) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED],
+ "FTM: trigger based ranging is not supported");
+ return -EINVAL;
+ }
+
+ if (out->ftm.trigger_based && out->ftm.non_trigger_based) {
+ NL_SET_ERR_MSG(info->extack,
+ "FTM: can't set both trigger based and non trigger based");
+ return -EINVAL;
+ }
+
+ if ((out->ftm.trigger_based || out->ftm.non_trigger_based) &&
+ out->ftm.preamble != NL80211_PREAMBLE_HE) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE],
+ "FTM: non EDCA based ranging must use HE preamble");
+ return -EINVAL;
+ }
+
return 0;
}
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index e0d34f796d0b..99462f0c4e08 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -136,6 +136,19 @@ rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev,
return ret;
}
+/*
+ * Call the driver's set_default_beacon_key op for @netdev with
+ * @key_index, tracing both the call and its return value. The op
+ * pointer is dereferenced without a check, so callers have to verify
+ * that the driver provides it.
+ */
+static inline int
+rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev,
+ struct net_device *netdev, u8 key_index)
+{
+ int ret;
+
+ trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, key_index);
+ ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev,
+ key_index);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
static inline int rdev_start_ap(struct cfg80211_registered_device *rdev,
struct net_device *dev,
struct cfg80211_ap_settings *settings)
@@ -1313,4 +1326,28 @@ rdev_probe_mesh_link(struct cfg80211_registered_device *rdev,
return ret;
}
+/*
+ * Call the driver's set_tid_config op for @dev with @tid_conf, tracing
+ * both the call and its return value. The op pointer is dereferenced
+ * without a check, so callers have to verify that the driver provides it.
+ */
+static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_tid_config *tid_conf)
+{
+ int ret;
+
+ trace_rdev_set_tid_config(&rdev->wiphy, dev, tid_conf);
+ ret = rdev->ops->set_tid_config(&rdev->wiphy, dev, tid_conf);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+/*
+ * Call the driver's reset_tid_config op for @dev, passing @peer and
+ * @tids through unchanged, and trace both the call and its return
+ * value. The op pointer is dereferenced without a check, so callers
+ * have to verify that the driver provides it.
+ */
+static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, const u8 *peer,
+ u8 tids)
+{
+ int ret;
+
+ trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tids);
+ ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tids);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
#endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index fff9a74891fc..d476d4da0d09 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1569,6 +1569,8 @@ static u32 map_regdom_flags(u32 rd_flags)
channel_flags |= IEEE80211_CHAN_NO_80MHZ;
if (rd_flags & NL80211_RRF_NO_160MHZ)
channel_flags |= IEEE80211_CHAN_NO_160MHZ;
+ if (rd_flags & NL80211_RRF_NO_HE)
+ channel_flags |= IEEE80211_CHAN_NO_HE;
return channel_flags;
}
@@ -2276,7 +2278,7 @@ static void handle_channel_custom(struct wiphy *wiphy,
break;
}
- if (IS_ERR(reg_rule)) {
+ if (IS_ERR_OR_NULL(reg_rule)) {
pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n",
chan->center_freq);
if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index aef240fdf8df..4000382aef48 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -556,9 +556,8 @@ cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid)
{
struct cfg80211_sched_scan_request *pos;
- WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
-
- list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list) {
+ list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list,
+ lockdep_rtnl_is_held()) {
if (pos->reqid == reqid)
return pos;
}
@@ -1434,8 +1433,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
}
rcu_assign_pointer(tmp.pub.ies, ies);
- signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
- wiphy->max_adj_channel_rssi_comp;
+ signal_valid = data->chan == channel;
res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, ts);
if (!res)
return NULL;
@@ -1852,8 +1850,7 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS);
ether_addr_copy(tmp.parent_bssid, data->parent_bssid);
- signal_valid = abs(data->chan->center_freq - channel->center_freq) <=
- wiphy->max_adj_channel_rssi_comp;
+ signal_valid = data->chan == channel;
res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid,
jiffies);
if (!res)
@@ -2022,7 +2019,11 @@ void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
spin_lock_bh(&rdev->bss_lock);
- if (WARN_ON(cbss->pub.channel == chan))
+ /*
+ * Some APs use CSA also for bandwidth changes, i.e., without actually
+ * changing the control channel, so no need to update in such a case.
+ */
+ if (cbss->pub.channel == chan)
goto done;
/* use transmitting bss */
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index d32a2ec4d96a..ac3e60aa1fc8 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -1111,9 +1111,16 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
* Delete all the keys ... pairwise keys can't really
* exist any more anyway, but default keys might.
*/
- if (rdev->ops->del_key)
- for (i = 0; i < 6; i++)
+ if (rdev->ops->del_key) {
+ int max_key_idx = 5;
+
+ if (wiphy_ext_feature_isset(
+ wdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_PROTECTION))
+ max_key_idx = 7;
+ for (i = 0; i <= max_key_idx; i++)
rdev_del_key(rdev, dev, i, false, NULL);
+ }
rdev_set_qos_map(rdev, dev, NULL);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 3ef1679b0e66..839df54cee21 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -510,6 +510,23 @@ TRACE_EVENT(rdev_set_default_mgmt_key,
WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index)
);
+/*
+ * Trace event for the set_default_beacon_key driver call: records the
+ * wiphy, the netdev and the key index that is passed in.
+ */
+TRACE_EVENT(rdev_set_default_beacon_key,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index),
+ TP_ARGS(wiphy, netdev, key_index),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u8, key_index)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->key_index = key_index;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index)
+);
+
TRACE_EVENT(rdev_start_ap,
TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
struct cfg80211_ap_settings *settings),
@@ -3463,6 +3480,43 @@ TRACE_EVENT(rdev_probe_mesh_link,
WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest))
);
+/*
+ * Trace event for the set_tid_config driver call: records the wiphy,
+ * the netdev and the peer address taken from tid_conf->peer. The
+ * individual TID entries are not recorded.
+ */
+TRACE_EVENT(rdev_set_tid_config,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_tid_config *tid_conf),
+ TP_ARGS(wiphy, netdev, tid_conf),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, tid_conf->peer);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT,
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer))
+);
+
+/*
+ * Trace event for the reset_tid_config driver call: records the wiphy,
+ * the netdev, the peer address and the tids value (printed in hex).
+ */
+TRACE_EVENT(rdev_reset_tid_config,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ const u8 *peer, u8 tids),
+ TP_ARGS(wiphy, netdev, peer, tids),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ MAC_ENTRY(peer)
+ __field(u8, tids)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ MAC_ASSIGN(peer, peer);
+ __entry->tids = tids;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", peer: " MAC_PR_FMT ", tids: 0x%x",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tids)
+);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 8481e9ac33da..6590efbbcbb9 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -231,7 +231,12 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev,
struct key_params *params, int key_idx,
bool pairwise, const u8 *mac_addr)
{
- if (key_idx < 0 || key_idx > 5)
+ int max_key_idx = 5;
+
+ if (wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_PROTECTION))
+ max_key_idx = 7;
+ if (key_idx < 0 || key_idx > max_key_idx)
return -EINVAL;
if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN))
@@ -934,7 +939,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
return -EOPNOTSUPP;
/* if it's part of a bridge, reject changing type to station/ibss */
- if ((dev->priv_flags & IFF_BRIDGE_PORT) &&
+ if (netif_is_bridge_port(dev) &&
(ntype == NL80211_IFTYPE_ADHOC ||
ntype == NL80211_IFTYPE_STATION ||
ntype == NL80211_IFTYPE_P2P_CLIENT))
diff --git a/net/x25/x25_forward.c b/net/x25/x25_forward.c
index c82999941d3f..d48ad6d29197 100644
--- a/net/x25/x25_forward.c
+++ b/net/x25/x25_forward.c
@@ -131,13 +131,11 @@ out:
void x25_clear_forward_by_lci(unsigned int lci)
{
- struct x25_forward *fwd;
- struct list_head *entry, *tmp;
+ struct x25_forward *fwd, *tmp;
write_lock_bh(&x25_forward_list_lock);
- list_for_each_safe(entry, tmp, &x25_forward_list) {
- fwd = list_entry(entry, struct x25_forward, node);
+ list_for_each_entry_safe(fwd, tmp, &x25_forward_list, node) {
if (fwd->lci == lci) {
list_del(&fwd->node);
kfree(fwd);
@@ -149,13 +147,11 @@ void x25_clear_forward_by_lci(unsigned int lci)
void x25_clear_forward_by_dev(struct net_device *dev)
{
- struct x25_forward *fwd;
- struct list_head *entry, *tmp;
+ struct x25_forward *fwd, *tmp;
write_lock_bh(&x25_forward_list_lock);
- list_for_each_safe(entry, tmp, &x25_forward_list) {
- fwd = list_entry(entry, struct x25_forward, node);
+ list_for_each_entry_safe(fwd, tmp, &x25_forward_list, node) {
if ((fwd->dev1 == dev) || (fwd->dev2 == dev)){
list_del(&fwd->node);
kfree(fwd);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index df600487a68d..356f90e4522b 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -217,6 +217,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
static void xsk_flush(struct xdp_sock *xs)
{
xskq_prod_submit(xs->rx);
+ __xskq_cons_release(xs->umem->fq);
sock_def_readable(&xs->sk);
}
@@ -304,6 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+ __xskq_cons_release(xs->tx);
xs->sk.sk_write_space(&xs->sk);
}
rcu_read_unlock();
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index bec2af11853a..b50bb5c76da5 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -19,13 +19,13 @@ struct xdp_ring {
/* Used for the RX and TX queues for packets */
struct xdp_rxtx_ring {
struct xdp_ring ptrs;
- struct xdp_desc desc[0] ____cacheline_aligned_in_smp;
+ struct xdp_desc desc[] ____cacheline_aligned_in_smp;
};
/* Used for the fill and completion queues for buffers */
struct xdp_umem_ring {
struct xdp_ring ptrs;
- u64 desc[0] ____cacheline_aligned_in_smp;
+ u64 desc[] ____cacheline_aligned_in_smp;
};
struct xsk_queue {
@@ -271,7 +271,8 @@ static inline void xskq_cons_release(struct xsk_queue *q)
{
/* To improve performance, only update local state here.
* Reflect this to global state when we get new entries
- * from the ring in xskq_cons_get_entries().
+ * from the ring in xskq_cons_get_entries() and whenever
+ * Rx or Tx processing are completed in the NAPI loop.
*/
q->cached_cons++;
}
diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c
index f15d6a564b0e..037ea156d2f9 100644
--- a/net/xfrm/espintcp.c
+++ b/net/xfrm/espintcp.c
@@ -100,7 +100,7 @@ static int espintcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
flags |= nonblock ? MSG_DONTWAIT : 0;
- skb = __skb_recv_datagram(sk, &ctx->ike_queue, flags, NULL, &off, &err);
+ skb = __skb_recv_datagram(sk, &ctx->ike_queue, flags, &off, &err);
if (!skb)
return err;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 50f567a88f45..6cc7f7f1dd68 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -46,6 +46,25 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb,
pskb_pull(skb, skb->mac_len + x->props.header_len);
}
+static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
+ unsigned int hsize)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ int phlen = 0;
+
+ if (xo->flags & XFRM_GSO_SEGMENT)
+ skb->transport_header = skb->network_header + hsize;
+
+ skb_reset_mac_len(skb);
+ if (x->sel.family != AF_INET6) {
+ phlen = IPV4_BEET_PHMAXLEN;
+ if (x->outer_mode.family == AF_INET6)
+ phlen += sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ }
+
+ pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - phlen));
+}
+
/* Adjust pointers into the packet when IPsec is done at layer2 */
static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
{
@@ -66,9 +85,16 @@ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
return __xfrm_transport_prep(x, skb,
sizeof(struct ipv6hdr));
break;
+ case XFRM_MODE_BEET:
+ if (x->outer_mode.family == AF_INET)
+ return __xfrm_mode_beet_prep(x, skb,
+ sizeof(struct iphdr));
+ if (x->outer_mode.family == AF_INET6)
+ return __xfrm_mode_beet_prep(x, skb,
+ sizeof(struct ipv6hdr));
+ break;
case XFRM_MODE_ROUTEOPTIMIZATION:
case XFRM_MODE_IN_TRIGGER:
- case XFRM_MODE_BEET:
break;
}
}
@@ -78,8 +104,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
int err;
unsigned long flags;
struct xfrm_state *x;
- struct sk_buff *skb2, *nskb;
struct softnet_data *sd;
+ struct sk_buff *skb2, *nskb, *pskb = NULL;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
struct sec_path *sp;
@@ -168,14 +194,14 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
} else {
if (skb == skb2)
skb = nskb;
-
- if (!skb)
- return NULL;
+ else
+ pskb->next = nskb;
continue;
}
skb_push(skb2, skb2->data - skb_mac_header(skb2));
+ pskb = skb2;
}
return skb;
@@ -383,6 +409,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
return xfrm_dev_feat_change(dev);
case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
return xfrm_dev_down(dev);
}
return NOTIFY_DONE;
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index dc651a628dcf..3361e3ac5714 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -300,10 +300,10 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
} else {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
}
dst_release(dst);
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index fafc7aba705f..2fd3d990d992 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -535,8 +535,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
{
struct sk_buff *segs, *nskb;
- BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
- BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_GSO_CB_OFFSET);
segs = skb_gso_segment(skb, 0);
kfree_skb(skb);
if (IS_ERR(segs))
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index dbda08ec566e..297b2fdb3c29 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -434,7 +434,9 @@ EXPORT_SYMBOL(xfrm_policy_destroy);
static void xfrm_policy_kill(struct xfrm_policy *policy)
{
+ write_lock_bh(&policy->lock);
policy->walk.dead = 1;
+ write_unlock_bh(&policy->lock);
atomic_inc(&policy->genid);
@@ -2613,7 +2615,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
xdst->xfrm_genid = xfrm[i]->genid;
dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
- dst1->flags |= DST_HOST;
dst1->lastuse = now;
dst1->input = dst_discard;
@@ -2899,7 +2900,7 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net,
dst_copy_metrics(dst1, dst);
dst1->obsolete = DST_OBSOLETE_FORCE_CHK;
- dst1->flags |= DST_HOST | DST_XFRM_QUEUE;
+ dst1->flags |= DST_XFRM_QUEUE;
dst1->lastuse = jiffies;
dst1->input = dst_discard;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 170d6e7f31d3..8be2d926acc2 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -612,7 +612,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
{
struct xfrm_state *x;
- x = kmem_cache_alloc(xfrm_state_cache, GFP_ATOMIC | __GFP_ZERO);
+ x = kmem_cache_zalloc(xfrm_state_cache, GFP_ATOMIC);
if (x) {
write_pnet(&x->xs_net, net);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b88ba45ff1ac..e6cfaa680ef3 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -110,7 +110,8 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs)
return 0;
uctx = nla_data(rt);
- if (uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len))
+ if (uctx->len > nla_len(rt) ||
+ uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len))
return -EINVAL;
return 0;
@@ -2275,6 +2276,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
err = verify_newpolicy_info(&ua->policy);
if (err)
goto free_state;
+ err = verify_sec_ctx_len(attrs);
+ if (err)
+ goto free_state;
/* build an XP */
xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);